diff --git a/Android.bp b/Android.bp index d033d2d04e..a4ca5c79e8 100644 --- a/Android.bp +++ b/Android.bp @@ -31,7 +31,8 @@ arm_compute_library_defaults { "-DARM_COMPUTE_CPP_SCHEDULER", "-Wno-unused-parameter", "-DNO_DOT_IN_TOOLCHAIN", - "-no-integrated-as" + "-no-integrated-as", + "-Wno-implicit-fallthrough" ], rtti: true, } @@ -42,10 +43,11 @@ cc_library_static { proprietary: true, local_include_dirs: ["build/android-arm64v8a/src/core", "build/android-arm64v8a/src/core/CL", - "arm_compute/core/NEON/kernels/assembly", - "arm_compute/core/NEON/kernels/convolution/common", - "arm_compute/core/NEON/kernels/convolution/depthwise", + "src/core/common", + "src/core/helpers", "src/core/NEON/kernels/assembly", + "src/core/NEON/kernels/convolution/common", + "src/core/NEON/kernels/convolution/depthwise", "src/core/NEON/kernels/convolution/winograd"], export_include_dirs: [".", "./include"], srcs: [ @@ -129,11 +131,9 @@ cc_library_static { "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp", "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp", "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp", + "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp", "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp", "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp", - "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp", - "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp", - "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp", "src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp", "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp", "src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp", @@ -261,7 +261,6 @@ cc_library_static { "src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp", "src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp", "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp", - "src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp", "src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp", "src/core/NEON/kernels/NEDequantizationLayerKernel.cpp", "src/core/NEON/kernels/NEDerivativeKernel.cpp", @@ -308,6 +307,7 @@ cc_library_static { "src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp", "src/core/NEON/kernels/NELKTrackerKernel.cpp", "src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp", + "src/core/NEON/kernels/NELogicalKernel.cpp", "src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp", "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp", "src/core/NEON/kernels/NEMeanStdDevKernel.cpp", @@ -355,6 +355,11 @@ cc_library_static { "src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp", "src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp", "src/core/NEON/kernels/NEYOLOLayerKernel.cpp", + "src/core/NEON/kernels/activation/impl/fp16_neon_activation.cpp", + "src/core/NEON/kernels/activation/impl/fp32_neon_activation.cpp", + "src/core/NEON/kernels/activation/impl/qasymm8_neon_activation.cpp", + "src/core/NEON/kernels/activation/impl/qasymm8_signed_neon_activation.cpp", + "src/core/NEON/kernels/activation/impl/qsymm16_neon_activation.cpp", "src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp", "src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp", "src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp", @@ -364,10 +369,12 @@ cc_library_static { "src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp", "src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp", 
"src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp", + "src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp", "src/core/NEON/kernels/arm_gemm/mergeresults.cpp", "src/core/NEON/kernels/arm_gemm/misc.cpp", "src/core/NEON/kernels/arm_gemm/quantized.cpp", - "src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp", + "src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp", + "src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp", "src/core/NEON/kernels/convolution/common/padding.cpp", "src/core/NEON/kernels/convolution/common/qasymm8.cpp", "src/core/NEON/kernels/convolution/common/qsymm8.cpp", @@ -405,6 +412,8 @@ cc_library_static { "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp16_fp16_integers.cpp", "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp", "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp", + "src/core/NEON/kernels/floor/impl/fp16_neon_floor.cpp", + "src/core/NEON/kernels/floor/impl/fp32_neon_floor.cpp", "src/core/PyramidInfo.cpp", "src/core/Rounding.cpp", "src/core/Size2D.cpp", @@ -413,6 +422,8 @@ cc_library_static { "src/core/Utils.cpp", "src/core/Validate.cpp", "src/core/Version.cpp", + "src/core/helpers/SoftmaxHelpers.cpp", + "src/core/helpers/WindowHelpers.cpp", "src/core/utils/ScaleUtils.cpp", "src/core/utils/helpers/fft.cpp", "src/core/utils/helpers/tensor_transform.cpp", @@ -520,6 +531,9 @@ cc_library_static { "src/runtime/CL/functions/CLLaplacianPyramid.cpp", "src/runtime/CL/functions/CLLaplacianReconstruct.cpp", "src/runtime/CL/functions/CLLocallyConnectedLayer.cpp", + "src/runtime/CL/functions/CLLogicalAnd.cpp", + "src/runtime/CL/functions/CLLogicalNot.cpp", + "src/runtime/CL/functions/CLLogicalOr.cpp", "src/runtime/CL/functions/CLMagnitude.cpp", "src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp", "src/runtime/CL/functions/CLMeanStdDev.cpp", @@ -662,9 +676,9 @@ cc_library_static { "src/runtime/NEON/functions/NEFuseBatchNormalization.cpp", "src/runtime/NEON/functions/NEGEMM.cpp", "src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp", + "src/runtime/NEON/functions/NEGEMMConv2d.cpp", "src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp", "src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp", - "src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp", "src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp", "src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp", "src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp", @@ -688,6 +702,7 @@ cc_library_static { "src/runtime/NEON/functions/NELaplacianPyramid.cpp", "src/runtime/NEON/functions/NELaplacianReconstruct.cpp", "src/runtime/NEON/functions/NELocallyConnectedLayer.cpp", + "src/runtime/NEON/functions/NELogical.cpp", "src/runtime/NEON/functions/NEMagnitude.cpp", "src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp", "src/runtime/NEON/functions/NEMeanStdDev.cpp", @@ -720,7 +735,6 @@ cc_library_static { "src/runtime/NEON/functions/NEScale.cpp", "src/runtime/NEON/functions/NEScharr3x3.cpp", "src/runtime/NEON/functions/NESelect.cpp", - "src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp", "src/runtime/NEON/functions/NESlice.cpp", "src/runtime/NEON/functions/NESobel3x3.cpp", "src/runtime/NEON/functions/NESobel5x5.cpp", @@ -751,6 +765,7 @@ cc_library_static { "src/runtime/RuntimeContext.cpp", "src/runtime/Scheduler.cpp", "src/runtime/SchedulerFactory.cpp", + "src/runtime/SchedulerUtils.cpp", "src/runtime/SubTensor.cpp", "src/runtime/Tensor.cpp", 
"src/runtime/TensorAllocator.cpp", @@ -771,69 +786,71 @@ cc_library_static { }, arm64: { srcs: [ - "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp", + 
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp", "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/a55.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/a55.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp", - 
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp", - "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp", ], }, diff --git a/README.md b/README.md index f6355de14c..0aca596d3d 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,10 @@ Please report issues here: https://github.com/ARM-software/ComputeLibrary/issues **Make sure you are using the latest version of the library before opening an issue. 
Thanks** +Deprecation notice: +- We have deprecated the GLES backend and it will be removed from the library in release 21.05 +- We have deprecated the NEON and OpenCL computer vision functions and they will be removed in 21.05 + News: - [Gian Marco's talk on Performance Analysis for Optimizing Embedded Deep Learning Inference Software](https://www.embedded-vision.com/platinum-members/arm/embedded-vision-training/videos/pages/may-2019-embedded-vision-summit) @@ -44,6 +48,3 @@ To indicate that you agree to the terms of the DCO, you "sign off" your cont ```Signed-off-by: John Doe ``` You must use your real name, no pseudonyms or anonymous contributions are accepted. - -### Security Issues -If you believe you have discovered a security issue please contact MLG-Security@arm.com diff --git a/SConscript b/SConscript index f19122c703..bc4aa4ccb7 100644 --- a/SConscript +++ b/SConscript @@ -24,8 +24,8 @@ import os.path import re import subprocess -VERSION = "v20.08" -LIBRARY_VERSION_MAJOR = 20 +VERSION = "v20.11" +LIBRARY_VERSION_MAJOR = 21 LIBRARY_VERSION_MINOR = 0 LIBRARY_VERSION_PATCH = 0 SONAME_VERSION = str(LIBRARY_VERSION_MAJOR) + "." + str(LIBRARY_VERSION_MINOR) + "." + str(LIBRARY_VERSION_PATCH) @@ -48,20 +48,6 @@ def build_library(name, sources, static=False, libs=[]): else: if env['set_soname']: obj = arm_compute_env.SharedLibrary(name, source=sources, SHLIBVERSION = SONAME_VERSION, LIBS = arm_compute_env["LIBS"] + libs) - - symlinks = [] - # Manually delete symlinks or SCons will get confused: - directory = os.path.dirname(obj[0].path) - library_prefix = obj[0].path[:-(1 + len(SONAME_VERSION))] - real_lib = "%s.%s" % (library_prefix, SONAME_VERSION) - - for f in Glob("#%s.*" % library_prefix): - if str(f) != real_lib: - symlinks.append("%s/%s" % (directory,str(f))) - - clean = arm_compute_env.Command('clean-%s' % str(obj[0]), [], Delete(symlinks)) - Default(clean) - Depends(obj, clean) else: obj = arm_compute_env.SharedLibrary(name, source=sources, LIBS = arm_compute_env["LIBS"] + libs) @@ -178,6 +164,7 @@ arm_compute_env.Append(LIBS = ['dl']) core_files = Glob('src/core/*.cpp') core_files += Glob('src/core/CPP/*.cpp') core_files += Glob('src/core/CPP/kernels/*.cpp') +core_files += Glob('src/core/helpers/*.cpp') core_files += Glob('src/core/utils/*.cpp') core_files += Glob('src/core/utils/helpers/*.cpp') core_files += Glob('src/core/utils/io/*.cpp') @@ -228,11 +215,10 @@ if env['neon']: # build winograd/depthwise sources for either v7a / v8a core_files += Glob('src/core/NEON/kernels/convolution/*/*.cpp') core_files += Glob('src/core/NEON/kernels/convolution/winograd/*/*.cpp') - arm_compute_env.Append(CPPPATH = ["arm_compute/core/NEON/kernels/convolution/common/", - "arm_compute/core/NEON/kernels/convolution/winograd/", - "arm_compute/core/NEON/kernels/convolution/depthwise/", - "src/core/NEON/kernels/assembly/", + arm_compute_env.Append(CPPPATH = ["src/core/NEON/kernels/convolution/common/", "src/core/NEON/kernels/convolution/winograd/", + "src/core/NEON/kernels/convolution/depthwise/", + "src/core/NEON/kernels/assembly/", "arm_compute/core/NEON/kernels/assembly/"]) graph_files += Glob('src/graph/backends/NEON/*.cpp') @@ -245,6 +231,17 @@ if env['neon']: if "sve" in env['arch']: core_files += Glob('src/core/NEON/kernels/arm_gemm/kernels/sve_*/*.cpp') + if any(i in env['data_type_support'] for i in ['all', 'fp16']): + core_files += Glob('src/core/NEON/kernels/*/impl/fp16_*.cpp') + if any(i in env['data_type_support'] for i in ['all', 'fp32']): + core_files +=
Glob('src/core/NEON/kernels/*/impl/fp32_*.cpp') + if any(i in env['data_type_support'] for i in ['all', 'qasymm8']): + core_files += Glob('src/core/NEON/kernels/*/impl/qasymm8_neon*.cpp') + if any(i in env['data_type_support'] for i in ['all', 'qasymm8_signed']): + core_files += Glob('src/core/NEON/kernels/*/impl/qasymm8_signed_*.cpp') + if any(i in env['data_type_support'] for i in ['all', 'qsymm16']): + core_files += Glob('src/core/NEON/kernels/*/impl/qsymm16_*.cpp') + runtime_files += Glob('src/runtime/NEON/*.cpp') runtime_files += Glob('src/runtime/NEON/functions/*.cpp') runtime_files += Glob('src/runtime/NEON/functions/assembly/*.cpp') diff --git a/SConstruct b/SConstruct index 373e561401..3b2be11766 100644 --- a/SConstruct +++ b/SConstruct @@ -25,12 +25,14 @@ import os import subprocess def version_at_least(version, required): - end = min(len(version), len(required)) - for i in range(0, end, 2): - if int(version[i]) < int(required[i]): + version_list = version.split('.') + required_list = required.split('.') + end = min(len(version_list), len(required_list)) + for i in range(0, end): + if int(version_list[i]) < int(required_list[i]): return False - elif int(version[i]) > int(required[i]): + elif int(version_list[i]) > int(required_list[i]): return True return True @@ -42,7 +44,7 @@ vars.AddVariables( BoolVariable("logging", "Logging (this flag is forced to 1 for debug=1)", False), EnumVariable("arch", "Target Architecture", "armv7a", allowed_values=("armv7a", "arm64-v8a", "arm64-v8.2-a", "arm64-v8.2-a-sve", "x86_32", "x86_64", - "armv8a", "armv8.2-a", "armv8.2-a-sve", "armv8.6-a", "x86")), + "armv8a", "armv8.2-a", "armv8.2-a-sve", "armv8.6-a", "armv8.6-a-sve", "x86")), EnumVariable("estate", "Execution State", "auto", allowed_values=("auto", "32", "64")), EnumVariable("os", "Target OS", "linux", allowed_values=("linux", "android", "tizen", "bare_metal")), EnumVariable("build", "Build type", "cross_compile", allowed_values=("native", "cross_compile", "embed_only")), @@ -62,6 +64,8 @@ vars.AddVariables( PathVariable("install_dir", "Specify sub-folder for the install", "", PathVariable.PathAccept), BoolVariable("exceptions", "Enable/disable C++ exception support", True), PathVariable("linker_script", "Use an external linker script", "", PathVariable.PathAccept), + ListVariable("custom_options", "Custom options that can be used to turn on/off features", "none", ["disable_mmla_fp"]), + ListVariable("data_type_support", "Enable a list of data types to support", "all", ["qasymm8", "qasymm8_signed", "qsymm16", "fp16", "fp32"]), ("toolchain_prefix", "Override the toolchain prefix", ""), ("compiler_prefix", "Override the compiler prefix", ""), ("extra_cxx_flags", "Extra CXX flags to be appended to the build command", ""), @@ -204,7 +208,9 @@ elif 'v8' in env['arch']: env.Append(CXXFLAGS = ['-march=armv8-a']) if 'v8.6-a' in env['arch']: - env.Append(CPPDEFINES = ['MMLA_INT8', 'MMLA_FP32', 'V8P6', 'V8P6_BF', 'ARM_COMPUTE_FORCE_BF16']) + env.Append(CPPDEFINES = ['MMLA_INT8', 'V8P6', 'V8P6_BF', 'ARM_COMPUTE_FORCE_BF16']) + if "disable_mmla_fp" not in env['custom_options']: + env.Append(CPPDEFINES = ['MMLA_FP32']) elif 'x86' in env['arch']: if env['estate'] == '32': @@ -256,7 +262,7 @@ env['RANLIB'] = prefix + "ranlib" if not GetOption("help"): try: - compiler_ver = subprocess.check_output(env['CXX'].split() + ["-dumpversion"]).strip() + compiler_ver = subprocess.check_output(env['CXX'].split() + ["-dumpversion"]).decode().strip() except OSError: print("ERROR: Compiler '%s' not found" % env['CXX']) 
Exit(1) @@ -277,6 +283,21 @@ if not GetOption("help"): if compiler_ver == '4.8.3': env.Append(CXXFLAGS = ['-Wno-array-bounds']) + if not version_at_least(compiler_ver, '7.0.0') and env['os'] == 'bare_metal': + env.Append(LINKFLAGS = ['-fstack-protector-strong']) + +if env['data_type_support']: + if any(i in env['data_type_support'] for i in ['all', 'fp16']): + env.Append(CXXFLAGS = ['-DENABLE_FP16_KERNELS']) + if any(i in env['data_type_support'] for i in ['all', 'fp32']): + env.Append(CXXFLAGS = ['-DENABLE_FP32_KERNELS']) + if any(i in env['data_type_support'] for i in ['all', 'qasymm8']): + env.Append(CXXFLAGS = ['-DENABLE_QASYMM8_KERNELS']) + if any(i in env['data_type_support'] for i in ['all', 'qasymm8_signed']): + env.Append(CXXFLAGS = ['-DENABLE_QASYMM8_SIGNED_KERNELS']) + if any(i in env['data_type_support'] for i in ['all', 'qsymm16']): + env.Append(CXXFLAGS = ['-DENABLE_QSYMM16_KERNELS']) + if env['standalone']: env.Append(CXXFLAGS = ['-fPIC']) env.Append(LINKFLAGS = ['-static-libgcc','-static-libstdc++']) @@ -293,6 +314,8 @@ elif env['os'] == 'bare_metal': env.Append(CXXFLAGS = ['-fPIC']) env.Append(CPPDEFINES = ['NO_MULTI_THREADING']) env.Append(CPPDEFINES = ['BARE_METAL']) +if env['os'] == 'linux' and env['arch'] == 'armv7a': + env.Append(CXXFLAGS = [ '-Wno-psabi' ]) if env['opencl']: if env['os'] in ['bare_metal'] or env['standalone']: @@ -338,7 +361,6 @@ for dirname in os.listdir("./include"): Export('version_at_least') - if env['gles_compute'] and env['os'] != 'android': env.Append(CPPPATH = ['#/include/linux']) diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000000..5b232a83d3 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,69 @@ +**Reporting vulnerabilities** + +Arm takes security issues seriously and welcomes feedback from researchers and the security community in order to improve the security of its products and services. We operate a coordinated disclosure policy for disclosing vulnerabilities and other security issues. + +Security issues can be complex and one single timescale doesn't fit all circumstances. We will make best endeavours to inform you when we expect security notifications and fixes to be available and facilitate coordinated disclosure when notifications and patches/mitigations are available. + +**Report** + +For all security issues, contact Arm by email at [arm-security@arm.com](mailto:arm-security@arm.com). + + +**Secure submission using PGP** + +We support and encourage secure submission of vulnerability reports using PGP, using the key below. If you would like replies to be encrypted, please provide your own public key through a secure mechanism. 
+ +~~~ +-----BEGIN PGP PUBLIC KEY BLOCK----- +mQINBFr7/RMBEACjHR5QZL/z1t2aLCRNXLE4KJiQmCo7edU5Be+7MTjIJDzZNu68 +lNEUYRoLexeayif8eC4T19bUsSbGpxHiYsFFjV8ewLXDyDJRRuaBGPfQ5rn/mE6X +Nvu+9Pputr+mB1R3CXcvrNkhmzPkK7zVM15oeyBMhogqPssuT4OeMduQdip8smfK +xTMk91RrJTLb+G3eE1tf+81kXBYvzp2e24Sn0/VeYe0IWnBobjVBZk3TmcYxDvz5 +Y47fU9V6cNj3Zq4VYrgxuLoFCA2VtetyiFQm5IYa3Bt3SWcAwihr8nbR2HoNdWyA +u8wJYYVzSq3hvT5l/IjTHxEcY+6RBq8poDSsftzvX386u9hmw7sJQFlTw6/pUjdr +gbsZ2ZzRBzKtU17ercpn4kU6VgVP3WRB5HiTFFkEpZuqAznOYaHbMq4dfd/g7Quq +C0VTbWiJnhku2i+g4BdHHRDtIF6U3aVQAfbrDb1LjVTa65p5ULOeY3HRAWtMNtu/ +Cj8cD98JDanzXtcnisds8vMQ8LZ6iMFChEnF8K4V0eLw9Ju6CMNiFYY7SEBndD/H +M4KcU4li7mROSbJcshgEbe1SYkxdMuI9eY4DNYxl3VjxoPUGzeqXo/ADFKE9bHsi +GTyEoij4ku0HspLVKnYHXn/LqHGwEcwjF8zphS+w5cn/e01akYwz5EVSQwARAQAB +tB1Bcm0gU3VwcG9ydCA8c3VwcG9ydEBhcm0uY29tPokCTgQTAQgAOBYhBN9zqDwZ +RL/vF0ihcdfNKdz4bBRiBQJa+/0TAhsDBQsJCAcCBhUKCQgLAgQWAgMBAh4BAheA +AAoJENfNKdz4bBRibK0P/jLlJR/QYypcjb+8BnHT9tCDgcV2KFYXS15VpbSNviN9 +Xs/UOnSadoGUMGCXDyb1PRNn98yUn7JlNR9rtsqPRmkpbo5cuw46ehgxjVlfcHnk +CttaE8Davx6zo0fyrBib2+oVVW8usi9+uRK4vhhPUicO3oXwzldsVFz+RbpubZxc +Bg/CZ+dQ2jMKOv1zDtInOG6OBnbQZRAeiWXgGhcIoPZ4zBQOi8nr0+bLcfvMeZi2 +uz6uKnylpXwZbl4ijcG8MKV/ei+7du+SzA9NY0WOT2g3FXDREWUhjKs8bmEZgIx4 +QgvDNpxAUThF+TqQ7zrsA8nT8POvDD0MhN/Z+A3QdPTdcaZFaXzIdxbDg+0FKmzu +OgtQBH4C01RWrkmZlhO5w7/Qjt0vLlhfyQIL9BW/HeEPtjnH2Hnq8xYnZhlVqJEh +FJU7F9sMvyiJiKviobFTd6AmpVkhxhcJ3k2L2C03imTsmUwAoejQCXwiYcOhyQ2t +Z9Nk8YIZTEw2urGFi4HSQPwPq2j/2j7ABJ4rlzJvO6vs5ppGkumvzIIP9JnpVXbp +wcbK6Ev6KdkX4s14Mzd6Hsd8LpL8t5nHhxUey6G0xKe2eSlHVm5Mlfhoete9UmIZ +dzIOZkgTgWXlYXRIxwGQ2Pss7pURtofykvLklq4jcobQuHxurl9cteodETfbWk/J +uQINBFr7/RMBEADWZG8eqt5D22g3T7ehnH/T3fuTX8LPUBnODMWGAEUY8uv64To8 +46odvrXFgWBgCf0sROEyJchc3SGLyR9S4lJsVJRde3QLN3WZkHlB4pSn4IQHFyQd +wsLQi+S9uggHMPlQ6MNvc5n0P3k5bT9fLUmtJWJ3QVjW7k963ZXpzf1zbQJqs30w +rlqGUZllfRoYQTfcYxFEaUFhwRJ//skNImWH8Vz+PTnqg2zRtFn3usrBV4GpNvsM +6jy+YEsSvUa7IY8k4wpPzEhIfqDjGbZxFSQ1H1G+mLUL+DD7oGffej/ZoC86TIdM +p6ew1rGhJdQBLh9nx+1ADOLWjNo2R0h60u7VR5q/K6V4fwWmeGFipPXZCD92I+nR +t/cjznwNyD/6J9YrBMF7mbGrS1TyfLaLt4tpdcBnsgqDTodd5OmG65mroXsg/lNO +7YZdecLZ34krfaLrWTtKkqULXbppB+uQvbVj8p8ONRImn6bZ+iAhnNaH9wJ06ico +b1F0imJ2SJWnFr6PzPRr0gPStLgu9wrRKheaORwF/H/HxSyPZxNVxFqu81q518A/ +plhub9INQLaxHf/TTjXpqZCcfdNTYUAW8rwbQfW9doSIT4lHY8bJXktb6BsVjkFj +PzDeYpXeOoTWetQqsEuTdg/F+qg041QBLtNj9Lr3Vy3StgMciRUIP8m0XwARAQAB +iQI2BBgBCAAgFiEE33OoPBlEv+8XSKFx180p3PhsFGIFAlr7/RMCGwwACgkQ180p +3PhsFGLWMA//V/XKrnI2YBh/SptUrgg7knPXva45bb7tGSH1fJg8f/wqycOSFFCY +ES45boA5jlQ3z8uw6BYCz5KeOucGhxAMw+x5EDdxZ33ksY5zqXB35WaMXzEwGYYb +E113/yhOsTbzu4bBKABSXbJO98MdAWvWpyCpp2MHIR3S9+ycM7/FMZ5xi3czZNRg +9+WZP+7W4qWhJptQ0kBh5C3N/tiltju5WQ2Y7XIn+5dMOJdtseFS7CNerxXZGAtH +nfRxaD/4ENdbWOwaVJiVW7+ioUJz09OWgy0gLYSDW+hciDnW1QAaJLpdAbniGZ0S +JsTmaZla8JnUKqZPgbFfA2OcnH9H+DWc0pHv17c5tJzTMP7rgirgGRX/U2LOzmFZ +1UxjQj5nn3Oa5frXbIAzb8xKiR0VDaquCM/3sti1AesYiS0Gw0Sqnw8qpFypgFXN +CKVgYXppIT+TmbDbNJDOB2UycxeI4vbiBwU8fI4qSpW12WsGdAJt/rx3UsyhZ+02 +4aSqDHzhJmtDPQ6lnaKe1fUkC90tgp8loVGmriWQx82jAQMqATVjIklTpE4vm00f +ocQIWOKEE90mKNEoV6rNbfl5QevmapTVdV/pmrRBzhbsa1uAUS4HZdH0Nf/OXEyv +yYCr2gCFPymkkRYhPr2w5EgbWyzLaBIwqjyIbXaveuB3DYi2Lhbf64I= +=EaN7 +-----END PGP PUBLIC KEY BLOCK----- +~~~ + +For more information visit https://developer.arm.com/support/arm-security-updates/report-security-vulnerabilities \ No newline at end of file diff --git a/arm_compute/core/CL/CLCompileContext.h b/arm_compute/core/CL/CLCompileContext.h index f54fd0186a..6f6dc18b85 100644 --- a/arm_compute/core/CL/CLCompileContext.h +++ b/arm_compute/core/CL/CLCompileContext.h @@ 
-118,6 +118,14 @@ class Program final { return _name; } + /** Returns program binary data. + * + * @return Program's binary data. + */ + const std::vector<unsigned char> &binary() const + { + return _binary; + } /** User-defined conversion to the underlying CL program. * * @return The CL program object. diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h deleted file mode 100644 index dbda0dbb4b..0000000000 --- a/arm_compute/core/CL/CLKernels.h +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLKERNELS_H -#define ARM_COMPUTE_CLKERNELS_H - -/* Header regrouping all the CL kernels */ -#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h" -#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h" -#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h" -#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h" -#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h" -#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h" -#include "arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h" -#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h" -#include "arm_compute/core/CL/kernels/CLCannyEdgeKernel.h" -#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h" -#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h" -#include "arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h" -#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h" -#include "arm_compute/core/CL/kernels/CLComparisonKernel.h" -#include "arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h" -#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h" -#include "arm_compute/core/CL/kernels/CLCopyKernel.h" -#include "arm_compute/core/CL/kernels/CLCropKernel.h" -#include "arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" -#include "arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h" -#include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h" -#include "arm_compute/core/CL/kernels/CLDilateKernel.h" -#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h" -#include "arm_compute/core/CL/kernels/CLErodeKernel.h" -#include "arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h" -#include "arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h" -#include "arm_compute/core/CL/kernels/CLFFTScaleKernel.h" -#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h" -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" -#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLFloorKernel.h" -#include "arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" -#include "arm_compute/core/CL/kernels/CLGatherKernel.h" -#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h" -#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h" -#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h" -#include "arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h" -#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h" -#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h" -#include "arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLHistogramKernel.h" 
-#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h" -#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h" -#include "arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h" -#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h" -#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h" -#include "arm_compute/core/CL/kernels/CLMaxUnpoolingLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h" -#include "arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h" -#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h" -#include "arm_compute/core/CL/kernels/CLMemsetKernel.h" -#include "arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h" -#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h" -#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h" -#include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLPermuteKernel.h" -#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h" -#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" -#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLRangeKernel.h" -#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h" -#include "arm_compute/core/CL/kernels/CLRemapKernel.h" -#include "arm_compute/core/CL/kernels/CLReorgLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLReverseKernel.h" -#include "arm_compute/core/CL/kernels/CLScaleKernel.h" -#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h" -#include "arm_compute/core/CL/kernels/CLSelectKernel.h" -#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h" -#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h" -#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h" -#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLStackLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h" -#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h" -#include "arm_compute/core/CL/kernels/CLThresholdKernel.h" -#include "arm_compute/core/CL/kernels/CLTileKernel.h" -#include "arm_compute/core/CL/kernels/CLTransposeKernel.h" -#include "arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h" -#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h" -#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h" -#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h" -#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h" -#include "arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h" -#include 
"arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h" -#include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h" -#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h" -#include "arm_compute/core/CL/kernels/CLYOLOLayerKernel.h" -#include "arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h" - -#endif /* ARM_COMPUTE_CLKERNELS_H */ diff --git a/arm_compute/core/CL/CLTypes.h b/arm_compute/core/CL/CLTypes.h index c44e2c4f3f..0f6eb0dfa4 100644 --- a/arm_compute/core/CL/CLTypes.h +++ b/arm_compute/core/CL/CLTypes.h @@ -75,5 +75,40 @@ struct CLQuantization const ICLFloatArray *scale; /**< Quantization scale array */ const ICLInt32Array *offset; /**< Quantization offset array */ }; + +/** Internal keypoint structure for Lucas-Kanade Optical Flow */ +struct CLLKInternalKeypoint +{ + float x{ 0.f }; /**< x coordinate of the keypoint */ + float y{ 0.f }; /**< y coordinate of the keypoint */ + float tracking_status{ 0.f }; /**< the tracking status of the keypoint */ + float dummy{ 0.f }; /**< Dummy field, to make sure the data structure 128-bit align, so that GPU can use vload4 */ +}; + +/** Structure for storing Spatial Gradient Matrix and the minimum eigenvalue for each keypoint */ +struct CLCoefficientTable +{ + float A11; /**< iA11 * FLT_SCALE */ + float A12; /**< iA11 * FLT_SCALE */ + float A22; /**< iA11 * FLT_SCALE */ + float min_eig; /**< Minimum eigenvalue */ +}; + +/** Structure for storing ival, ixval and iyval for each point inside the window */ +struct CLOldValue +{ + int16_t ival; /**< ival extracts from old image */ + int16_t ixval; /**< ixval extracts from scharr Gx image */ + int16_t iyval; /**< iyval extracts from scharr Gy image */ + int16_t dummy; /**< Dummy field, to make sure the data structure 128-bit align, so that GPU can use vload4 */ +}; + +/** Interface for OpenCL Array of Internal Key Points. */ +using ICLLKInternalKeypointArray = ICLArray; +/** Interface for OpenCL Array of Coefficient Tables. */ +using ICLCoefficientTableArray = ICLArray; +/** Interface for OpenCL Array of Old Values. */ +using ICLOldValArray = ICLArray; + } // namespace arm_compute #endif /* ARM_COMPUTE_CL_TYPES_H */ diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h deleted file mode 100644 index 16990c54f4..0000000000 --- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** CL kernel used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16 - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QSYMM16 value. - * The following computations will be performed by the kernel: - * - * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier - * -# Add bias to final result if bias tensor is not a nullptr - * -# Round to nearest division by a power-of-two using result_shift - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values to the [-32768, 32767] range and cast to QSYMM16. - * - */ -class CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel : public ICLKernel -{ -public: - /** Constructor */ - CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel(const CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &operator=(const CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel(CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &operator=(CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: Data type supported: QSYMM16 - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16. - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0. 
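For context on the quantize-down kernels removed in this patch: the pipeline documented above (fixed-point multiplication, bias addition, round-to-nearest division by a power of two, clamping) is gemmlowp-style requantization. The following is a minimal scalar sketch of that arithmetic, assuming gemmlowp rounding semantics; function names are illustrative and not the library's API:

```cpp
#include <algorithm>
#include <cstdint>

// Saturating rounding doubling high multiply: round-to-nearest (a * b * 2) >> 32.
int32_t sat_rounding_doubling_high_mul(int32_t a, int32_t b)
{
    if(a == INT32_MIN && b == INT32_MIN)
    {
        return INT32_MAX; // the single overflowing input pair saturates
    }
    const int64_t ab    = static_cast<int64_t>(a) * b;
    const int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
    return static_cast<int32_t>((ab + nudge) >> 31);
}

// Round-to-nearest division by 2^shift, ties rounding away from zero.
int32_t rounding_divide_by_pow2(int32_t x, int shift)
{
    const int32_t mask      = (int32_t(1) << shift) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> shift) + (remainder > threshold ? 1 : 0);
}

// Scalar model of the QSYMM16 quantize-down stage; the bias is folded into
// the accumulator before the fixed-point multiplication.
int16_t quantize_down_qsymm16(int32_t acc, int32_t bias, int32_t multiplier, int shift)
{
    int32_t v = sat_rounding_doubling_high_mul(acc + bias, multiplier);
    v         = rounding_divide_by_pow2(v, shift);
    v         = std::min(std::max(v, int32_t(INT16_MIN)), int32_t(INT16_MAX));
    return static_cast<int16_t>(v);
}
```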
- */ - void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int min = 0, int max = 0); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: Data type supported: QSYMM16 - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16. - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int min = 0, int max = 0); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel - * - * @param[in] input Input tensor info. Data type supported: S32 - * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor info with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor info. Data type supported: Data type supported: QSYMM16 - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16, - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - const ICLTensor *_bias; - ICLTensor *_output; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H */ diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h deleted file mode 100644 index ef962d834a..0000000000 --- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8_SIGNED - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8_SIGNED value. - * The following computations will be performed by the kernel: - * - * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier - * -# Add bias to final result if bias tensor is not a nullptr - * -# Round to nearest division by a power-of-two using result_shift - * -# Add offset to each result - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values to the [-128..127] range and cast to QASYMM8_SIGNED. - */ -class CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel : public ICLKernel -{ -public: - /** Constructor */ - CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel(const CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &operator=(const CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel(CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &operator=(CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. 
Data type supported: Data type supported: QASYMM8_SIGNED - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication - * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to 0 - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED. Defaults to 0 - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions - */ - void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min = 0, int max = 0); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8_SIGNED - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication - * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to 0 - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED. Defaults to 0 - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min = 0, int max = 0); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. 
Data type supported: QASYMM8_SIGNED - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED. - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - const ICLTensor *_bias; - ICLTensor *_output; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H */ diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h deleted file mode 100644 index ca13b2fefb..0000000000 --- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H -#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H - -#include "arm_compute/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 - * - * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8 value. - * The following computations will be performed by the kernel: - * - * -# Compute fixed point multiplication of each entry of the input by result_fixedpoint_multiplier - * -# Add bias to the final result if the bias tensor is not a nullptr - * -# Round to nearest division by a power-of-two using result_shift - * -# Add offset to each result - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
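To make the quantize-down pipeline documented above concrete, here is a standalone scalar sketch of the same per-element steps (bias addition, fixed-point multiply, rounding shift, offset, clamping). It is an illustration only, not the OpenCL kernel: the helper names are hypothetical, and the production kernels use gemmlowp-style saturating rounding doubling high-multiplication rather than this simplified 64-bit multiply.

#include <algorithm>
#include <cstdint>

// Simplified Q0.31 fixed-point multiply (the real kernel saturates and rounds).
inline int32_t fixed_point_mul(int32_t v, int32_t multiplier)
{
    return static_cast<int32_t>((static_cast<int64_t>(v) * multiplier) >> 31);
}

// Round-to-nearest division by 2^shift.
inline int32_t rounding_shift(int32_t v, int shift)
{
    return shift > 0 ? (v + (1 << (shift - 1))) >> shift : v;
}

inline uint8_t quantize_down_u8(int32_t acc, int32_t bias, int32_t multiplier,
                                int shift, int32_t offset, int32_t min, int32_t max)
{
    int32_t v = acc + bias;                                     // optional bias addition
    v = rounding_shift(fixed_point_mul(v, multiplier), shift);  // result_fixedpoint_multiplier / 2^result_shift
    v += offset;                                                // result_offset_after_shift
    v = std::max(min, std::min(v, max));                        // user-supplied min/max bounds
    return static_cast<uint8_t>(std::max(0, std::min(v, 255))); // clamp to [0..255] and cast to QASYMM8
}

The QASYMM8_SIGNED variant is identical except that the final clamp is to [-128..127] and the cast is to int8_t.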
- */ -class CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public ICLKernel -{ -public: - /** Constructor */ - CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(const CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers)*/ - CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(const CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete; - /** Allow instances of this class to be moved */ - CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default; - /** Allow instances of this class to be moved */ - CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases are supported and it can be a nullptr if the biases addition is not required. - * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: QASYMM8 - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added - * @param[in] result_shift Integer value used to round the result after the fixed point multiplication to the nearest division by a power-of-two - * @param[in] result_offset_after_shift Offset to be applied to the result before converting it back to QASYMM8 - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8. - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions - */ - void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min = 0, int max = 0); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases are supported and it can be a nullptr if the biases addition is not required. - * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor.
Data type supported: QASYMM8 - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added - * @param[in] result_shift Integer value used to round the result after the fixed point multiplication to the nearest division by a power-of-two - * @param[in] result_offset_after_shift Offset to be applied to the result before converting it back to QASYMM8 - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8. - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min = 0, int max = 0); - /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases are supported and it can be a nullptr if the biases addition is not required. - * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data type supported: QASYMM8 - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8. - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - const ICLTensor *_bias; - ICLTensor *_output; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H */ diff --git a/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h b/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h index 1a3f2ba679..068b37d80c 100644 --- a/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h +++ b/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CPPBOXWITHNONMAXIMASUPPRESSIONLIMITKERNEL_H #define ARM_COMPUTE_CPPBOXWITHNONMAXIMASUPPRESSIONLIMITKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/CPP/ICPPKernel.h" #include "arm_compute/core/Types.h" namespace arm_compute diff --git a/arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h b/arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h index ddb346dfc2..e4fd250a61 100644 --- a/arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h +++ b/arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CPPCORNERCANDIDATESKERNEL_H #define ARM_COMPUTE_CPPCORNERCANDIDATESKERNEL_H +#include "arm_compute/core/CPP/ICPPKernel.h" #include "arm_compute/core/IArray.h" -#include "arm_compute/core/NEON/INEKernel.h" #include "support/Mutex.h"
@@ -39,7 +39,7 @@ using IImage = ITensor; /** CPP kernel to perform corner candidates */ -class CPPCornerCandidatesKernel : public INEKernel +class CPPCornerCandidatesKernel : public ICPPKernel { public: const char *name() const override diff --git a/arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h b/arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h index dd6bbd56e0..5275a357b3 100644 --- a/arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h +++ b/arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,9 +24,9 @@ #ifndef ARM_COMPUTE_CPPDETECTIONWINDOWNONMAXIMASUPPRESSIONKERNEL_H #define ARM_COMPUTE_CPPDETECTIONWINDOWNONMAXIMASUPPRESSIONKERNEL_H +#include "arm_compute/core/CPP/ICPPKernel.h" #include "arm_compute/core/IArray.h" #include "arm_compute/core/IHOG.h" -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" namespace arm_compute @@ -53,6 +53,8 @@ class CPPDetectionWindowNonMaximaSuppressionKernel : public ICPPKernel CPPDetectionWindowNonMaximaSuppressionKernel(CPPDetectionWindowNonMaximaSuppressionKernel &&) = default; /** Allow instances of this class to be moved */ CPPDetectionWindowNonMaximaSuppressionKernel &operator=(CPPDetectionWindowNonMaximaSuppressionKernel &&) = default; + /** Default destructor */ + ~CPPDetectionWindowNonMaximaSuppressionKernel() = default; /** Initialise the kernel's input, output and the euclidean minimum distance * * @attention: If @ref IDetectionWindowArray is passed to the kernel, the map() and unmap() methods @ref IDetectionWindowArray must be called respectively before and after diff --git a/arm_compute/core/GPUTarget.h b/arm_compute/core/GPUTarget.h index 06025ca3ae..b8143f8d5c 100644 --- a/arm_compute/core/GPUTarget.h +++ b/arm_compute/core/GPUTarget.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_GPUTARGET_H #define ARM_COMPUTE_GPUTARGET_H -#include "arm_compute/core/Helpers.h" +#include "support/Traits.h" #include diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h index d056f937da..5a8d6efe9d 100644 --- a/arm_compute/core/Helpers.h +++ b/arm_compute/core/Helpers.h @@ -24,23 +24,17 @@ #ifndef ARM_COMPUTE_HELPERS_H #define ARM_COMPUTE_HELPERS_H -#include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/Steps.h" -#include "arm_compute/core/Strides.h" -#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "support/MemorySupport.h" #include #include #include -#include #include -#include -#include namespace arm_compute { @@ -48,307 +42,6 @@ class IKernel; class ITensor; class ITensorInfo; -/** Disable bitwise operations by default */ -template <typename T> -struct enable_bitwise_ops -{ - static constexpr bool value = false; /**< Disabled */ -}; - -#ifndef DOXYGEN_SKIP_THIS -template <typename T> -typename std::enable_if<enable_bitwise_ops<T>::value, T>::type operator&(T lhs, T rhs) -{ - using underlying_type = typename std::underlying_type<T>::type; - return static_cast<T>(static_cast<underlying_type>(lhs) & static_cast<underlying_type>(rhs)); -} -#endif /* DOXYGEN_SKIP_THIS */ - -/** Helper function to create and return a unique_ptr pointing to a CL/GLES kernel object - * It also calls the kernel's configuration. - * - * @param[in] args All the arguments that need to be passed to the kernel's configuration. - * - * @return A unique pointer pointing to a CL/GLES kernel object - */ -template <typename Kernel, typename... T> -std::unique_ptr<Kernel> create_configure_kernel(T &&... args) -{ - std::unique_ptr<Kernel> k = arm_compute::support::cpp14::make_unique<Kernel>(); - k->configure(std::forward<T>(args)...); - return k; -} - -/** Helper function to create and return a unique_ptr pointing to a CL/GLES kernel object - * - * @return A unique pointer pointing to a kernel object - */ -template <typename Kernel> -std::unique_ptr<Kernel> create_kernel() -{ - std::unique_ptr<Kernel> k = arm_compute::support::cpp14::make_unique<Kernel>(); - return k; -} - -namespace traits -{ -/** Check if a type T is contained in a tuple Tuple of types */ -template <typename T, typename Tuple> -struct is_contained; - -template <typename T> -struct is_contained<T, std::tuple<>> : std::false_type -{ -}; - -template <typename T, typename... Ts> -struct is_contained<T, std::tuple<T, Ts...>> : std::true_type -{ -}; - -template <typename T, typename U, typename... Ts> -struct is_contained<T, std::tuple<U, Ts...>> : is_contained<T, std::tuple<Ts...>> -{ -}; -} - -/** Computes bilinear interpolation using the pointer to the top-left pixel and the pixel's distance between - * the real coordinates and the smallest following integer coordinates. Input must be in single channel format. - * - * @param[in] pixel_ptr Pointer to the top-left pixel value of a single channel input.
- * @param[in] stride Stride to access the bottom-left and bottom-right pixel values - * @param[in] dx Pixel's distance between the X real coordinate and the smallest X following integer - * @param[in] dy Pixel's distance between the Y real coordinate and the smallest Y following integer - * - * @note dx and dy must be in the range [0, 1.0] - * - * @return The bilinear interpolated pixel value - */ -template <typename T> -inline T delta_bilinear_c1(const T *pixel_ptr, size_t stride, float dx, float dy) -{ - ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr); - - const float dx1 = 1.0f - dx; - const float dy1 = 1.0f - dy; - - const T a00 = *pixel_ptr; - const T a01 = *(pixel_ptr + 1); - const T a10 = *(pixel_ptr + stride); - const T a11 = *(pixel_ptr + stride + 1); - - const float w1 = dx1 * dy1; - const float w2 = dx * dy1; - const float w3 = dx1 * dy; - const float w4 = dx * dy; - - return static_cast<T>(a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4); -} - -/** Computes bilinear interpolation for quantized input and output, using the pointer to the top-left pixel and the pixel's distance between - * the real coordinates and the smallest following integer coordinates. Input must be QASYMM8 and in single channel format. - * - * @param[in] pixel_ptr Pointer to the top-left pixel value of a single channel input. - * @param[in] stride Stride to access the bottom-left and bottom-right pixel values - * @param[in] dx Pixel's distance between the X real coordinate and the smallest X following integer - * @param[in] dy Pixel's distance between the Y real coordinate and the smallest Y following integer - * @param[in] iq_info Input QuantizationInfo - * @param[in] oq_info Output QuantizationInfo - * - * @note dx and dy must be in the range [0, 1.0] - * - * @return The bilinear interpolated pixel value - */ -inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr, size_t stride, float dx, float dy, UniformQuantizationInfo iq_info, UniformQuantizationInfo oq_info) -{ - ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr); - - const float dx1 = 1.0f - dx; - const float dy1 = 1.0f - dy; - - const float a00 = dequantize_qasymm8(*pixel_ptr, iq_info); - const float a01 = dequantize_qasymm8(*(pixel_ptr + 1), iq_info); - const float a10 = dequantize_qasymm8(*(pixel_ptr + stride), iq_info); - const float a11 = dequantize_qasymm8(*(pixel_ptr + stride + 1), iq_info); - - const float w1 = dx1 * dy1; - const float w2 = dx * dy1; - const float w3 = dx1 * dy; - const float w4 = dx * dy; - float res = a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4; - return static_cast<uint8_t>(quantize_qasymm8(res, oq_info)); -} - -/** Computes bilinear interpolation for quantized input and output, using the pointer to the top-left pixel and the pixel's distance between - * the real coordinates and the smallest following integer coordinates. Input must be QASYMM8_SIGNED and in single channel format. - * - * @param[in] pixel_ptr Pointer to the top-left pixel value of a single channel input.
- * @param[in] stride Stride to access the bottom-left and bottom-right pixel values - * @param[in] dx Pixel's distance between the X real coordinate and the smallest X following integer - * @param[in] dy Pixel's distance between the Y real coordinate and the smallest Y following integer - * @param[in] iq_info Input QuantizationInfo - * @param[in] oq_info Output QuantizationInfo - * - * @note dx and dy must be in the range [0, 1.0] - * - * @return The bilinear interpolated pixel value - */ -inline int8_t delta_bilinear_c1_quantized(const int8_t *pixel_ptr, size_t stride, float dx, float dy, UniformQuantizationInfo iq_info, UniformQuantizationInfo oq_info) -{ - ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr); - - const float dx1 = 1.0f - dx; - const float dy1 = 1.0f - dy; - - const float a00 = dequantize_qasymm8_signed(*pixel_ptr, iq_info); - const float a01 = dequantize_qasymm8_signed(*(pixel_ptr + 1), iq_info); - const float a10 = dequantize_qasymm8_signed(*(pixel_ptr + stride), iq_info); - const float a11 = dequantize_qasymm8_signed(*(pixel_ptr + stride + 1), iq_info); - - const float w1 = dx1 * dy1; - const float w2 = dx * dy1; - const float w3 = dx1 * dy; - const float w4 = dx * dy; - float res = a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4; - return static_cast<int8_t>(quantize_qasymm8_signed(res, oq_info)); -} - -/** Computes linear interpolation using the pointer to the top pixel and the pixel's distance between - * the real coordinates and the smallest following integer coordinates. Input must be in single channel format. - * - * @param[in] pixel_ptr Pointer to the top pixel value of a single channel input. - * @param[in] stride Stride to access the bottom pixel value - * @param[in] dy Pixel's distance between the Y real coordinate and the smallest Y following integer - * - * @note dy must be in the range [0, 1.0] - * - * @return The linear interpolated pixel value - */ -template <typename T> -inline T delta_linear_c1_y(const T *pixel_ptr, size_t stride, float dy) -{ - ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr); - - const float dy1 = 1.0f - dy; - - const T a00 = *pixel_ptr; - const T a10 = *(pixel_ptr + stride); - - const float w1 = dy1; - const float w3 = dy; - - return static_cast<T>(a00 * w1 + a10 * w3); -} -/** Computes linear interpolation using the pointer to the left pixel and the pixel's distance between - * the real coordinates and the smallest following integer coordinates. Input must be in single channel format. - * - * @param[in] pixel_ptr Pointer to the left pixel value of a single channel input. - * @param[in] dx Pixel's distance between the X real coordinate and the smallest X following integer - * - * @note dx must be in the range [0, 1.0] - * - * @return The linear interpolated pixel value - */ -template <typename T> -inline T delta_linear_c1_x(const T *pixel_ptr, float dx) -{ - ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr); - - const T a00 = *pixel_ptr; - const T a01 = *(pixel_ptr + 1); - - const float dx1 = 1.0f - dx; - - const float w1 = dx1; - const float w2 = dx; - - return static_cast<T>(a00 * w1 + a01 * w2); -} -/** Return the pixel at (x,y) using bilinear interpolation. - * - * @warning Only works if the iterator was created with an IImage - * - * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel input. - * @param[in] stride Stride in bytes of the image. - * @param[in] x X position of the wanted pixel - * @param[in] y Y position of the wanted pixel - * - * @return The pixel at (x, y) using bilinear interpolation.
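As a quick sanity check of the weighting used by delta_bilinear_c1 above, here is a hypothetical standalone fragment that evaluates the same formula on a concrete 2x2 neighbourhood; the four weights always sum to 1, so the result stays within the range of the four samples.

#include <cstdio>

int main()
{
    // 2x2 neighbourhood in a row-major buffer with stride 2.
    const float px[4] = { 10.f, 20.f,    // a00 a01
                          30.f, 40.f };  // a10 a11
    const float dx = 0.25f, dy = 0.5f;   // distances from the top-left sample
    const float dx1 = 1.f - dx, dy1 = 1.f - dy;
    // Same weighting as delta_bilinear_c1.
    const float v = px[0] * dx1 * dy1 + px[1] * dx * dy1
                  + px[2] * dx1 * dy + px[3] * dx * dy;
    std::printf("%f\n", v); // 10*0.375 + 20*0.125 + 30*0.375 + 40*0.125 = 22.5
    return 0;
}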
- */ -template <typename T> -inline T pixel_bilinear_c1(const T *first_pixel_ptr, size_t stride, float x, float y) -{ - ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr); - - const int32_t xi = std::floor(x); - const int32_t yi = std::floor(y); - - const float dx = x - xi; - const float dy = y - yi; - - return delta_bilinear_c1(first_pixel_ptr + xi + yi * stride, stride, dx, dy); -} - -/** Return the pixel at (x,y) using bilinear interpolation by clamping when out of borders. The image must be single channel input - * - * @warning Only works if the iterator was created with an IImage - * - * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel image. - * @param[in] stride Stride in bytes of the image - * @param[in] width Width of the image - * @param[in] height Height of the image - * @param[in] x X position of the wanted pixel - * @param[in] y Y position of the wanted pixel - * - * @return The pixel at (x, y) using bilinear interpolation. - */ -template <typename T> -inline uint8_t pixel_bilinear_c1_clamp(const T *first_pixel_ptr, size_t stride, size_t width, size_t height, float x, float y) -{ - ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr); - - x = std::max(-1.f, std::min(x, static_cast<float>(width))); - y = std::max(-1.f, std::min(y, static_cast<float>(height))); - - const float xi = std::floor(x); - const float yi = std::floor(y); - - const float dx = x - xi; - const float dy = y - yi; - - if(dx == 0.0f) - { - if(dy == 0.0f) - { - return static_cast<T>(first_pixel_ptr[static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride]); - } - return delta_linear_c1_y(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride, stride, dy); - } - if(dy == 0.0f) - { - return delta_linear_c1_x(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride, dx); - } - return delta_bilinear_c1(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride, stride, dx, dy); -} - -/** Return the pixel at (x,y) using area interpolation by clamping when out of borders. The image must be single channel U8 - * - * @note The interpolation area depends on the width and height ratio of the input and output images - * @note Currently average of the contributing pixels is calculated - * - * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel U8 image. - * @param[in] stride Stride in bytes of the image - * @param[in] width Width of the image - * @param[in] height Height of the image - * @param[in] wr Width ratio between the input image width and the output image width. - * @param[in] hr Height ratio between the input image height and the output image height. - * @param[in] x X position of the wanted pixel - * @param[in] y Y position of the wanted pixel - * - * @return The pixel at (x, y) using area interpolation. - */ -inline uint8_t pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr, float hr, int x, int y); - /** Iterator updated by @ref execute_window_loop for each window element */ class Iterator { @@ -376,7 +69,7 @@ class Iterator * * @return The current position of the iterator in bytes relative to the first element. */ - constexpr int offset() const; + constexpr size_t offset() const; /** Return a pointer to the current pixel. * * @return ... { } - int _dim_start; - int _stride; + size_t _dim_start; + size_t _stride; }; std::array<Dimension, Coordinates::num_max_dimensions> _dims; @@ -421,179 +114,6 @@ class Iterator template <typename L, typename... Ts> inline void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators); -/** Update window and padding size for each of the access patterns.
- * - * First the window size is reduced based on all access patterns that are not - * allowed to modify the padding of the underlying tensor. Then the padding of - * the remaining tensors is increased to match the window. - * - * @param[in] win Window that is used by the kernel. - * @param[in] patterns Access patterns used to calculate the final window and padding. - * - * @return True if the window has been changed. Changes to the padding do not - * influence the returned value. - */ -template <typename... Ts> -bool update_window_and_padding(Window &win, Ts &&... patterns) -{ - bool window_changed = false; - - utility::for_each([&](const IAccessWindow & w) - { - window_changed |= w.update_window_if_needed(win); - }, - patterns...); - - bool padding_changed = false; - - utility::for_each([&](IAccessWindow & w) - { - padding_changed |= w.update_padding_if_needed(win); - }, - patterns...); - - return window_changed; -} - -/** Calculate the maximum window for a given tensor shape and border setting - * - * @param[in] valid_region Valid region object defining the shape of the tensor space for which the window is created. - * @param[in] steps (Optional) Number of elements processed for each step. - * @param[in] skip_border (Optional) If true exclude the border region from the window. - * @param[in] border_size (Optional) Border size. - * - * @return The maximum window the kernel can be executed on. - */ -Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize()); - -/** Calculate the maximum window for a given tensor shape and border setting - * - * @param[in] info Tensor info object defining the shape of the object for which the window is created. - * @param[in] steps (Optional) Number of elements processed for each step. - * @param[in] skip_border (Optional) If true exclude the border region from the window. - * @param[in] border_size (Optional) Border size. - * - * @return The maximum window the kernel can be executed on. - */ -inline Window calculate_max_window(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize()) -{ - return calculate_max_window(info.valid_region(), steps, skip_border, border_size); -} - -/** Calculate the maximum window used by a horizontal kernel for a given tensor shape and border setting - * - * @param[in] valid_region Valid region object defining the shape of the tensor space for which the window is created. - * @param[in] steps (Optional) Number of elements processed for each step. - * @param[in] skip_border (Optional) If true exclude the border region from the window. - * @param[in] border_size (Optional) Border size. The border region will be excluded from the window. - * - * @return The maximum window the kernel can be executed on. - */ -Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize()); - -/** Calculate the maximum window used by a horizontal kernel for a given tensor shape and border setting - * - * @param[in] info Tensor info object defining the shape of the object for which the window is created. - * @param[in] steps (Optional) Number of elements processed for each step. - * @param[in] skip_border (Optional) If true exclude the border region from the window. - * @param[in] border_size (Optional) Border size. - * - * @return The maximum window the kernel can be executed on.
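For context, calculate_max_window is typically paired with Iterator and execute_window_loop in the pattern below. This is a sketch of the common kernel idiom, assuming two identically shaped, already allocated F32 tensors; it is not code taken from this patch.

using namespace arm_compute;

// Copy src to dst element by element over the widest possible window.
void copy_f32(const ITensor *src, ITensor *dst)
{
    // Build the maximal execution window from the tensor's shape.
    Window win = calculate_max_window(*src->info(), Steps());

    Iterator in(src, win);
    Iterator out(dst, win);

    // The lambda runs once per window element; the iterators are advanced automatically.
    execute_window_loop(win, [&](const Coordinates &)
    {
        *reinterpret_cast<float *>(out.ptr()) = *reinterpret_cast<const float *>(in.ptr());
    },
    in, out);
}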
- */ -inline Window calculate_max_window_horizontal(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize()) -{ - return calculate_max_window_horizontal(info.valid_region(), steps, skip_border, border_size); -} - -/** Calculate the maximum window for a given tensor shape and border setting. The window will also include the border. - * - * @param[in] valid_region Valid region object defining the shape of the tensor space for which the window is created. - * @param[in] steps (Optional) Number of elements processed for each step. - * @param[in] border_size (Optional) Border size. The border region will be included in the window. - * - * @return The maximum window the kernel can be executed on. - */ -Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Steps &steps = Steps(), BorderSize border_size = BorderSize()); - -/** Calculate the maximum window for a given tensor shape and border setting. The window will also include the border. - * - * @param[in] info Tensor info object defining the shape of the object for which the window is created. - * @param[in] steps (Optional) Number of elements processed for each step. - * @param[in] border_size (Optional) Border size. The border region will be included in the window. - * - * @return The maximum window the kernel can be executed on. - */ -inline Window calculate_max_enlarged_window(const ITensorInfo &info, const Steps &steps = Steps(), BorderSize border_size = BorderSize()) -{ - return calculate_max_enlarged_window(info.valid_region(), steps, border_size); -} - -/** Intersect multiple valid regions. - * - * @param[in] regions Valid regions. - * - * @return Intersection of all regions. - */ -template <typename... Ts> -ValidRegion intersect_valid_regions(const Ts &... regions) -{ - auto intersect = [](const ValidRegion & r1, const ValidRegion & r2) -> ValidRegion - { - ValidRegion region; - - for(size_t d = 0; d < std::min(r1.anchor.num_dimensions(), r2.anchor.num_dimensions()); ++d) - { - region.anchor.set(d, std::max(r1.anchor[d], r2.anchor[d])); - } - - for(size_t d = 0; d < std::min(r1.shape.num_dimensions(), r2.shape.num_dimensions()); ++d) - { - region.shape.set(d, std::min(r1.shape[d], r2.shape[d])); - } - - return region; - }; - - return utility::foldl(intersect, regions...); -} - -/** Create a strides object based on the provided strides and the tensor dimensions. - * - * @param[in] info Tensor info object providing the shape of the tensor for unspecified strides. - * @param[in] stride_x Stride to be used in X dimension (in bytes). - * @param[in] fixed_strides Strides to be used in higher dimensions starting at Y (in bytes). - * - * @return Strides object based on the specified strides. Missing strides are - * calculated based on the tensor shape and the strides of lower dimensions. - */ -template <typename T, typename... Ts> -inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&... fixed_strides) -{ - const TensorShape &shape = info.tensor_shape(); - - // Create strides object - Strides strides(stride_x, fixed_strides...); - - for(size_t i = 1 + sizeof...(Ts); i < info.num_dimensions(); ++i) - { - strides.set(i, shape[i - 1] * strides[i - 1]); - } - - return strides; -} - -/** Create a strides object based on the tensor dimensions. - * - * @param[in] info Tensor info object used to compute the strides. - * - * @return Strides object based on element size and tensor shape.
- */ -template <typename... Ts> -inline Strides compute_strides(const ITensorInfo &info) -{ - return compute_strides(info, info.element_size()); -} - /** Permutes given Dimensions according to a permutation vector * * @warning Validity of permutation is not checked @@ -629,79 +149,6 @@ inline void permute(TensorShape &shape, const PermutationVector &perm) } } -/** Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty. - * - * @param[in,out] info Tensor info used to check and assign. - * @param[in] shape New shape. - * @param[in] num_channels New number of channels. - * @param[in] data_type New data type - * @param[in] quantization_info (Optional) New quantization info - * - * @return True if the tensor info has been initialized - */ -bool auto_init_if_empty(ITensorInfo &info, - const TensorShape &shape, - int num_channels, DataType data_type, - QuantizationInfo quantization_info = QuantizationInfo()); - -/** Auto initialize the tensor info using another tensor info. - * - * @param info_sink Tensor info used to check and assign - * @param info_source Tensor info used to assign - * - * @return True if the tensor info has been initialized - */ -bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_source); - -/** Set the shape to the specified value if the current assignment is empty. - * - * @param[in,out] info Tensor info used to check and assign. - * @param[in] shape New shape. - * - * @return True if the shape has been changed. - */ -bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape); - -/** Set the format, data type and number of channels to the specified value if - * the current data type is unknown. - * - * @param[in,out] info Tensor info used to check and assign. - * @param[in] format New format. - * - * @return True if the format has been changed. - */ -bool set_format_if_unknown(ITensorInfo &info, Format format); - -/** Set the data type and number of channels to the specified value if - * the current data type is unknown. - * - * @param[in,out] info Tensor info used to check and assign. - * @param[in] data_type New data type. - * - * @return True if the data type has been changed. - */ -bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type); - -/** Set the data layout to the specified value if - * the current data layout is unknown. - * - * @param[in,out] info Tensor info used to check and assign. - * @param[in] data_layout New data layout. - * - * @return True if the data type has been changed. - */ -bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout); - -/** Set the quantization info to the specified value if - * the current quantization info is empty and the data type of asymmetric quantized type - * - * @param[in,out] info Tensor info used to check and assign. - * @param[in] quantization_info Quantization info - * - * @return True if the quantization info has been changed. - */ -bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo quantization_info); - /** Helper function to calculate the Valid Region for Scale. * * @param[in] src_info Input tensor info used to check.
@@ -751,21 +198,6 @@ inline size_t get_data_layout_dimension_index(const DataLayout data_layout, cons */ inline DataLayoutDimension get_index_data_layout_dimension(const DataLayout data_layout, const size_t index); -/** Calculate the normalization dimension index for a given normalization type - * - * @param[in] layout Data layout of the input and output tensor - * @param[in] info Normalization info - * - * @return Normalization dimension index - */ -inline unsigned int get_normalization_dimension_index(DataLayout layout, const NormalizationLayerInfo &info) -{ - const unsigned int width_idx = get_data_layout_dimension_index(layout, DataLayoutDimension::WIDTH); - const unsigned int channel_idx = get_data_layout_dimension_index(layout, DataLayoutDimension::CHANNEL); - - return info.is_in_map() ? width_idx : channel_idx; -} - /** Calculate the number of output tiles required by Winograd Convolution layer. This utility function can be used by the Winograd input transform * to know the number of tiles on the x and y direction * @@ -801,16 +233,6 @@ inline T wrap_around(T x, T m) return x >= 0 ? x % m : (x % m + m) % m; } -/** Convert a dimension axis to the number of dimensions in the range [0, @p dim_axis] - * Handle negative axis, negative axis is used to specify axis from the end (e.g. -1 for the last axis). - * - * @param[in] dim_axis The last axis (inclusive) in the range [0, @p dim_axis] - * @param[in] num_dims The total number of dimensions - * - * @return The number of dimensions in the range [0, @p dim_axis] - */ -inline size_t dim_index_2_num_dims(int32_t dim_axis, int32_t num_dims); - /** Convert negative coordinates to positive in the range [0, num_dims_input] * * @param[out] coords Array of coordinates to be converted. @@ -824,34 +246,6 @@ inline Coordinates &convert_negative_axis(Coordinates &coords, int max_value) } return coords; } - -/** Given an integer value, this function returns the next power of two - * - * @param[in] x Input value - * - * @return the next power of two - */ -inline unsigned int get_next_power_two(unsigned int x) -{ - // Decrement by 1 - x--; - - // Shift right by 1 - x |= x >> 1u; - // Shift right by 2 - x |= x >> 2u; - // Shift right by 4 - x |= x >> 4u; - // Shift right by 8 - x |= x >> 8u; - // Shift right by 16 - x |= x >> 16u; - - // Increment by 1 - x++; - - return x; -} } // namespace arm_compute #include "arm_compute/core/Helpers.inl" diff --git a/arm_compute/core/Helpers.inl b/arm_compute/core/Helpers.inl index 07b4132bea..a960876074 100644 --- a/arm_compute/core/Helpers.inl +++ b/arm_compute/core/Helpers.inl @@ -22,63 +22,12 @@ * SOFTWARE. 
*/ #include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" #include #include namespace arm_compute { -inline size_t dim_index_2_num_dims(int32_t dim_axis, int32_t num_dims) -{ - return static_cast(wrap_around(dim_axis, num_dims)) + 1; -} - -inline uint8_t pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr, float hr, int x, int y) -{ - ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr); - - // Calculate sampling position - float in_x = (x + 0.5f) * wr - 0.5f; - float in_y = (y + 0.5f) * hr - 0.5f; - - // Get bounding box offsets - int x_from = std::floor(x * wr - 0.5f - in_x); - int y_from = std::floor(y * hr - 0.5f - in_y); - int x_to = std::ceil((x + 1) * wr - 0.5f - in_x); - int y_to = std::ceil((y + 1) * hr - 0.5f - in_y); - - // Clamp position to borders - in_x = std::max(-1.f, std::min(in_x, static_cast(width))); - in_y = std::max(-1.f, std::min(in_y, static_cast(height))); - - // Clamp bounding box offsets to borders - x_from = ((in_x + x_from) < -1) ? -1 : x_from; - y_from = ((in_y + y_from) < -1) ? -1 : y_from; - x_to = ((in_x + x_to) > width) ? (width - in_x) : x_to; - y_to = ((in_y + y_to) > height) ? (height - in_y) : y_to; - - // Get pixel index - const int xi = std::floor(in_x); - const int yi = std::floor(in_y); - - // Bounding box elements in each dimension - const int x_elements = (x_to - x_from + 1); - const int y_elements = (y_to - y_from + 1); - ARM_COMPUTE_ERROR_ON(x_elements == 0 || y_elements == 0); - - // Sum pixels in area - int sum = 0; - for(int j = yi + y_from, je = yi + y_to; j <= je; ++j) - { - const uint8_t *ptr = first_pixel_ptr + j * stride + xi + x_from; - sum = std::accumulate(ptr, ptr + x_elements, sum); - } - - // Return average - return sum / (x_elements * y_elements); -} - template struct IncrementIterators { @@ -158,7 +107,7 @@ inline Iterator::Iterator(const ITensor *tensor, const Window &win) for(unsigned int n = 0; n < info->num_dimensions(); ++n) { _dims[n]._stride = win[n].step() * strides[n]; - std::get<0>(_dims)._dim_start += strides[n] * win[n].start(); + std::get<0>(_dims)._dim_start += static_cast(strides[n]) * win[n].start(); } //Copy the starting point to all the dimensions: @@ -182,7 +131,7 @@ inline void Iterator::increment(const size_t dimension) } } -inline constexpr int Iterator::offset() const +inline constexpr size_t Iterator::offset() const { return _dims.at(0)._dim_start; } @@ -204,94 +153,6 @@ inline void Iterator::reset(const size_t dimension) } } -inline bool auto_init_if_empty(ITensorInfo &info, - const TensorShape &shape, - int num_channels, - DataType data_type, - QuantizationInfo quantization_info) -{ - if(info.tensor_shape().total_size() == 0) - { - info.set_data_type(data_type); - info.set_num_channels(num_channels); - info.set_tensor_shape(shape); - info.set_quantization_info(quantization_info); - return true; - } - - return false; -} - -inline bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_source) -{ - if(info_sink.tensor_shape().total_size() == 0) - { - info_sink.set_data_type(info_source.data_type()); - info_sink.set_num_channels(info_source.num_channels()); - info_sink.set_tensor_shape(info_source.tensor_shape()); - info_sink.set_quantization_info(info_source.quantization_info()); - info_sink.set_data_layout(info_source.data_layout()); - return true; - } - - return false; -} - -inline bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape) -{ - if(info.tensor_shape().total_size() == 0) - { - 
info.set_tensor_shape(shape); - return true; - } - - return false; -} - -inline bool set_format_if_unknown(ITensorInfo &info, Format format) -{ - if(info.data_type() == DataType::UNKNOWN) - { - info.set_format(format); - return true; - } - - return false; -} - -inline bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type) -{ - if(info.data_type() == DataType::UNKNOWN) - { - info.set_data_type(data_type); - return true; - } - - return false; -} - -inline bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout) -{ - if(info.data_layout() == DataLayout::UNKNOWN) - { - info.set_data_layout(data_layout); - return true; - } - - return false; -} - -inline bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo quantization_info) -{ - if(info.quantization_info().empty() && (is_data_type_quantized_asymmetric(info.data_type()))) - { - info.set_quantization_info(quantization_info); - return true; - } - - return false; -} - inline Coordinates index2coords(const TensorShape &shape, int index) { int num_elements = shape.total_size(); diff --git a/arm_compute/core/ITensorInfo.h b/arm_compute/core/ITensorInfo.h index c5f0949196..3eb7239460 100644 --- a/arm_compute/core/ITensorInfo.h +++ b/arm_compute/core/ITensorInfo.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,8 +29,8 @@ #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/misc/ICloneable.h" #include "arm_compute/core/utils/misc/Utility.h" +#include "support/ICloneable.h" #include diff --git a/arm_compute/core/KernelDescriptors.h b/arm_compute/core/KernelDescriptors.h index 1ee1686fb1..ea46bfa5a6 100644 --- a/arm_compute/core/KernelDescriptors.h +++ b/arm_compute/core/KernelDescriptors.h @@ -64,6 +64,7 @@ struct GEMMKernelInfo bool ireinterpret_input_as_3d, bool ibroadcast_bias, bool ifp_mixed_precision, + bool ihas_pad_y, ActivationLayerInfo iactivation_info, int inmult_transpose1xW_width, int imult_interleave4x4_height, @@ -72,7 +73,7 @@ struct GEMMKernelInfo int32_t ina_offset, int32_t inb_offset) : m(im), n(in), k(ik), depth_output_gemm3d(idepth_output_gemm3d), reinterpret_input_as_3d(ireinterpret_input_as_3d), broadcast_bias(ibroadcast_bias), fp_mixed_precision(ifp_mixed_precision), - activation_info(iactivation_info), mult_transpose1xW_width(inmult_transpose1xW_width), mult_interleave4x4_height(imult_interleave4x4_height), lhs_info(ilhs_info), rhs_info(irhs_info), + has_pad_y(ihas_pad_y), activation_info(iactivation_info), mult_transpose1xW_width(inmult_transpose1xW_width), mult_interleave4x4_height(imult_interleave4x4_height), lhs_info(ilhs_info), rhs_info(irhs_info), a_offset(ina_offset), b_offset(inb_offset) { } @@ -84,6 +85,7 @@ struct GEMMKernelInfo bool reinterpret_input_as_3d{ false }; /**< Flag used to reinterpret the input as 3D */ bool broadcast_bias{ false }; /**< Flag used to broadcast the bias addition */ bool fp_mixed_precision{ false }; /**< Flag used to indicate wider accumulators (32 bit instead of 16 for FP16). 
*/ + bool has_pad_y{ false }; /**< Flag used to indicate if the input/output tensors have internal padding in the y direction */ ActivationLayerInfo activation_info{}; /**< Activation function to perform after the matrix multiplication */ int mult_transpose1xW_width{ 1 }; /**< Multiplication factor for the width of the 1xW transposed block */ int mult_interleave4x4_height{ 1 }; /**< Multiplication factor for the height of the 4x4 interleaved block */ diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h deleted file mode 100644 index f5d3aec27a..0000000000 --- a/arm_compute/core/NEON/NEKernels.h +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
- */ -#ifndef ARM_COMPUTE_NEKERNELS_H -#define ARM_COMPUTE_NEKERNELS_H - -/* Header regrouping all the NEON kernels */ -#include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h" -#include "arm_compute/core/NEON/kernels/NEAccumulateKernel.h" -#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h" -#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h" -#include "arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEBatchToSpaceLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h" -#include "arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h" -#include "arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h" -#include "arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h" -#include "arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h" -#include "arm_compute/core/NEON/kernels/NEBox3x3Kernel.h" -#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h" -#include "arm_compute/core/NEON/kernels/NEChannelCombineKernel.h" -#include "arm_compute/core/NEON/kernels/NEChannelExtractKernel.h" -#include "arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h" -#include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h" -#include "arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h" -#include "arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" -#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h" -#include "arm_compute/core/NEON/kernels/NECopyKernel.h" -#include "arm_compute/core/NEON/kernels/NECropKernel.h" -#include "arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h" -#include "arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h" -#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h" -#include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h" -#include "arm_compute/core/NEON/kernels/NEDilateKernel.h" -#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h" -#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h" -#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h" -#include "arm_compute/core/NEON/kernels/NEErodeKernel.h" -#include "arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h" -#include "arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h" -#include "arm_compute/core/NEON/kernels/NEFFTScaleKernel.h" -#include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h" -#include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h" -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" -#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEFloorKernel.h" -#include "arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" -#include 
"arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" -#include "arm_compute/core/NEON/kernels/NEGatherKernel.h" -#include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h" -#include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h" -#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h" -#include "arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h" -#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h" -#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h" -#include "arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEHistogramKernel.h" -#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h" -#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h" -#include "arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h" -#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h" -#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h" -#include "arm_compute/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h" -#include "arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h" -#include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h" -#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h" -#include "arm_compute/core/NEON/kernels/NEMinMaxLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h" -#include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h" -#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h" -#include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEPadLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEPermuteKernel.h" -#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h" -#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEPriorBoxLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h" -#include "arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NERangeKernel.h" -#include "arm_compute/core/NEON/kernels/NEReductionOperationKernel.h" -#include "arm_compute/core/NEON/kernels/NERemapKernel.h" -#include 
"arm_compute/core/NEON/kernels/NEReorgLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEReverseKernel.h" -#include "arm_compute/core/NEON/kernels/NEScaleKernel.h" -#include "arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h" -#include "arm_compute/core/NEON/kernels/NESelectKernel.h" -#include "arm_compute/core/NEON/kernels/NESobel3x3Kernel.h" -#include "arm_compute/core/NEON/kernels/NESobel5x5Kernel.h" -#include "arm_compute/core/NEON/kernels/NESobel7x7Kernel.h" -#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEStackLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h" -#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h" -#include "arm_compute/core/NEON/kernels/NEThresholdKernel.h" -#include "arm_compute/core/NEON/kernels/NETileKernel.h" -#include "arm_compute/core/NEON/kernels/NETransposeKernel.h" -#include "arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEWarpKernel.h" -#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h" -#include "arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h" - -#endif /* ARM_COMPUTE_NEKERNELS_H */ diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h deleted file mode 100644 index 6712e9105a..0000000000 --- a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_NEDEPTHWISECONVOLUTIONKERNEL3x3_H -#define ARM_COMPUTE_NEDEPTHWISECONVOLUTIONKERNEL3x3_H - -#include "arm_compute/core/NEON/INEKernel.h" - -namespace arm_compute -{ -// Forward declarations -class ITensor; - -/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor. 
*/ -class NEDepthwiseConvolutionLayer3x3Kernel : public INEKernel -{ -public: - const char *name() const override - { - return "NEDepthwiseConvolutionLayer3x3Kernel"; - } - /** Default constructor */ - NEDepthwiseConvolutionLayer3x3Kernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEDepthwiseConvolutionLayer3x3Kernel(const NEDepthwiseConvolutionLayer3x3Kernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEDepthwiseConvolutionLayer3x3Kernel &operator=(const NEDepthwiseConvolutionLayer3x3Kernel &) = delete; - /** Default Move Constructor. */ - NEDepthwiseConvolutionLayer3x3Kernel(NEDepthwiseConvolutionLayer3x3Kernel &&) = default; - /** Default move assignment operator */ - NEDepthwiseConvolutionLayer3x3Kernel &operator=(NEDepthwiseConvolutionLayer3x3Kernel &&) = default; - /** Initialize the function's source, destination, conv and border_size. - * - * @note Supported data layouts: NCHW and NHWC - * - * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] weights Weights tensor. This is a 3D tensor with dimensions [3, 3, IFM] for the NCHW data layout or [IFM, 3, 3] for NHWC. Data type supported: Same as @p input. - * @param[out] output Destination tensor. Data type supported: Same as @p input. - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * - */ - void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U)); - /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer3x3Kernel - * - * @note Supported data layouts: NCHW and NHWC - * - * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] weights Weights tensor info. This is a 3D tensor with dimensions [3, 3, IFM] for the NCHW data layout or [IFM, 3, 3] for NHWC. Data type supported: Same as @p input. - * @param[in] output Destination tensor info. Data type supported: Same as @p input. - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, - const Size2D &dilation = Size2D(1U, 1U)); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - BorderSize border_size() const override; - -private: - BorderSize _border_size; - const ITensor *_input; - ITensor *_output; - const ITensor *_weights; - PadStrideInfo _conv_info; - unsigned int _num_elems_written_per_iteration; - unsigned int _depth_multiplier; - Size2D _dilation; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_NEDEPTHWISECONVOLUTIONKERNEL3x3_H */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h b/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h deleted file mode 100644 index a2f0e8c5a8..0000000000 --- a/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2017-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_NEGEMMASSEMBLYBASE_H -#define ARM_COMPUTE_NEGEMMASSEMBLYBASE_H - -#include "arm_compute/core/NEON/INEKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** Base class for GEMM NEON kernels implemented in Assembly. */ -class NEGEMMAssemblyBaseKernel : public INEKernel -{ -public: - const char *name() const override - { - return "NEGEMMAssemblyBaseKernel"; - } - /** Constructor */ - NEGEMMAssemblyBaseKernel() - : _input0(nullptr), _input1(nullptr), _output(nullptr), _workspace(nullptr), _alpha(1.f), _beta(0.f), _is_transposed_0(false), _is_transposed_1(false) - { - } - - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMAssemblyBaseKernel(const NEGEMMAssemblyBaseKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMAssemblyBaseKernel &operator=(const NEGEMMAssemblyBaseKernel &) = delete; - /** Allow instances of this class to be moved */ - NEGEMMAssemblyBaseKernel(NEGEMMAssemblyBaseKernel &&) = default; - /** Allow instances of this class to be moved */ - NEGEMMAssemblyBaseKernel &operator=(NEGEMMAssemblyBaseKernel &&) = default; - - virtual ~NEGEMMAssemblyBaseKernel() = default; - - /** Initialise the kernel's input and output. - * - * The computed function is C = a * AxB + b * C. 
- * - * @param[in] input0 Input tensor containing the Matrix A. Data types supported: F32 - * @param[in] input1 Input tensor containing the Matrix B. Data types supported: same as @p input0 - * @param[in,out] output Output tensor to store the result of matrix multiplication. If @p beta is not zero the values are multiplied by @p beta before the result is accumulated. Otherwise the values are overwritten by the result. Data types supported: same as @p input0. - * @param[out] workspace Space for intermediate results. - * @param[in] alpha Weight of the matrix product - * @param[in] beta Weight of the accumulation. - * @param[in] is_transposed_0 (Optional)True if @p input0 is transposed else false. (Defaults to false) - * @param[in] is_transposed_1 (Optional)True if @p input1 is transposed else false. (Defaults to false) - */ - void configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha = 1.f, float beta = 0.f, bool is_transposed_0 = false, bool is_transposed_1 = false) - { - internal_configure(input0, input1, output, workspace, alpha, beta, is_transposed_0, is_transposed_1); - } - -protected: - virtual void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool _is_transposed_0, bool _is_transposed_1) = 0; - - const ITensor *_input0; - const ITensor *_input1; - ITensor *_output; - ITensor *_workspace; - float _alpha; - float _beta; - bool _is_transposed_0; - bool _is_transposed_1; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_NEGEMMASSEMBLYBASE_H*/ diff --git a/arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h b/arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h deleted file mode 100644 index 74161e330e..0000000000 --- a/arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_INEGEMMWRAPPERKERNEL_H -#define ARM_COMPUTE_INEGEMMWRAPPERKERNEL_H - -#include "arm_compute/core/NEON/INEKernel.h" - -namespace arm_compute -{ -class ITensor; - -/** Common interface for all the arm_gemm Gemms - */ -class INEGEMMWrapperKernel : public INEKernel -{ -public: - /** Parameters defining the dimensions of the matrices being multiplied */ - struct Params - { - unsigned int M{ 0 }; /**< Rows in output matrix C (and input matrix A). 
*/ - unsigned int N{ 0 }; /**< Columns in output matrix C (and input matrix B). */ - unsigned int K{ 0 }; /**< Columns of input matrix A (= rows of input matrix B). */ - unsigned int batches{ 0 }; /**< Number of "batched" GEMMs (unique A and C, shared B). */ - unsigned int multis{ 0 }; /**< Number of "multi" GEMMs (unique A, B and C). */ - }; - - static Params extract_parameters(const ITensor *a, const ITensor *b, const ITensor *c, const GEMMInfo &gemm_info); - - /** Constructor */ - INEGEMMWrapperKernel(); - /** Prevent instances of this class from being copied */ - INEGEMMWrapperKernel(const INEGEMMWrapperKernel &) = delete; - /** Prevent instances of this class from being copied */ - INEGEMMWrapperKernel &operator=(const INEGEMMWrapperKernel &) = delete; - /** Allow instances of this class to be moved */ - INEGEMMWrapperKernel(INEGEMMWrapperKernel &&) = default; - /** Allow instances of this class to be moved */ - INEGEMMWrapperKernel &operator=(INEGEMMWrapperKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @note The input and output tensor must have the same dimensions - * - * @param[in] a Input tensor (Matrix A) - * @param[in] b Input tensor (Matrix B) - * @param[out] c Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0. - * @param[in] alpha Scalar multiplier to apply to AB matrix product. - * @param[in] beta Scalar multiplier to apply to input C matrix before adding product. - * @param[in] gemm_info GEMM meta-data - */ - void configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, const GEMMInfo &gemm_info); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -protected: - /** Called as part of configure() after _a, _b, _c and _params have been set. - * - * @param[in] alpha Scalar multiplier to apply to AB matrix product. - * @param[in] beta Scalar multiplier to apply to input C matrix before adding product. - * - * @return A 3D execution window. - */ - virtual Window configure_internal(float alpha, float beta) = 0; - - /** Run the kernel from the start to the end offset in window. - * - * @param[in] window Window to use for the iteration - * @param[in] start_offset Where to start iterating from (In Window coordinates) - * @param[in] end_offset Where to stop iterating (In Window coordinates). - * @param[in] info Info about executing thread and CPU. - */ - virtual void run_internal(const Window &window, const Coordinates &start_offset, const Coordinates &end_offset, const ThreadInfo &info) = 0; - - const ITensor *_a; - const ITensor *_b; - ITensor *_c; - Params _params; - GEMMInfo _gemm_info; - -private: - Window _window3d; - TensorShape _window_shape; -}; - -} // namespace arm_compute - -#endif /* ARM_COMPUTE_INEGEMMRAPPERKERNEL_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h deleted file mode 100644 index a30e723dc0..0000000000 --- a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_WRAPPER_INTRINSICS_H -#define ARM_COMPUTE_WRAPPER_INTRINSICS_H - -#include "arm_compute/core/NEON/wrapper/intrinsics/abs.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/add.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/and.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/bsl.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/ceq.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/cge.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/cgt.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/cle.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/clt.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/combine.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/cvt.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/div.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/dup_n.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/eor.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/exp.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/ext.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/gethigh.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/getlane.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/getlow.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/inv.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/load.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/log.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/max.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/min.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/mla.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/movl.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/movn.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/mul.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/neg.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/not.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/orr.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/pmax.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/pmin.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/pow.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/qmov.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/qmovun.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h" -#include 
"arm_compute/core/NEON/wrapper/intrinsics/rev64.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/round.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/setlane.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/sin.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/store.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/sub.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/tanh.h" -#include "arm_compute/core/NEON/wrapper/intrinsics/tbl.h" - -#endif /* ARM_COMPUTE_WRAPPER_INTRINSICS_H */ diff --git a/arm_compute/core/SubTensorInfo.h b/arm_compute/core/SubTensorInfo.h index f604f55924..6654ccf00a 100644 --- a/arm_compute/core/SubTensorInfo.h +++ b/arm_compute/core/SubTensorInfo.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -31,7 +31,6 @@ #include "arm_compute/core/Strides.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/TensorShape.h" -#include "arm_compute/core/Validate.h" #include #include diff --git a/arm_compute/core/TensorShape.h b/arm_compute/core/TensorShape.h index 218774360e..b455a07767 100644 --- a/arm_compute/core/TensorShape.h +++ b/arm_compute/core/TensorShape.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -36,7 +36,7 @@ namespace arm_compute { /** Shape of a tensor */ -class TensorShape : public Dimensions +class TensorShape : public Dimensions { public: /** Constructor to initialize the tensor shape. diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index 97505001ea..39cc29b0da 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -137,10 +137,11 @@ enum class DataLayoutDimension /** Available ConvolutionMethod*/ enum class ConvolutionMethod { - GEMM, /**< Convolution using GEMM */ - DIRECT, /**< Direct convolution */ - WINOGRAD, /**< Convolution using Winograd */ - FFT /**< Convolution using FFT */ + GEMM, /**< Convolution using GEMM */ + GEMM_CONV2D, /**< Direct 2D GEMM convolution */ + DIRECT, /**< Direct convolution */ + WINOGRAD, /**< Convolution using Winograd */ + FFT /**< Convolution using FFT */ }; /** Available DepthwiseConvolutionFunction*/ @@ -337,6 +338,28 @@ struct BorderSize return size; } + /** Check equality with another BorderSize struct + * + * @param[in] rhs other struct to check against + * + * @return true if they are equal + */ + bool operator==(const BorderSize &rhs) + { + return (top == rhs.top) && (right == rhs.right) && (bottom == rhs.bottom) && (left == rhs.left); + } + + /** Check non-equality with another BorderSize struct + * + * @param[in] rhs other struct to check against + * + * @return true if they are different + */ + bool operator!=(const BorderSize &rhs) + { + return !(*this == rhs); + } + /** Limit this border size. * * @param[in] limit Border size to limit this border size to. 
@@ -523,13 +546,14 @@ enum class ArithmeticOperation /** Available element wise unary operations */ enum class ElementWiseUnary { - RSQRT, /**< Reverse square root */ - EXP, /**< Exponential */ - NEG, /**< Negate */ - LOG, /**< Natural Logarithm */ - ABS, /**< Absolute value */ - SIN, /**< Sine */ - ROUND, /**< Round */ + RSQRT, /**< Reverse square root */ + EXP, /**< Exponential */ + NEG, /**< Negate */ + LOG, /**< Natural Logarithm */ + ABS, /**< Absolute value */ + SIN, /**< Sine */ + ROUND, /**< Round */ + LOGICAL_NOT, /**< Logical Not */ }; /** The normalization type used for the normalization layer */ @@ -1690,6 +1714,44 @@ class NormalizationLayerInfo bool _is_scaled; }; +class StridedSliceLayerInfo +{ +public: + /** Default Constructor + * + * @param[in] begin_mask (Optional) If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead. + * @param[in] end_mask (Optional) If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead. + * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. + */ + StridedSliceLayerInfo(int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0) + : _begin_mask(begin_mask), _end_mask(end_mask), _shrink_axis_mask(shrink_axis_mask) + { + } + + /* Get the begin mask value */ + int32_t begin_mask() const + { + return _begin_mask; + } + + /* Get the end mask value */ + int32_t end_mask() const + { + return _end_mask; + } + + /* Get the shrink axis mask value */ + int32_t shrink_axis_mask() const + { + return _shrink_axis_mask; + } + +private: + int32_t _begin_mask; + int32_t _end_mask; + int32_t _shrink_axis_mask; +}; + /** Convolution Layer Weights Information class. This class stores the necessary information to compute convolution layer when the weights are already reshaped */ class WeightsInfo { @@ -1741,11 +1803,11 @@ class WeightsInfo } private: - const bool _are_reshaped; - const unsigned int _kernel_width; - const unsigned int _kernel_height; - const unsigned int _num_kernels; - const bool _retain_internal_weights; + bool _are_reshaped; + unsigned int _kernel_width; + unsigned int _kernel_height; + unsigned int _num_kernels; + bool _retain_internal_weights; }; /** GEMM reshape information class. This class stores the necessary information about matrix A and matrix B reshape. 
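StridedSliceLayerInfo, added above, only bundles the three strided-slice bitmasks: setting bit i of begin_mask or end_mask makes the operation ignore starts[i] or ends[i] and take the fullest range in that dimension, while setting bit i of shrink_axis_mask drops dimension i from the output. A short sketch of how a caller might compose the masks (hypothetical values; the getters are then forwarded to a strided-slice function's configure()/validate(), as the backend helpers later in this patch do):

    #include "arm_compute/core/Types.h"

    using namespace arm_compute;

    StridedSliceLayerInfo make_slice_info()
    {
        const int32_t begin_mask       = 1 << 0; // ignore starts[0]: begin dimension 0 at its start
        const int32_t end_mask         = 1 << 0; // ignore ends[0]: run dimension 0 to its end
        const int32_t shrink_axis_mask = 1 << 2; // collapse dimension 2 out of the output
        return StridedSliceLayerInfo(begin_mask, end_mask, shrink_axis_mask);
    }
    // make_slice_info().begin_mask() == 1, .end_mask() == 1, .shrink_axis_mask() == 4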
@@ -1852,14 +1914,14 @@ class GEMMReshapeInfo final }; private: - const int _m; - const int _n; - const int _k; - const int _mult_transpose1xW_width; - const int _mult_interleave4x4_height; - const int _depth_output_gemm3d; - const bool _reinterpret_input_as_3d; - const bool _broadcast_bias; + int _m; + int _n; + int _k; + int _mult_transpose1xW_width; + int _mult_interleave4x4_height; + int _depth_output_gemm3d; + bool _reinterpret_input_as_3d; + bool _broadcast_bias; }; struct DepthwiseConvolutionReshapeInfo @@ -2186,5 +2248,14 @@ struct IOFormatInfo /** Align columns */ bool align_columns; }; + +/** Internal keypoint class for Lucas-Kanade Optical Flow */ +struct NELKInternalKeypoint +{ + float x{ 0.f }; /**< x coordinate of the keypoint */ + float y{ 0.f }; /**< y coordinate of the keypoint */ + bool tracking_status{ false }; /**< the tracking status of the keypoint */ +}; + } // namespace arm_compute #endif /* ARM_COMPUTE_TYPES_H */ diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h index 7a1cc99127..e2c1ba938b 100644 --- a/arm_compute/core/Utils.h +++ b/arm_compute/core/Utils.h @@ -38,11 +38,15 @@ #include #include #include +#include #include #include namespace arm_compute { +class ITensor; +class ITensorInfo; + /** Calculate the rounded up quotient of val / m. * * @param[in] val Value to divide and round up. @@ -747,13 +751,13 @@ inline TensorShape adjust_odd_shape(const TensorShape &shape, Format format) // Force width to be even for formats which require subsampling of the U and V channels if(has_format_horizontal_subsampling(format)) { - output.set(0, output.x() & ~1U); + output.set(0, (output.x() + 1) & ~1U); } // Force height to be even for formats which require subsampling of the U and V channels if(has_format_vertical_subsampling(format)) { - output.set(1, output.y() & ~1U); + output.set(1, (output.y() + 1) & ~1U); } return output; @@ -1084,6 +1088,49 @@ const std::string &string_from_gemmlowp_output_stage(GEMMLowpOutputStageType out * @return String representation of the PixelValue through the given data type. 
 */
std::string string_from_pixel_value(const PixelValue &value, const DataType data_type);
+/** Convert a string to DataType
+ *
+ * @param[in] name The name of the data type
+ *
+ * @return DataType
+ */
+DataType data_type_from_name(const std::string &name);
+/** Stores padding information before configuring a kernel
+ *
+ * @param[in] infos list of tensor infos to store the padding info for
+ *
+ * @return An unordered map where each tensor info pointer is paired with its original padding info
+ */
+std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initializer_list<const ITensorInfo *> infos);
+/** Stores padding information before configuring a kernel
+ *
+ * @param[in] tensors list of tensors to store the padding info for
+ *
+ * @return An unordered map where each tensor info pointer is paired with its original padding info
+ */
+std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initializer_list<const ITensor *> tensors);
+/** Check if the previously stored padding info has changed after configuring a kernel
+ *
+ * @param[in] padding_map an unordered map where each tensor info pointer is paired with its original padding info
+ *
+ * @return true if any of the tensor infos has changed its paddings
+ */
+bool has_padding_changed(const std::unordered_map<const ITensorInfo *, PaddingSize> &padding_map);
+
+/** Input Stream operator for @ref DataType
+ *
+ * @param[in]  stream    Stream to parse
+ * @param[out] data_type Output data type
+ *
+ * @return Updated stream
+ */
+inline ::std::istream &operator>>(::std::istream &stream, DataType &data_type)
+{
+    std::string value;
+    stream >> value;
+    data_type = data_type_from_name(value);
+    return stream;
+}
 /** Lower a given string.
  *
  * @param[in] val Given string to lower.
@@ -1301,6 +1348,30 @@ bool check_value_range(T val, DataType dt, QuantizationInfo qinfo = Quantization
 }
 
+/** Returns the adjusted vector size in case it is less than the input's first dimension, getting rounded down to its closest valid vector size
+ *
+ * @param[in] vec_size vector size to be adjusted
+ * @param[in] dim0     size of the first dimension
+ *
+ * @return the number of element processed along the X axis per thread
+ */
+inline unsigned int adjust_vec_size(unsigned int vec_size, size_t dim0)
+{
+    ARM_COMPUTE_ERROR_ON(vec_size > 16);
+
+    if((vec_size >= dim0) && (dim0 == 3))
+    {
+        return dim0;
+    }
+
+    while(vec_size > dim0)
+    {
+        vec_size >>= 1;
+    }
+
+    return vec_size;
+}
+
 #ifdef ARM_COMPUTE_ASSERTS_ENABLED
 /** Print consecutive elements to an output stream.
  *
diff --git a/arm_compute/core/Version.h b/arm_compute/core/Version.h
index 3a2c783223..a4d307950a 100644
--- a/arm_compute/core/Version.h
+++ b/arm_compute/core/Version.h
@@ -27,12 +27,12 @@
 #include <string>
 
 /* Macro utilities */
-#define STRINGIFY2(s) #s
-#define STRINGIFY(s) STRINGIFY2(s)
+#define ARM_COMPUTE_STRINGIFY2(s) #s
+#define ARM_COMPUTE_STRINGIFY(s) ARM_COMPUTE_STRINGIFY2(s)
 
-#define ARM_COMPUTE_VERSION_STR          \
-    STRINGIFY(ARM_COMPUTE_VERSION_MAJOR) \
-    "." STRINGIFY(ARM_COMPUTE_VERSION_MINOR) "." STRINGIFY(ARM_COMPUTE_VERSION_PATCH)
+#define ARM_COMPUTE_VERSION_STR                      \
+    ARM_COMPUTE_STRINGIFY(ARM_COMPUTE_VERSION_MAJOR) \
+    "." ARM_COMPUTE_STRINGIFY(ARM_COMPUTE_VERSION_MINOR) "."
ARM_COMPUTE_STRINGIFY(ARM_COMPUTE_VERSION_PATCH) namespace arm_compute { @@ -45,4 +45,7 @@ namespace arm_compute std::string build_information(); } // namespace arm_compute +#undef ARM_COMPUTE_STRINGIFY +#undef ARM_COMPUTE_STRINGIFY2 + #endif /* ARM_COMPUTE_LIBRARY_VERSION_H */ diff --git a/arm_compute/core/Window.h b/arm_compute/core/Window.h index 2ba5440c68..150320a90e 100644 --- a/arm_compute/core/Window.h +++ b/arm_compute/core/Window.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -45,6 +45,8 @@ class Window static constexpr size_t DimY = 1; /** Alias for dimension 2 also known as Z dimension */ static constexpr size_t DimZ = 2; + /** Alias for dimension 3 also known as W dimension */ + static constexpr size_t DimW = 3; /** Default constructor: create a window containing a single element. */ constexpr Window() diff --git a/arm_compute/core/Window.inl b/arm_compute/core/Window.inl index 14a432a0c0..6100d09a1c 100644 --- a/arm_compute/core/Window.inl +++ b/arm_compute/core/Window.inl @@ -197,15 +197,15 @@ inline Window Window::split_window(size_t dimension, size_t id, size_t total) co { if(d == dimension) { - int start = _dims[d].start(); - int end = _dims[d].end(); - const int step = _dims[d].step(); + int start = _dims[d].start(); + int end = _dims[d].end(); + const int step = _dims[d].step(); const int num_it = num_iterations(d); const int rem = num_it % total; - int work = num_it / total; + int work = num_it / total; - int it_start = work * id; + int it_start = work * id; if(int(id) < rem) { @@ -277,7 +277,7 @@ inline void Window::use_tensor_dimensions(const TensorShape &shape, size_t first { for(unsigned int n = first_dimension; n < shape.num_dimensions(); ++n) { - set(n, Window::Dimension(0, std::max(shape[n], static_cast(1)))); + set(n, Window::Dimension(0, std::max(shape[n], static_cast(1)))); } } diff --git a/arm_compute/core/utils/math/SafeOps.h b/arm_compute/core/utils/math/SafeOps.h index 4f81cf4b44..c222c65e84 100644 --- a/arm_compute/core/utils/math/SafeOps.h +++ b/arm_compute/core/utils/math/SafeOps.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -25,7 +25,9 @@ #define ARM_COMPUTE_UTILS_MATH_SAFE_OPS #include "arm_compute/core/Error.h" -#include "arm_compute/core/utils/misc/Requires.h" +#include "support/Requires.h" + +#include namespace arm_compute { diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h index 0be4caf2b5..5ed8aea277 100644 --- a/arm_compute/core/utils/misc/ShapeCalculator.h +++ b/arm_compute/core/utils/misc/ShapeCalculator.h @@ -47,13 +47,13 @@ namespace shape_calculator * * @return the calculated shape */ -inline TensorShape calculate_reduce_mean_shape(ITensor *input, const Coordinates &reduction_axis, bool keep_dims) +inline TensorShape calculate_reduce_mean_shape(ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims) { const int reduction_ops = reduction_axis.num_dimensions(); Coordinates axis_local = reduction_axis; - const int input_dims = input->info()->num_dimensions(); + const int input_dims = input->num_dimensions(); convert_negative_axis(axis_local, input_dims); - TensorShape out_shape = input->info()->tensor_shape(); + TensorShape out_shape = input->tensor_shape(); // Configure reshape layer if we want to drop the dimensions if(!keep_dims) { @@ -1083,24 +1083,24 @@ inline TensorShape compute_batch_to_space_shape(const ITensorInfo *input, const /** Calculate the depth to space output shape of a tensor * - * @param[in] input Input tensor info - * @param[in] block Block shape value + * @param[in] input_shape Input tensor shape + * @param[in] data_layout Operation data layout + * @param[in] block Block shape value * * @return the calculated shape */ -inline TensorShape compute_depth_to_space_shape(const ITensorInfo *input, int block) +inline TensorShape compute_depth_to_space_shape(const TensorShape &input_shape, DataLayout data_layout, int block) { ARM_COMPUTE_ERROR_ON(block < 2); - const DataLayout data_layout = input->data_layout(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - TensorShape output_shape{ input->tensor_shape() }; - output_shape.set(idx_width, input->dimension(idx_width) * block); - output_shape.set(idx_height, input->dimension(idx_height) * block); - output_shape.set(idx_channel, input->dimension(idx_channel) / (block * block)); + TensorShape output_shape{ input_shape }; + output_shape.set(idx_width, input_shape[idx_width] * block); + output_shape.set(idx_height, input_shape[idx_height] * block); + output_shape.set(idx_channel, input_shape[idx_channel] / (block * block)); return output_shape; } @@ -1157,9 +1157,12 @@ inline TensorShape compute_space_to_batch_shape(const ITensorInfo *input, const const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); const int idx_batch = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); - output_shape.set(idx_width, input->tensor_shape()[idx_width] * block_x + padding_left.x() + padding_right.x()); - output_shape.set(idx_height, 
input->tensor_shape()[idx_height] * block_y + padding_left.y() + padding_right.y());
-    output_shape.set(idx_batch, input->tensor_shape()[idx_batch] / (block_x * block_y));
+    ARM_COMPUTE_ERROR_ON((input->tensor_shape()[idx_width] + padding_left.x() + padding_right.x()) % block_x != 0);
+    ARM_COMPUTE_ERROR_ON((input->tensor_shape()[idx_height] + padding_left.y() + padding_right.y()) % block_y != 0);
+
+    output_shape.set(idx_width, (input->tensor_shape()[idx_width] + padding_left.x() + padding_right.x()) / block_x);
+    output_shape.set(idx_height, (input->tensor_shape()[idx_height] + padding_left.y() + padding_right.y()) / block_y);
+    output_shape.set(idx_batch, input->tensor_shape()[idx_batch] * block_x * block_y);
 
     return output_shape;
 }
diff --git a/arm_compute/core/utils/misc/Traits.h b/arm_compute/core/utils/misc/Traits.h
index 58fb1bff59..933922f63c 100644
--- a/arm_compute/core/utils/misc/Traits.h
+++ b/arm_compute/core/utils/misc/Traits.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_UTILS_TRAITS_TRAITS_H
 #define ARM_COMPUTE_UTILS_TRAITS_TRAITS_H
 
+#include "arm_compute/core/Types.h"
 #include <type_traits>
 
 namespace arm_compute
diff --git a/arm_compute/graph/GraphBuilder.h b/arm_compute/graph/GraphBuilder.h
index bce1ce4c40..f29db3f081 100644
--- a/arm_compute/graph/GraphBuilder.h
+++ b/arm_compute/graph/GraphBuilder.h
@@ -84,6 +84,21 @@ class GraphBuilder final
      */
     static NodeID add_activation_node(Graph &g, NodeParams params, NodeIdxPair input, ActivationLayerInfo act_info, const QuantizationInfo &out_quant_info = QuantizationInfo());
+    /** Adds an arg min max layer node to the graph
+     *
+     * @param[in] g              Graph to add the node to
+     * @param[in] params         Common node parameters
+     * @param[in] input          Input to the arg min max layer node as a NodeID-Index pair
+     * @param[in] op             Reduction Operation: min or max
+     * @param[in] axis           Axis to perform reduction operation across
+     * @param[in] out_data_type  (Optional) Output data type
+     * @param[in] out_quant_info (Optional) Output quantization info
+     *
+     * @return Node ID of the created node, EmptyNodeID in case of error
+     */
+    static NodeID add_arg_min_max_node(Graph &g, NodeParams params, NodeIdxPair input, ReductionOperation op, unsigned int axis,
+                                       DataType out_data_type = DataType::UNKNOWN,
+                                       const QuantizationInfo &out_quant_info = QuantizationInfo());
     /** Adds a batch normalization layer node to the graph
      *
      * @param[in] g Graph to add the node to
@@ -173,6 +188,16 @@ class GraphBuilder final
      * @return Node ID of the created node, EmptyNodeID in case of error
      */
     static NodeID add_concatenate_node(Graph &g, NodeParams params, const std::vector<NodeIdxPair> &inputs, const descriptors::ConcatLayerDescriptor &concat_descriptor);
+    /** Adds a depth to space layer node to the graph
+     *
+     * @param[in] g           Graph to add the node to
+     * @param[in] params      Common node parameters
+     * @param[in] input       Input to the depth to space layer node as a NodeID-Index pair
+     * @param[in] block_shape Block shape to reshape tensor with
+     *
+     * @return Node ID of the created node, EmptyNodeID in case of error
+     */
+    static NodeID add_depth_to_space_node(Graph &g, NodeParams params, NodeIdxPair input, int32_t block_shape);
     /** Adds a depth-wise convolution layer node to the graph
      *
      * @param[in] g Graph to add the node to
@@ -311,6 +336,17 @@ class GraphBuilder final
      */
     static NodeID add_generate_proposals_node(Graph &g, NodeParams params, NodeIdxPair scores, NodeIdxPair deltas,
NodeIdxPair anchors, GenerateProposalsInfo info);
+    /** Adds an L2 Normalize layer node to the graph
+     *
+     * @param[in] g       Graph to add the node to
+     * @param[in] params  Common node parameters
+     * @param[in] input   Input to the L2 Normalize layer node as a NodeID-Index pair
+     * @param[in] axis    Axis to perform normalization on
+     * @param[in] epsilon Lower bound value for the normalization
+     *
+     * @return Node ID of the created node, EmptyNodeID in case of error
+     */
+    static NodeID add_l2_normalize_node(Graph &g, NodeParams params, NodeIdxPair input, int axis, float epsilon);
     /** Adds a normalization layer node to the graph
      *
      * @param[in] g Graph to add the node to
@@ -411,6 +447,18 @@ class GraphBuilder final
      * @return Node ID of the created node, EmptyNodeID in case of error
      */
     static NodeID add_quantization_node(Graph &g, NodeParams params, NodeIdxPair input, const QuantizationInfo &out_quant_info);
+    /** Adds a reduction operation layer node to the graph
+     *
+     * @param[in] g         Graph to add the node to
+     * @param[in] params    Common node parameters
+     * @param[in] input     Input to the reduction operation layer node as a NodeID-Index pair
+     * @param[in] op        Reduction operation
+     * @param[in] axis      Reduction axis
+     * @param[in] keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true.
+     *
+     * @return Node ID of the created node, EmptyNodeID in case of error
+     */
+    static NodeID add_reduction_operation_node(Graph &g, NodeParams params, NodeIdxPair input, ReductionOperation op, int axis, bool keep_dims = true);
     /** Adds a reorg layer node to the graph
      *
      * @param[in] g Graph to add the node to
@@ -510,6 +558,19 @@ class GraphBuilder final
      * @return Node ID of the created node, EmptyNodeID in case of error
      */
     static NodeID add_stack_node(Graph &g, NodeParams params, const std::vector<NodeIdxPair> &inputs, int axis);
+    /** Adds a strided slice node to the graph
+     *
+     * @param[in] g       Graph to add the node to
+     * @param[in] params  Common node parameters
+     * @param[in] input   Input to the strided slice layer node as a NodeID-Index pair
+     * @param[in] starts  The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in] ends    The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in] strides The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in] info    Contains masks for the starts, ends and strides
+     *
+     * @return Node ID of the created node, EmptyNodeID in case of error
+     */
+    static NodeID add_strided_slice_node(Graph &g, NodeParams params, NodeIdxPair input, Coordinates &starts, Coordinates &ends, BiStrides &strides, StridedSliceLayerInfo info);
     /** Adds an upsample layer to the graph
      *
      * @param[in] g Graph to add the node to
diff --git a/arm_compute/graph/TensorDescriptor.h b/arm_compute/graph/TensorDescriptor.h
index 6c6f99d69c..de67289bc8 100644
--- a/arm_compute/graph/TensorDescriptor.h
+++ b/arm_compute/graph/TensorDescriptor.h
@@ -26,7 +26,7 @@
 #include "arm_compute/graph/Types.h"
 
-#include "arm_compute/core/utils/misc/ICloneable.h"
+#include "support/ICloneable.h"
 #include "support/MemorySupport.h"
 
 #include <memory>
diff --git a/arm_compute/graph/TypeLoader.h b/arm_compute/graph/TypeLoader.h
index a53af40f77..286bfebeb5 100644
--- a/arm_compute/graph/TypeLoader.h
+++ b/arm_compute/graph/TypeLoader.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -30,29 +30,6 @@ namespace arm_compute { -/** Converts a string to a strong types enumeration @ref DataType - * - * @param[in] name String to convert - * - * @return Converted DataType enumeration - */ -arm_compute::DataType data_type_from_name(const std::string &name); - -/** Input Stream operator for @ref DataType - * - * @param[in] stream Stream to parse - * @param[out] data_type Output data type - * - * @return Updated stream - */ -inline ::std::istream &operator>>(::std::istream &stream, arm_compute::DataType &data_type) -{ - std::string value; - stream >> value; - data_type = data_type_from_name(value); - return stream; -} - /** Converts a string to a strong types enumeration @ref DataLayout * * @param[in] name String to convert diff --git a/arm_compute/graph/TypePrinter.h b/arm_compute/graph/TypePrinter.h index 5d6c9f308d..e8024980c1 100644 --- a/arm_compute/graph/TypePrinter.h +++ b/arm_compute/graph/TypePrinter.h @@ -65,6 +65,9 @@ inline ::std::ostream &operator<<(::std::ostream &os, const NodeType &node_type) case NodeType::ActivationLayer: os << "ActivationLayer"; break; + case NodeType::ArgMinMaxLayer: + os << "ArgMinMaxLayer"; + break; case NodeType::BatchNormalizationLayer: os << "BatchNormalizationLayer"; break; @@ -83,6 +86,9 @@ inline ::std::ostream &operator<<(::std::ostream &os, const NodeType &node_type) case NodeType::DeconvolutionLayer: os << "DeconvolutionLayer"; break; + case NodeType::DepthToSpaceLayer: + os << "DepthToSpaceLayer"; + break; case NodeType::DequantizationLayer: os << "DequantizationLayer"; break; @@ -116,6 +122,9 @@ inline ::std::ostream &operator<<(::std::ostream &os, const NodeType &node_type) case NodeType::GenerateProposalsLayer: os << "GenerateProposalsLayer"; break; + case NodeType::L2NormalizeLayer: + os << "L2NormalizeLayer"; + break; case NodeType::NormalizationLayer: os << "NormalizationLayer"; break; @@ -143,6 +152,9 @@ inline ::std::ostream &operator<<(::std::ostream &os, const NodeType &node_type) case NodeType::QuantizationLayer: os << "QuantizationLayer"; break; + case NodeType::ReductionOperationLayer: + os << "ReductionOperationLayer"; + break; case NodeType::ReorgLayer: os << "ReorgLayer"; break; @@ -167,6 +179,9 @@ inline ::std::ostream &operator<<(::std::ostream &os, const NodeType &node_type) case NodeType::StackLayer: os << "StackLayer"; break; + case NodeType::StridedSliceLayer: + os << "StridedSliceLayer"; + break; case NodeType::UpsampleLayer: os << "UpsampleLayer"; break; diff --git a/arm_compute/graph/Types.h b/arm_compute/graph/Types.h index 3a4d0a6070..5f851ac67e 100644 --- a/arm_compute/graph/Types.h +++ b/arm_compute/graph/Types.h @@ -104,6 +104,7 @@ enum class EltwiseOperation Add, /**< Arithmetic addition */ Sub, /**< Arithmetic subtraction */ Mul, /**< Arithmetic multiplication */ + Max, /**< Arithmetic maximum */ }; /** Supported Unary Element-wise operations */ @@ -140,12 +141,14 @@ enum class FastMathHint enum class NodeType { ActivationLayer, + ArgMinMaxLayer, BatchNormalizationLayer, BoundingBoxTransformLayer, ChannelShuffleLayer, ConcatenateLayer, ConvolutionLayer, DeconvolutionLayer, + DepthToSpaceLayer, DepthwiseConvolutionLayer, DequantizationLayer, DetectionOutputLayer, @@ -156,6 +159,7 @@ enum class NodeType FusedConvolutionBatchNormalizationLayer, FusedDepthwiseConvolutionBatchNormalizationLayer, GenerateProposalsLayer, + L2NormalizeLayer, NormalizationLayer, NormalizePlanarYUVLayer, PadLayer, @@ -165,6 +169,7 @@ enum class NodeType PrintLayer, PriorBoxLayer, 
     QuantizationLayer,
+    ReductionOperationLayer,
     ReorgLayer,
     ReshapeLayer,
     ResizeLayer,
@@ -173,6 +178,7 @@ enum class NodeType
     SliceLayer,
     SplitLayer,
     StackLayer,
+    StridedSliceLayer,
     UpsampleLayer,
     UnaryEltwiseLayer,
     YOLOLayer,
diff --git a/arm_compute/graph/backends/FunctionHelpers.h b/arm_compute/graph/backends/FunctionHelpers.h
index af748341a5..e2904af0b5 100644
--- a/arm_compute/graph/backends/FunctionHelpers.h
+++ b/arm_compute/graph/backends/FunctionHelpers.h
@@ -37,7 +37,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/core/utils/misc/Cast.h"
+#include "support/Cast.h"
 
 namespace arm_compute
 {
@@ -131,6 +131,43 @@ std::unique_ptr<IFunction> create_activation_layer(ActivationLayerNode &node)
     return RETURN_UNIQUE_PTR(func);
 }
 
+/** Creates a backend argminmax layer function
+ *
+ * @tparam ArgMinMaxLayerFunction Backend argminmax function
+ * @tparam TargetInfo             Target-specific information
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend argminmax layer function
+ */
+template <typename ArgMinMaxLayerFunction, typename TargetInfo>
+std::unique_ptr<IFunction> create_arg_min_max_layer(ArgMinMaxLayerNode &node)
+{
+    validate_node<TargetInfo>(node, 1 /* expected inputs */, 1 /* expected outputs */);
+
+    // Extract IO and info
+    typename TargetInfo::TensorType *input  = get_backing_tensor<TargetInfo>(node.input(0));
+    typename TargetInfo::TensorType *output = get_backing_tensor<TargetInfo>(node.output(0));
+    const ReductionOperation op             = node.reduction_operation();
+    unsigned int axis                       = node.axis();
+
+    // Create function
+    auto func = support::cpp14::make_unique<ArgMinMaxLayerFunction>();
+    func->configure(input, axis, output, op);
+
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
+                               << node.name()
+                               << " Type: " << node.type()
+                               << " Target: " << TargetInfo::TargetType
+                               << " Data Type: " << input->info()->data_type()
+                               << " Shape: " << input->info()->tensor_shape()
+                               << " Reduction Operation: " << op
+                               << " axis: " << axis
+                               << std::endl);
+
+    return RETURN_UNIQUE_PTR(func);
+}
+
 /** Create a backend batch normalization layer function
  *
  * @tparam BatchNormalizationLayerFunction Backend batch normalization function
@@ -614,6 +651,45 @@ std::unique_ptr<IFunction> create_depthwise_convolution_layer(DepthwiseConvolutionLayerNode &node)
     return RETURN_UNIQUE_PTR(func);
 }
 
+/** Create a backend depth to space layer function
+ *
+ * @tparam DepthToSpaceLayerFunction Backend depth to space function
+ * @tparam TargetInfo                Target-specific information
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend depth to space layer function
+ */
+template <typename DepthToSpaceLayerFunction, typename TargetInfo>
+std::unique_ptr<IFunction> create_depth_to_space_layer(DepthToSpaceLayerNode &node)
+{
+    validate_node<TargetInfo>(node, 1 /* expected inputs */, 1 /* expected outputs */);
+
+    // Extract IO and info
+    typename TargetInfo::TensorType *input  = get_backing_tensor<TargetInfo>(node.input(0));
+    typename TargetInfo::TensorType *output = get_backing_tensor<TargetInfo>(node.output(0));
+
+    ARM_COMPUTE_ERROR_ON(input == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // Create and configure function
+    auto func = support::cpp14::make_unique<DepthToSpaceLayerFunction>();
+    func->configure(input, output, node.block_shape());
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
+                               << node.name()
+                               << " Type: " << node.type()
+                               << " Target: " << TargetInfo::TargetType
+                               << " Data Type: " << input->info()->data_type()
+                               << " Input shape: " << input->info()->tensor_shape()
+                               << " Block Size: " << node.block_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << std::endl);
+
+    return RETURN_UNIQUE_PTR(func);
+} + /** Create a backend dequantize layer function * * @tparam DequantizationLayer Function Backend dequantize function @@ -798,6 +874,12 @@ std::unique_ptr create_eltwise_layer(EltwiseLayerNode &node) std::string("PixelWiseMultiplication"), input1, input2, output, 1.f, convert_policy, node.rounding_policy(), act_info); } + else if(eltwise_op == EltwiseOperation::Max) + { + std::tie(func, func_name) = create_named_function( + std::string("ElementwiseMaximum"), + input1, input2, output, act_info); + } else { ARM_COMPUTE_ERROR("Unsupported element-wise operation!"); @@ -1007,6 +1089,50 @@ std::unique_ptr create_generate_proposals_layer(GenerateProposalsLaye return RETURN_UNIQUE_PTR(func); } +/** Create a backend l2 normalization layer function + * + * @tparam NormalizationLayerFunction Backend normalization function + * @tparam TargetInfo Target-specific information + * + * @param[in] node Node to create the backend function for + * @param[in] ctx Graph context + * + * @return Backend normalization layer function + */ +template +std::unique_ptr create_l2_normalize_layer(L2NormalizeLayerNode &node, GraphContext &ctx) +{ + validate_node(node, 1 /* expected inputs */, 1 /* expected outputs */); + + // Extract IO and info + typename TargetInfo::TensorType *input = get_backing_tensor(node.input(0)); + typename TargetInfo::TensorType *output = get_backing_tensor(node.output(0)); + int axis = node.axis(); + float epsilon = node.epsilon(); + + ARM_COMPUTE_ERROR_ON(input == nullptr); + ARM_COMPUTE_ERROR_ON(output == nullptr); + + // Create and configure function + auto mm = get_memory_manager(ctx, TargetInfo::TargetType); + auto func = support::cpp14::make_unique(mm); + func->configure(input, output, axis, epsilon); + + // Log info + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " + << node.name() + << " Type: " << node.type() + << " Target: " << TargetInfo::TargetType + << " Data Type: " << input->info()->data_type() + << " Input shape: " << input->info()->tensor_shape() + << " Output shape: " << output->info()->tensor_shape() + << " Axis: " << axis + << " Epsilon: " << epsilon + << std::endl); + + return RETURN_UNIQUE_PTR(func); +} + /** Create a backend normalization layer function * * @tparam NormalizationLayerFunction Backend normalization function @@ -1352,6 +1478,50 @@ std::unique_ptr create_quantization_layer(QuantizationLayerNode &node return RETURN_UNIQUE_PTR(func); } +/** Create a backend reduction operation layer function + * + * @tparam ReductionOperationFunction Backend reduction operation function + * @tparam TargetInfo Target-specific information + * + * @param[in] node Node to create the backend function for + * @param[in] ctx Graph context + * + * @return Backend reduction sum layer function + */ +template +std::unique_ptr create_reduction_operation_layer(ReductionLayerNode &node, GraphContext &ctx) +{ + validate_node(node, 1 /* expected inputs */, 1 /* expected outputs */); + + // Extract IO and info + typename TargetInfo::TensorType *input = get_backing_tensor(node.input(0)); + typename TargetInfo::TensorType *output = get_backing_tensor(node.output(0)); + ReductionOperation op = node.op(); + int axis = node.axis(); + bool keep_dims = node.keep_dims(); + ARM_COMPUTE_ERROR_ON(input == nullptr); + ARM_COMPUTE_ERROR_ON(output == nullptr); + + // Create and configure function + auto func = support::cpp14::make_unique(get_memory_manager(ctx, TargetInfo::TargetType)); + func->configure(input, output, axis, op, keep_dims); + + // Log info + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " + << 
node.name() + << " Type: " << node.type() + << " Target: " << TargetInfo::TargetType + << " Data Type: " << input->info()->data_type() + << " Input shape: " << input->info()->tensor_shape() + << " Output shape: " << output->info()->tensor_shape() + << " Operation: " << op + << " Axis: " << axis + << " Keep dimensions:" << keep_dims + << std::endl); + + return RETURN_UNIQUE_PTR(func); +} + /** Create a backend reorg layer function * * @tparam ReorgLayerFunction Backend reorg function @@ -1628,6 +1798,49 @@ std::unique_ptr create_stack_layer(StackLayerNode &node) return RETURN_UNIQUE_PTR(func); } + +/** Create a backend slice layer function + * + * @tparam StridedSliceLayerFunction Backend strided slice function + * @tparam TargetInfo Target-specific information + * + * @param[in] node Node to create the backend function for + * + * @return Backend strided slice layer function + */ +template +std::unique_ptr create_strided_slice_layer(StridedSliceLayerNode &node) +{ + validate_node(node, 1 /* expected inputs */, 1 /* expected outputs */); + + // Extract IO and info + typename TargetInfo::TensorType *input = get_backing_tensor(node.input(0)); + typename TargetInfo::TensorType *output = get_backing_tensor(node.output(0)); + Coordinates starts = node.starts(); + Coordinates ends = node.ends(); + BiStrides strides = node.strides(); + StridedSliceLayerInfo info = node.strided_slice_info(); + + ARM_COMPUTE_ERROR_ON(input == nullptr); + ARM_COMPUTE_ERROR_ON(output == nullptr); + + // Create and configure function + auto func = support::cpp14::make_unique(); + func->configure(input, output, starts, ends, strides, info.begin_mask(), info.end_mask(), info.shrink_axis_mask()); + + // Log info + ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " + << node.name() + << " Type: " << node.type() + << " Target: " << TargetInfo::TargetType + << " Data Type: " << input->info()->data_type() + << " Input shape: " << input->info()->tensor_shape() + << " Output shape: " << output->info()->tensor_shape() + << std::endl); + + return RETURN_UNIQUE_PTR(func); +} + /** Create a backend Upsample layer function * * @tparam UpsampleLayerFunction Backend Upsample function diff --git a/arm_compute/graph/backends/Utils.h b/arm_compute/graph/backends/Utils.h index 0322ec5fab..7d67f3b9e3 100644 --- a/arm_compute/graph/backends/Utils.h +++ b/arm_compute/graph/backends/Utils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -42,7 +42,7 @@ namespace backends * @return A configured backend function */ template -std::pair, FunctionNameType> create_named_function(FunctionNameType name, ParameterType... args) +std::tuple, FunctionNameType> create_named_function(FunctionNameType name, ParameterType... args) { auto f = arm_compute::support::cpp14::make_unique(); f->configure(std::forward(args)...); @@ -58,9 +58,9 @@ std::pair, FunctionNameType> create_name * @return A configured backend function */ template -std::pair, FunctionNameType> create_named_memory_managed_function(FunctionNameType name, - MemoryManagerType mm, - ParameterType... args) +std::tuple, FunctionNameType> create_named_memory_managed_function(FunctionNameType name, + MemoryManagerType mm, + ParameterType... 
args)
 {
     auto f = arm_compute::support::cpp14::make_unique<FunctionType>(mm);
     f->configure(std::forward<ParameterType>(args)...);
diff --git a/arm_compute/graph/backends/ValidateHelpers.h b/arm_compute/graph/backends/ValidateHelpers.h
index c929983834..dd519fbd5e 100644
--- a/arm_compute/graph/backends/ValidateHelpers.h
+++ b/arm_compute/graph/backends/ValidateHelpers.h
@@ -52,6 +52,29 @@ inline arm_compute::ITensorInfo *get_backing_tensor_info(arm_compute::graph::Tensor *tensor)
     return ((tensor == nullptr) || (tensor->handle() == nullptr)) ? nullptr : tensor->handle()->tensor().info();
 }
 
+/** Validates a ArgMinMax layer node
+ *
+ * @tparam ArgMinMaxLayer ArgMinMax layer function type
+ *
+ * @param[in] node Node to validate
+ *
+ * @return Status
+ */
+template <typename ArgMinMaxLayer>
+Status validate_arg_min_max_layer(ArgMinMaxLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ArgMinMaxLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    arm_compute::ITensorInfo *input  = detail::get_backing_tensor_info(node.input(0));
+    arm_compute::ITensorInfo *output = get_backing_tensor_info(node.output(0));
+
+    // Validate function
+    return ArgMinMaxLayer::validate(input, node.axis(), output, node.reduction_operation());
+}
+
 /** Validates a Bounding Box Transform layer node
  *
  * @tparam BoundingBoxTransformLayer Bounding Box Transform layer function type
@@ -199,6 +222,27 @@ Status validate_depthwise_convolution_layer(DepthwiseConvolutionLayerNode &node)
     return status;
 }
+/** Validates a depth to space layer node
+ *
+ * @tparam DepthToSpaceLayer Depth to space layer type
+ *
+ * @param[in] node Node to validate
+ *
+ * @return Status
+ */
+template <typename DepthToSpaceLayer>
+Status validate_depth_to_space_layer(DepthToSpaceLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating DepthToSpaceLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    arm_compute::ITensorInfo *input  = get_backing_tensor_info(node.input(0));
+    arm_compute::ITensorInfo *output = get_backing_tensor_info(node.output(0));
+
+    return DepthToSpaceLayer::validate(input, output, node.block_shape());
+}
 /** Validates a dequantize layer node
  *
  * @tparam DequantizationLayer Dequantize layer type
@@ -299,6 +343,31 @@ Status validate_generate_proposals_layer(GenerateProposalsLayerNode &node)
     return GenerateProposalsLayer::validate(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info);
 }
 
+/** Validates a L2Normalization layer node
+ *
+ * @tparam L2NormalizeLayer L2Normalization layer type
+ *
+ * @param[in] node Node to validate
+ *
+ * @return Status
+ */
+template <typename L2NormalizeLayer>
+Status validate_l2_normalize_layer(L2NormalizeLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating L2NormalizeLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    arm_compute::ITensorInfo *input  = detail::get_backing_tensor_info(node.input(0));
+    arm_compute::ITensorInfo *output = get_backing_tensor_info(node.output(0));
+    int axis                         = node.axis();
+    float epsilon                    = node.epsilon();
+
+    // Validate function
+    return L2NormalizeLayer::validate(input, output, axis, epsilon);
+}
+
 /** Validates a NormalizePlanarYUV layer node
  *
  * @tparam
NormalizePlanarYUVLayer layer type @@ -440,6 +509,30 @@ Status validate_quantization_layer(QuantizationLayerNode &node) return QuantizationLayer::validate(input, output); } +/** Validates a Reduction operation layer node + * + * @tparam ReductionLayer Reduction layer type + * + * @param[in] node Node to validate + * + * @return Status + */ +template +Status validate_reduction_operation_layer(ReductionLayerNode &node) +{ + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ReductionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + + ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1); + ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); + + // Extract input and output + arm_compute::ITensorInfo *input = detail::get_backing_tensor_info(node.input(0)); + arm_compute::ITensorInfo *output = get_backing_tensor_info(node.output(0)); + + // Validate function + return ReductionLayer::validate(input, output, node.axis(), node.op(), node.keep_dims()); +} + /** Validates a Reorg layer node * * @tparam ReorgLayer Reorg layer type @@ -535,6 +628,32 @@ Status validate_slice_layer(SliceLayerNode &node) return SliceLayer::validate(input, output, starts, ends); } +/** Validates a Strided Slice layer node + * + * @tparam StridedSliceLayer Strided Slice layer function type + * + * @param[in] node Node to validate + * + * @return Status + */ +template +Status validate_strided_slice_layer(StridedSliceLayerNode &node) +{ + ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating StridedSlice node with ID : " << node.id() << " and Name: " << node.name() << std::endl); + ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1); + ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1); + + // Extract IO and info + arm_compute::ITensorInfo *input = get_backing_tensor_info(node.input(0)); + arm_compute::ITensorInfo *output = get_backing_tensor_info(node.output(0)); + const Coordinates starts = node.starts(); + const Coordinates ends = node.ends(); + const BiStrides strides = node.strides(); + const StridedSliceLayerInfo info = node.strided_slice_info(); + + return StridedSliceLayer::validate(input, output, starts, ends, strides, info.begin_mask(), info.end_mask(), info.shrink_axis_mask()); +} + /** Validates a Upsample layer node * * @tparam UpsampleLayer Upsample layer type @@ -601,7 +720,6 @@ Status validate_eltwise_Layer(EltwiseLayerNode &node) const RoundingPolicy round_policy = node.rounding_policy(); const ActivationLayerInfo act_info = node.fused_activation(); const QuantizationInfo quant_info = node.output_quant_info(); - const float scale = (quant_info.scale().empty()) ? 
- const float scale = (quant_info.scale().empty()) ? 1.0f : quant_info.scale()[0]; // Validate function if(eltwise_op == EltwiseOperation::Add) @@ -614,7 +732,11 @@ Status validate_eltwise_Layer(EltwiseLayerNode &node) } else if(eltwise_op == EltwiseOperation::Mul) { - return EltwiseLayerFunctions::PixelWiseMultiplication::validate(input1, input2, output, scale, convert_policy, round_policy, act_info); + return EltwiseLayerFunctions::PixelWiseMultiplication::validate(input1, input2, output, 1.0f, convert_policy, round_policy, act_info); + } + else if(eltwise_op == EltwiseOperation::Max) + { + return EltwiseLayerFunctions::ElementwiseMax::validate(input1, input2, output, act_info); } else { diff --git a/arm_compute/graph/frontend/Layers.h b/arm_compute/graph/frontend/Layers.h index 6aeebb45ec..74c40126c8 100644 --- a/arm_compute/graph/frontend/Layers.h +++ b/arm_compute/graph/frontend/Layers.h @@ -145,6 +145,48 @@ class ActivationLayer final : public ILayer const QuantizationInfo _out_quant_info; }; +/** ArgMinMax Layer */ +class ArgMinMaxLayer final : public ILayer +{ +public: + /** Construct an ArgMinMax layer. + * + * @param[in] op Reduction Operation: min or max + * @param[in] axis Axis to perform reduction along + * @param[in] out_data_type (Optional) Output tensor data type + * @param[in] out_quant_info (Optional) Output quantization info + */ + ArgMinMaxLayer(ReductionOperation op, + unsigned int axis, + DataType out_data_type = DataType::UNKNOWN, + const QuantizationInfo out_quant_info = QuantizationInfo()) + : _op(op), + _axis(axis), + _out_data_type(out_data_type), + _out_quant_info(std::move(out_quant_info)) + { + } + + /** Create layer and add to the given stream. + * + * @param[in] s Stream to add layer to. + * + * @return ID of the created node. + */ + NodeID create_layer(IStream &s) override + { + NodeParams common_params = { name(), s.hints().target_hint }; + NodeIdxPair input = { s.tail_node(), 0 }; + return GraphBuilder::add_arg_min_max_node(s.graph(), common_params, input, _op, _axis, _out_data_type, std::move(_out_quant_info)); + } + +private: + ReductionOperation _op; + unsigned int _axis; + DataType _out_data_type; + QuantizationInfo _out_quant_info; +}; + /** Batchnormalization Layer */ class BatchNormalizationLayer final : public ILayer { @@ -489,6 +531,31 @@ class DepthwiseConvolutionLayer final : public ILayer const QuantizationInfo _weights_quant_info; const QuantizationInfo _out_quant_info; }; + +/** DepthToSpace Layer */ +class DepthToSpaceLayer final : public ILayer +{ +public: + /** Construct a DepthToSpace layer. + * + * @param[in] block_shape Block size used to rearrange the input + */ + DepthToSpaceLayer(int32_t block_shape) + : _block_shape(block_shape) + { + } + + NodeID create_layer(IStream &s) override + { + NodeParams common_params = { name(), s.hints().target_hint }; + NodeIdxPair input = { s.tail_node(), 0 }; + return GraphBuilder::add_depth_to_space_node(s.graph(), common_params, input, _block_shape); + } + +private: + int32_t _block_shape; +}; + /** Dequantization Layer */ class DequantizationLayer final : public ILayer { @@ -771,6 +838,32 @@ class GenerateProposalsLayer final : public ILayer GenerateProposalsInfo _info; };
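The frontend layers added in this hunk are meant to be chained through the stream API via GraphBuilder. A hedged usage sketch, assuming the DummyAccessor helper from utils/GraphUtils.h and the make_unique shim this codebase already uses; target, shapes, axis and epsilon are illustrative only:

```cpp
#include "arm_compute/graph.h"
#include "support/MemorySupport.h" // assumed location of support::cpp14::make_unique in this release
#include "utils/GraphUtils.h"

using namespace arm_compute;
using namespace arm_compute::graph::frontend;

// Hedged sketch: chains two of the layers added in this patch through the
// frontend stream. DummyAccessor is the test helper from utils/GraphUtils.h.
int main()
{
    Stream stream(0, "new_layers_sketch");
    stream << graph::Target::NEON
           << InputLayer(graph::TensorDescriptor(TensorShape(8U, 8U, 16U, 1U), DataType::F32),
                         support::cpp14::make_unique<graph_utils::DummyAccessor>())
           << L2NormalizeLayer(0, 1e-12f)                        // axis, epsilon
           << ArgMinMaxLayer(ReductionOperation::ARG_IDX_MAX, 2) // op, axis
           << OutputLayer(support::cpp14::make_unique<graph_utils::DummyAccessor>());

    graph::GraphConfig config;
    stream.finalize(graph::Target::NEON, config);
    stream.run();
    return 0;
}
```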
+/** L2 Normalize Layer */ +class L2NormalizeLayer final : public ILayer +{ +public: + /** Construct an L2 Normalize layer. + * + * @param[in] axis Axis to perform normalization on + * @param[in] epsilon Lower bound value for the normalization + */ + L2NormalizeLayer(int axis, float epsilon) + : _axis(axis), _epsilon(epsilon) + { + } + + NodeID create_layer(IStream &s) override + { + NodeParams common_params = { name(), s.hints().target_hint }; + NodeIdxPair input = { s.tail_node(), 0 }; + return GraphBuilder::add_l2_normalize_node(s.graph(), common_params, input, _axis, _epsilon); + } + +private: + int _axis; + float _epsilon; +}; + /** Normalization Layer */ class NormalizationLayer final : public ILayer { @@ -1040,6 +1133,34 @@ class QuantizationLayer final : public ILayer QuantizationInfo _out_quant_info; }; +/** Reduction Layer */ +class ReductionLayer final : public ILayer +{ +public: + /** Construct a reduction layer. + * + * @param[in] op Reduction operation + * @param[in] axis Reduction axis + * @param[in] keep_dims Whether to keep the reduced dimension after the operation + */ + ReductionLayer(ReductionOperation op, unsigned int axis, bool keep_dims) + : _op(op), _axis(axis), _keep_dims(keep_dims) + { + } + + NodeID create_layer(IStream &s) override + { + NodeParams common_params = { name(), s.hints().target_hint }; + NodeIdxPair input = { s.tail_node(), 0 }; + return GraphBuilder::add_reduction_operation_node(s.graph(), common_params, input, _op, _axis, _keep_dims); + } + +private: + ReductionOperation _op; + unsigned int _axis; + bool _keep_dims; +}; + /** Reorg Layer */ class ReorgLayer final : public ILayer { @@ -1308,6 +1429,36 @@ class StackLayer final : public ILayer int _axis; }; +/** StridedSlice Layer */ +class StridedSliceLayer final : public ILayer +{ +public: + /** Construct a strided slice layer. + * + * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input). + * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input). + * @param[in] strides The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input). + * @param[in] strided_slice_info Contains masks for the starts, ends and strides + */ + StridedSliceLayer(Coordinates &starts, Coordinates &ends, BiStrides &strides, StridedSliceLayerInfo strided_slice_info) + : _starts(starts), _ends(ends), _strides(strides), _info(strided_slice_info) + { + } + + NodeID create_layer(IStream &s) override + { + NodeParams common_params = { name(), s.hints().target_hint }; + NodeIdxPair input = { s.tail_node(), 0 }; + return GraphBuilder::add_strided_slice_node(s.graph(), common_params, input, _starts, _ends, _strides, _info); + } + +private: + Coordinates _starts; + Coordinates _ends; + BiStrides _strides; + StridedSliceLayerInfo _info; +}; + /** Upsample Layer */ class UpsampleLayer final : public ILayer { diff --git a/arm_compute/graph/nodes/ArgMinMaxLayerNode.h b/arm_compute/graph/nodes/ArgMinMaxLayerNode.h new file mode 100644 index 0000000000..69191add99 --- /dev/null +++ b/arm_compute/graph/nodes/ArgMinMaxLayerNode.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2020 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_GRAPH_ARGMINMAX_LAYER_NODE_H +#define ARM_COMPUTE_GRAPH_ARGMINMAX_LAYER_NODE_H + +#include "arm_compute/graph/INode.h" + +namespace arm_compute +{ +namespace graph +{ +/** Arg Min/Max Layer node */ +class ArgMinMaxLayerNode final : public INode +{ +public: + /** Constructor + * + * @param[in] op Operation to perform: min or max + * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3 + * @param[in] out_data_type (Optional) Output data type + * @param[in] out_quant_info (Optional) Output quantization info + */ + ArgMinMaxLayerNode(ReductionOperation op, + unsigned int axis, + DataType out_data_type = DataType::UNKNOWN, + QuantizationInfo out_quant_info = QuantizationInfo()); + /** Operator accessor + * + * @return The operator the layer performs: min or max + */ + ReductionOperation reduction_operation() const; + /** Axis accessor + * + * @return The axis along which the reduction is operating + */ + unsigned int axis() const; + /** Output data type accessor + * + * @return The output data type + */ + DataType out_data_type() const; + + // Inherited overridden methods: + NodeType type() const override; + bool forward_descriptors() override; + TensorDescriptor configure_output(size_t idx) const override; + void accept(INodeVisitor &v) override; + +public: + static constexpr NodeType node_type = NodeType::ArgMinMaxLayer; + +private: + ReductionOperation _op; + unsigned int _axis; + DataType _out_data_type; + QuantizationInfo _out_quant_info; +}; +} // namespace graph +} // namespace arm_compute +#endif /* ARM_COMPUTE_GRAPH_ARGMINMAX_LAYER_NODE_H */ diff --git a/arm_compute/graph/nodes/DepthToSpaceLayerNode.h b/arm_compute/graph/nodes/DepthToSpaceLayerNode.h new file mode 100644 index 0000000000..25e30e2c67 --- /dev/null +++ b/arm_compute/graph/nodes/DepthToSpaceLayerNode.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_GRAPH_DEPTH_TO_SPACE_LAYER_NODE_H +#define ARM_COMPUTE_GRAPH_DEPTH_TO_SPACE_LAYER_NODE_H +#include "arm_compute/graph/INode.h" +namespace arm_compute +{ +namespace graph +{ +/** DepthToSpace Layer node */ +class DepthToSpaceLayerNode final : public INode +{ +public: + /** Constructor + * + * @param[in] block_shape Block shape value + */ + DepthToSpaceLayerNode(int block_shape); + /** Block shape accessor + * + * @return Block shape + */ + int block_shape() const; + /** Computes depth to space output descriptor + * + * @warning block_shape must be greater than or equal to 2 + * + * @param[in] input_descriptor Input descriptor + * @param[in] block_shape Block shape value + * + * @return Output descriptor + */ + static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor, int block_shape); + + // Inherited overridden methods: + NodeType type() const override; + bool forward_descriptors() override; + TensorDescriptor configure_output(size_t idx) const override; + void accept(INodeVisitor &v) override; + +private: + int _block_shape; +}; +} // namespace graph +} // namespace arm_compute +#endif /* ARM_COMPUTE_GRAPH_DEPTH_TO_SPACE_LAYER_NODE_H */ diff --git a/arm_compute/graph/nodes/L2NormalizeLayerNode.h b/arm_compute/graph/nodes/L2NormalizeLayerNode.h new file mode 100644 index 0000000000..8edc5b0bf3 --- /dev/null +++ b/arm_compute/graph/nodes/L2NormalizeLayerNode.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_GRAPH_L2_NORMALIZE_LAYER_NODE_H +#define ARM_COMPUTE_GRAPH_L2_NORMALIZE_LAYER_NODE_H + +#include "arm_compute/graph/INode.h" + +namespace arm_compute +{ +namespace graph +{ +/** L2Normalize Layer node */ +class L2NormalizeLayerNode final : public INode +{ +public: + /** Constructor + * + */ + L2NormalizeLayerNode(); + + /** Constructor + * + * @param[in] axis Axis to perform normalization on + */ + L2NormalizeLayerNode(int axis); + + /** Constructor + * + * @param[in] axis Axis to perform normalization on + * @param[in] epsilon Lower bound value for the normalization + */ + L2NormalizeLayerNode(int axis, float epsilon); + + /** axis accessors + * + * @return Axis to perform normalization on + */ + int axis() const; + + /** epsilon accessors + * + * @return Lower bound value for the normalization + */ + float epsilon() const; + + // Inherited overridden methods: + NodeType type() const override; + bool forward_descriptors() override; + TensorDescriptor configure_output(size_t idx) const override; + void accept(INodeVisitor &v) override; + +private: + int _axis; + float _epsilon; +}; +} // namespace graph +} // namespace arm_compute +#endif /* ARM_COMPUTE_GRAPH_L2_NORMALIZE_LAYER_NODE_H */ diff --git a/arm_compute/graph/nodes/Nodes.h b/arm_compute/graph/nodes/Nodes.h index bf4ab87cda..a6c569af88 100644 --- a/arm_compute/graph/nodes/Nodes.h +++ b/arm_compute/graph/nodes/Nodes.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_GRAPH_NODES_H #include "arm_compute/graph/nodes/ActivationLayerNode.h" +#include "arm_compute/graph/nodes/ArgMinMaxLayerNode.h" #include "arm_compute/graph/nodes/BatchNormalizationLayerNode.h" #include "arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h" #include "arm_compute/graph/nodes/ChannelShuffleLayerNode.h" @@ -32,6 +33,7 @@ #include "arm_compute/graph/nodes/ConstNode.h" #include "arm_compute/graph/nodes/ConvolutionLayerNode.h" #include "arm_compute/graph/nodes/DeconvolutionLayerNode.h" +#include "arm_compute/graph/nodes/DepthToSpaceLayerNode.h" #include "arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h" #include "arm_compute/graph/nodes/DequantizationLayerNode.h" #include "arm_compute/graph/nodes/DetectionOutputLayerNode.h" @@ -44,6 +46,7 @@ #include "arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h" #include "arm_compute/graph/nodes/GenerateProposalsLayerNode.h" #include "arm_compute/graph/nodes/InputNode.h" +#include "arm_compute/graph/nodes/L2NormalizeLayerNode.h" #include "arm_compute/graph/nodes/NormalizationLayerNode.h" #include "arm_compute/graph/nodes/NormalizePlanarYUVLayerNode.h" #include "arm_compute/graph/nodes/OutputNode.h" @@ -55,6 +58,7 @@ #include "arm_compute/graph/nodes/PriorBoxLayerNode.h" #include "arm_compute/graph/nodes/QuantizationLayerNode.h" #include "arm_compute/graph/nodes/ROIAlignLayerNode.h" +#include "arm_compute/graph/nodes/ReductionLayerNode.h" #include "arm_compute/graph/nodes/ReorgLayerNode.h" #include "arm_compute/graph/nodes/ReshapeLayerNode.h" #include "arm_compute/graph/nodes/ResizeLayerNode.h" @@ -62,6 +66,7 @@ #include "arm_compute/graph/nodes/SoftmaxLayerNode.h" #include "arm_compute/graph/nodes/SplitLayerNode.h" #include "arm_compute/graph/nodes/StackLayerNode.h" +#include 
"arm_compute/graph/nodes/StridedSliceLayerNode.h" #include "arm_compute/graph/nodes/UpsampleLayerNode.h" #include "arm_compute/graph/nodes/YOLOLayerNode.h" diff --git a/arm_compute/graph/nodes/NodesFwd.h b/arm_compute/graph/nodes/NodesFwd.h index 9541f4babe..cf8fc4f37c 100644 --- a/arm_compute/graph/nodes/NodesFwd.h +++ b/arm_compute/graph/nodes/NodesFwd.h @@ -31,6 +31,7 @@ namespace graph // Forward declarations class INode; class ActivationLayerNode; +class ArgMinMaxLayerNode; class BatchNormalizationLayerNode; class BoundingBoxTransformLayerNode; class ChannelShuffleLayerNode; @@ -38,6 +39,7 @@ class ConcatenateLayerNode; class ConstNode; class ConvolutionLayerNode; class DeconvolutionLayerNode; +class DepthToSpaceLayerNode; class DepthwiseConvolutionLayerNode; class DequantizationLayerNode; class DetectionOutputLayerNode; @@ -50,6 +52,7 @@ class FusedConvolutionBatchNormalizationNode; class FusedDepthwiseConvolutionBatchNormalizationNode; class GenerateProposalsLayerNode; class InputNode; +class L2NormalizeLayerNode; class NormalizationLayerNode; class NormalizePlanarYUVLayerNode; class OutputNode; @@ -60,6 +63,7 @@ class PReluLayerNode; class PrintLayerNode; class PriorBoxLayerNode; class QuantizationLayerNode; +class ReductionLayerNode; class ReorgLayerNode; class ReshapeLayerNode; class ResizeLayerNode; @@ -68,6 +72,7 @@ class SoftmaxLayerNode; class SliceLayerNode; class SplitLayerNode; class StackLayerNode; +class StridedSliceLayerNode; class UpsampleLayerNode; class YOLOLayerNode; } // namespace graph diff --git a/arm_compute/graph/nodes/QuantizationLayerNode.h b/arm_compute/graph/nodes/QuantizationLayerNode.h index 94c718babb..e5d81afa0e 100644 --- a/arm_compute/graph/nodes/QuantizationLayerNode.h +++ b/arm_compute/graph/nodes/QuantizationLayerNode.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -40,6 +40,13 @@ class QuantizationLayerNode final : public INode */ QuantizationLayerNode(QuantizationInfo out_quant_info); + /** Constructor + * + * @param[in] out_quant_info Output quantization info + * @param[in] out_data_type Output data type + */ + QuantizationLayerNode(QuantizationInfo out_quant_info, DataType out_data_type); + // Inherited overridden methods: NodeType type() const override; bool forward_descriptors() override; @@ -50,6 +57,7 @@ class QuantizationLayerNode final : public INode private: QuantizationInfo _out_quant_info; + DataType _out_data_type; }; } // namespace graph } // namespace arm_compute diff --git a/arm_compute/graph/nodes/ReductionLayerNode.h b/arm_compute/graph/nodes/ReductionLayerNode.h new file mode 100644 index 0000000000..b8d295945c --- /dev/null +++ b/arm_compute/graph/nodes/ReductionLayerNode.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_GRAPH_REDUCTION_LAYER_NODE_H +#define ARM_COMPUTE_GRAPH_REDUCTION_LAYER_NODE_H + +#include "arm_compute/graph/INode.h" + +namespace arm_compute +{ +namespace graph +{ +/** Reduction Operation node */ +class ReductionLayerNode final : public INode +{ +public: + /** Default Constructor */ + ReductionLayerNode(ReductionOperation op, unsigned int axis, bool keep_dims = true); + /** op accessor + * + * @return op + */ + ReductionOperation op() const; + /** axis accessor + * + * @return axis + */ + unsigned int axis() const; + /** keep_dims accessor + * + * @return keep_dims + */ + bool keep_dims() const; + + // Inherited overridden methods: + NodeType type() const override; + bool forward_descriptors() override; + TensorDescriptor configure_output(size_t idx) const override; + void accept(INodeVisitor &v) override; + +private: + ReductionOperation _op; + unsigned int _axis; + bool _keep_dims; +}; +} // namespace graph +} // namespace arm_compute +#endif /* ARM_COMPUTE_GRAPH_REDUCTION_LAYER_NODE_H */ diff --git a/arm_compute/graph/nodes/StridedSliceLayerNode.h b/arm_compute/graph/nodes/StridedSliceLayerNode.h new file mode 100644 index 0000000000..6039f312b3 --- /dev/null +++ b/arm_compute/graph/nodes/StridedSliceLayerNode.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_GRAPH_STRIDED_SLICE_LAYER_NODE_H +#define ARM_COMPUTE_GRAPH_STRIDED_SLICE_LAYER_NODE_H + +#include "arm_compute/graph/INode.h" + +#include + +namespace arm_compute +{ +namespace graph +{ +/** Slice Layer node */ +class StridedSliceLayerNode final : public INode +{ +public: + /** Default Constructor + * + * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input). + * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input). 
+ * @param[in] strides The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input). + * @param[in] strided_slice_info Contains masks for the starts, ends and strides + */ + StridedSliceLayerNode(const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + StridedSliceLayerInfo strided_slice_info); + /** Computes slice layer output descriptor + * + * @param[in] input_descriptor Descriptor of the input tensor + * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input). + * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input). + * @param[in] strides The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input). + * @param[in] info Contains masks for the starts, ends and strides + * + * @return Output descriptor + */ + static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + StridedSliceLayerInfo info); + /** Start coordinates accessor + * + * @return Start coordinates of the dimensions + */ + Coordinates starts() const; + /** End coordinates accessor + * + * @return End coordinates of the dimensions + */ + Coordinates ends() const; + /** Strides vector accessor + * + * @return Strides of the dimensions + */ + BiStrides strides() const; + + /** Strided slice layer info accessor + * + * @return Strided slice layer info containing the begin, end and shrink-axis masks + */ + StridedSliceLayerInfo strided_slice_info() const; + + // Inherited overridden methods: + NodeType type() const override; + bool forward_descriptors() override; + TensorDescriptor configure_output(size_t idx) const override; + void accept(INodeVisitor &v) override; + +private: + Coordinates _starts; + Coordinates _ends; + BiStrides _strides; + StridedSliceLayerInfo _info; +}; +} // namespace graph +} // namespace arm_compute +#endif /* ARM_COMPUTE_GRAPH_STRIDED_SLICE_LAYER_NODE_H */ diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h index f909cc30c5..4e32831bc6 100644 --- a/arm_compute/runtime/CL/CLFunctions.h +++ b/arm_compute/runtime/CL/CLFunctions.h @@ -99,6 +99,9 @@ #include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h" #include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h" #include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h" +#include "arm_compute/runtime/CL/functions/CLLogicalAnd.h" +#include "arm_compute/runtime/CL/functions/CLLogicalNot.h" +#include "arm_compute/runtime/CL/functions/CLLogicalOr.h" #include "arm_compute/runtime/CL/functions/CLMagnitude.h" #include "arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h" #include "arm_compute/runtime/CL/functions/CLMeanStdDev.h" diff --git a/arm_compute/runtime/CL/CLTensorAllocator.h b/arm_compute/runtime/CL/CLTensorAllocator.h index c978dcd92c..067c391489 100644 --- a/arm_compute/runtime/CL/CLTensorAllocator.h +++ b/arm_compute/runtime/CL/CLTensorAllocator.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -108,7 +108,7 @@ class CLTensorAllocator : public ITensorAllocator void free() override; /** Import an existing memory as a tensor's backing memory * - * @warning memory should have been created under the same context that ACL uses. + * @warning memory should have been created under the same context that Compute Library uses.
* @warning memory is expected to be aligned with the device requirements. * @warning tensor shouldn't be memory managed. * @warning ownership of memory is not transferred. diff --git a/arm_compute/runtime/CL/CLTypes.h b/arm_compute/runtime/CL/CLTypes.h index cbc525308f..19095a5589 100644 --- a/arm_compute/runtime/CL/CLTypes.h +++ b/arm_compute/runtime/CL/CLTypes.h @@ -53,6 +53,7 @@ struct CLGEMMKernelSelectionParams unsigned int m{ 0 }; /**< Number of rows for the lhs matrix. Lhs matrix NOT transposed */ unsigned int n{ 0 }; /**< Number of columns for the rhs matrix. Rhs matrix NOT transposed */ unsigned int k{ 0 }; /**< Number of rows for the rhs matrix. Rhs matrix NOT transposed */ + unsigned int b{ 0 }; /**< Batch size */ bool is_rhs_constant{ false }; /**< True if the content of the rhs matrix is constant */ DataType data_type{ DataType::UNKNOWN }; /**< Data type */ }; diff --git a/arm_compute/runtime/CL/ICLOperator.h b/arm_compute/runtime/CL/ICLOperator.h index 526b7e93e9..38bcaf32f2 100644 --- a/arm_compute/runtime/CL/ICLOperator.h +++ b/arm_compute/runtime/CL/ICLOperator.h @@ -24,7 +24,8 @@ #ifndef ARM_COMPUTE_ICLOPERATOR_H #define ARM_COMPUTE_ICLOPERATOR_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/Types.h" + #include "arm_compute/runtime/IOperator.h" #include "arm_compute/runtime/IRuntimeContext.h" #include "arm_compute/runtime/Types.h" @@ -33,6 +34,7 @@ namespace arm_compute { +class ICLKernel; namespace experimental { /** Basic interface for functions which have a single async CL kernel */ diff --git a/arm_compute/runtime/CL/ICLSimpleFunction.h b/arm_compute/runtime/CL/ICLSimpleFunction.h index 4b1d5b1485..310bf770c4 100644 --- a/arm_compute/runtime/CL/ICLSimpleFunction.h +++ b/arm_compute/runtime/CL/ICLSimpleFunction.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,8 +24,6 @@ #ifndef ARM_COMPUTE_ICLSIMPLEFUNCTION_H #define ARM_COMPUTE_ICLSIMPLEFUNCTION_H -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" #include "arm_compute/runtime/IFunction.h" #include @@ -34,6 +32,8 @@ namespace arm_compute { // Forward declarations class CLRuntimeContext; +class CLFillBorderKernel; +class ICLKernel; /** Basic interface for functions which have a single OpenCL kernel */ class ICLSimpleFunction : public IFunction @@ -53,14 +53,16 @@ class ICLSimpleFunction : public IFunction ICLSimpleFunction &operator=(const ICLSimpleFunction &) = delete; /** Default move assignment operator */ ICLSimpleFunction &operator=(ICLSimpleFunction &&) = default; + /** Default destructor */ + ~ICLSimpleFunction(); // Inherited methods overridden: void run() override final; protected: - std::unique_ptr _kernel; /**< Kernel to run */ - CLFillBorderKernel _border_handler; /**< Kernel to handle borders */ - CLRuntimeContext *_ctx; /**< Context to use */ + std::unique_ptr _kernel; /**< Kernel to run */ + std::unique_ptr _border_handler; /**< Kernel to handle borders */ + CLRuntimeContext *_ctx; /**< Context to use */ }; } // namespace arm_compute #endif /*ARM_COMPUTE_ICLSIMPLEFUNCTION_H */ diff --git a/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h b/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h index b0f1948beb..86c8022b4f 100644 --- a/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h +++ b/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h @@ -28,12 +28,16 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to run @ref CLAbsoluteDifferenceKernel * * @note The tensor data types for the inputs must be U8 or S16. * @note The function calculates the absolute difference also when the 2 inputs have different tensor data types. 
+ * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class CLAbsoluteDifference : public ICLSimpleFunction { diff --git a/arm_compute/runtime/CL/functions/CLAccumulate.h b/arm_compute/runtime/CL/functions/CLAccumulate.h index 9dbf13b873..f78ce0e149 100644 --- a/arm_compute/runtime/CL/functions/CLAccumulate.h +++ b/arm_compute/runtime/CL/functions/CLAccumulate.h @@ -30,9 +30,14 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; -/** Basic function to run @ref CLAccumulateKernel */ +/** Basic function to run @ref CLAccumulateKernel + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * +*/ class CLAccumulate : public ICLSimpleFunction { public: @@ -51,7 +56,11 @@ class CLAccumulate : public ICLSimpleFunction void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *accum); }; -/** Basic function to run @ref CLAccumulateWeightedKernel */ +/** Basic function to run @ref CLAccumulateWeightedKernel + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * +*/ class CLAccumulateWeighted : public ICLSimpleFunction { public: @@ -72,7 +81,11 @@ class CLAccumulateWeighted : public ICLSimpleFunction void configure(const CLCompileContext &compile_context, const ICLTensor *input, float alpha, ICLTensor *accum); }; -/** Basic function to run @ref CLAccumulateSquaredKernel */ +/** Basic function to run @ref CLAccumulateSquaredKernel + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * +*/ class CLAccumulateSquared : public ICLSimpleFunction { public: diff --git a/arm_compute/runtime/CL/functions/CLActivationLayer.h b/arm_compute/runtime/CL/functions/CLActivationLayer.h index 632487c78d..dc2cb62b71 100644 --- a/arm_compute/runtime/CL/functions/CLActivationLayer.h +++ b/arm_compute/runtime/CL/functions/CLActivationLayer.h @@ -31,7 +31,9 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLActivationLayerKernel * diff --git a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h index dc0c37e860..c254284cd7 100644 --- a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h +++ b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h @@ -24,7 +24,6 @@ #ifndef ARM_COMPUTE_CLARGMINMAXLAYER_H #define ARM_COMPUTE_CLARGMINMAXLAYER_H -#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLReshapeLayer.h" @@ -36,6 +35,7 @@ namespace arm_compute { class ITensorInfo; class ICLTensor; +class CLArgMinMaxLayerKernel; /** Function to calculate the index of the minimum or maximum values in a * tensor based on an axis. @@ -53,6 +53,16 @@ class CLArgMinMaxLayer : public IFunction * @param[in] memory_manager (Optional) Memory manager. 
*/ CLArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied */ + CLArgMinMaxLayer(const CLArgMinMaxLayer &) = delete; + /** Prevent instances of this class from being copied */ + CLArgMinMaxLayer &operator=(const CLArgMinMaxLayer &) = delete; + /** Prevent instances of this class to be moved */ + CLArgMinMaxLayer(CLArgMinMaxLayer &&) = delete; + /** Prevent instances of this class to be moved */ + CLArgMinMaxLayer &operator=(CLArgMinMaxLayer &&) = delete; + /** Default destructor */ + ~CLArgMinMaxLayer(); /** Set the input and output tensors. * * @param[in] input Input source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32. @@ -85,13 +95,13 @@ class CLArgMinMaxLayer : public IFunction void run() override; private: - MemoryGroup _memory_group; - std::vector<CLTensor> _results_vector; - CLTensor _not_reshaped_output; - std::vector<CLArgMinMaxLayerKernel> _reduction_kernels_vector; - CLReshapeLayer _reshape; - unsigned int _num_of_stages; - unsigned int _reduction_axis; + MemoryGroup _memory_group; + std::vector<CLTensor> _results_vector; + CLTensor _not_reshaped_output; + std::vector<std::unique_ptr<CLArgMinMaxLayerKernel>> _reduction_kernels_vector; + CLReshapeLayer _reshape; + unsigned int _num_of_stages; + unsigned int _reduction_axis; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLARGMINMAXLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h index c22991da7c..c8acf9fc6b 100644 --- a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h @@ -26,12 +26,16 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h" #include "arm_compute/core/Types.h" +#include <memory> + namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; +class CLBatchNormalizationLayerKernel; /** Basic function to run @ref CLNormalizationLayerKernel and simulate a batch normalization layer. * @@ -44,6 +48,16 @@ class CLBatchNormalizationLayer : public IFunction public: /** Default constructor */ CLBatchNormalizationLayer(); + /** Prevent instances of this class from being copied */ + CLBatchNormalizationLayer(const CLBatchNormalizationLayer &) = delete; + /** Prevent instances of this class from being copied */ + CLBatchNormalizationLayer &operator=(const CLBatchNormalizationLayer &) = delete; + /** Prevent instances of this class to be moved */ + CLBatchNormalizationLayer(CLBatchNormalizationLayer &&) = delete; + /** Prevent instances of this class to be moved */ + CLBatchNormalizationLayer &operator=(CLBatchNormalizationLayer &&) = delete; + /** Default destructor */ + ~CLBatchNormalizationLayer(); /** Set the input and output tensors.
* * @note If the output tensor is a nullptr or is equal to the input, the batch normalization function will be performed in-place @@ -104,7 +118,7 @@ class CLBatchNormalizationLayer : public IFunction void run() override; private: - CLBatchNormalizationLayerKernel _norm_kernel; /**< BatchNormalization layer kernel to run */ + std::unique_ptr _norm_kernel; /**< BatchNormalization layer kernel to run */ }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CLBATCHNORMALIZATIONLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h b/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h index ba57921cc2..bdb58531d0 100644 --- a/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h +++ b/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h @@ -26,11 +26,15 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h" #include "arm_compute/core/Types.h" +#include + namespace arm_compute { +class CLCompileContext; +class ITensorInfo; +class CLBatchToSpaceLayerKernel; class ICLTensor; /** Basic function to run @ref CLBatchToSpaceLayerKernel. */ @@ -39,6 +43,16 @@ class CLBatchToSpaceLayer : public IFunction public: /** Default constructor */ CLBatchToSpaceLayer(); + /** Prevent instances of this class from being copied */ + CLBatchToSpaceLayer(const CLBatchToSpaceLayer &) = delete; + /** Prevent instances of this class from being copied */ + CLBatchToSpaceLayer &operator=(const CLBatchToSpaceLayer &) = delete; + /** Prevent instances of this class to be moved */ + CLBatchToSpaceLayer(CLBatchToSpaceLayer &&) = delete; + /** Prevent instances of this class to be moved */ + CLBatchToSpaceLayer &operator=(CLBatchToSpaceLayer &&) = delete; + /** Default destructor */ + ~CLBatchToSpaceLayer(); /** Set the input and output tensors. * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -95,7 +109,7 @@ class CLBatchToSpaceLayer : public IFunction void run() override; private: - CLBatchToSpaceLayerKernel _batch_to_space_kernel; /**< CLBatchToSpaceLayerKernel to run */ + std::unique_ptr _batch_to_space_kernel; /**< CLBatchToSpaceLayerKernel to run */ }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CLBATCHTOSPACELAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLBitwiseAnd.h b/arm_compute/runtime/CL/functions/CLBitwiseAnd.h index 3c28938807..bf5993f4b0 100644 --- a/arm_compute/runtime/CL/functions/CLBitwiseAnd.h +++ b/arm_compute/runtime/CL/functions/CLBitwiseAnd.h @@ -28,6 +28,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to run @ref CLBitwiseAndKernel. diff --git a/arm_compute/runtime/CL/functions/CLBitwiseNot.h b/arm_compute/runtime/CL/functions/CLBitwiseNot.h index 4c21d5647f..1d8531a176 100644 --- a/arm_compute/runtime/CL/functions/CLBitwiseNot.h +++ b/arm_compute/runtime/CL/functions/CLBitwiseNot.h @@ -28,6 +28,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to run @ref CLBitwiseNotKernel. diff --git a/arm_compute/runtime/CL/functions/CLBitwiseOr.h b/arm_compute/runtime/CL/functions/CLBitwiseOr.h index 8a481737e3..7876cbf196 100644 --- a/arm_compute/runtime/CL/functions/CLBitwiseOr.h +++ b/arm_compute/runtime/CL/functions/CLBitwiseOr.h @@ -28,6 +28,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to run @ref CLBitwiseOrKernel. 
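A pattern worth noting across these headers: the added `class CLCompileContext;` forward declarations work because the type is only named by reference or pointer in the declarations, so the defining header can move into the .cpp files. A toy single-file illustration (Widget stands in for CLCompileContext; not ACL code):

```cpp
// Toy illustration of the include-hygiene pattern. A forward declaration is
// enough while the type only appears by reference or pointer; the complete
// definition is required only at the point of use (normally in the .cpp,
// simulated below in one translation unit).
class Widget; // forward declaration, no #include required

class Frobnicator
{
public:
    void configure(const Widget &ctx); // fine: Widget appears by reference only
};

// Equivalent of including widget.h in the implementation file:
class Widget
{
public:
    int value() const
    {
        return 42;
    }
};

void Frobnicator::configure(const Widget &ctx)
{
    static_cast<void>(ctx.value()); // complete type required here, and only here
}

int main()
{
    Widget      w;
    Frobnicator f;
    f.configure(w);
    return 0;
}
```

The payoff is faster rebuilds and smaller public include graphs, which is why the same substitution repeats file after file in this patch.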
diff --git a/arm_compute/runtime/CL/functions/CLBitwiseXor.h b/arm_compute/runtime/CL/functions/CLBitwiseXor.h index 6928e59d38..4f054062cd 100644 --- a/arm_compute/runtime/CL/functions/CLBitwiseXor.h +++ b/arm_compute/runtime/CL/functions/CLBitwiseXor.h @@ -28,6 +28,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to run @ref CLBitwiseXorKernel. diff --git a/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h b/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h index 5e4e89071b..d6409106da 100644 --- a/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h +++ b/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h @@ -24,12 +24,16 @@ #ifndef ARM_COMPUTE_CLBOUNDINGBOXTRANSOFORM_H #define ARM_COMPUTE_CLBOUNDINGBOXTRANSOFORM_H -#include "arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h" +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; +class CLBoundingBoxTransformKernel; +class BoundingBoxTransformInfo; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLBoundingBoxTransformKernel. * diff --git a/arm_compute/runtime/CL/functions/CLBox3x3.h b/arm_compute/runtime/CL/functions/CLBox3x3.h index 2d2aa4705c..63c5d3f897 100644 --- a/arm_compute/runtime/CL/functions/CLBox3x3.h +++ b/arm_compute/runtime/CL/functions/CLBox3x3.h @@ -31,6 +31,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; /** Basic function to execute box filter 3x3. This function calls the following OpenCL kernels: @@ -38,6 +39,8 @@ class ICLTensor; * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE) * -# @ref CLBox3x3Kernel * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class CLBox3x3 : public ICLSimpleFunction { diff --git a/arm_compute/runtime/CL/functions/CLCannyEdge.h b/arm_compute/runtime/CL/functions/CLCannyEdge.h index f9d9f8f66a..1c48d690a5 100644 --- a/arm_compute/runtime/CL/functions/CLCannyEdge.h +++ b/arm_compute/runtime/CL/functions/CLCannyEdge.h @@ -26,8 +26,6 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/CL/kernels/CLCannyEdgeKernel.h" -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" @@ -36,6 +34,11 @@ namespace arm_compute { +class CLCompileContext; +class CLFillBorderKernel; +class CLGradientKernel; +class CLEdgeNonMaxSuppressionKernel; +class CLEdgeTraceKernel; class ICLTensor; /** Basic function to execute canny edge on OpenCL. This function calls the following OpenCL kernels and functions: @@ -46,6 +49,8 @@ class ICLTensor; * -# @ref CLEdgeNonMaxSuppressionKernel * -# @ref CLEdgeTraceKernel * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class CLCannyEdge : public IFunction { @@ -56,6 +61,8 @@ class CLCannyEdge : public IFunction CLCannyEdge(const CLCannyEdge &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ CLCannyEdge &operator=(const CLCannyEdge &) = delete; + /** Default destructor */ + ~CLCannyEdge(); /** Initialise the function's source, destination, thresholds, gradient size, normalization type and border mode. * * @param[in,out] input Source tensor. Data types supported: U8. 
(Written to only for border_mode != UNDEFINED) @@ -88,20 +95,20 @@ class CLCannyEdge : public IFunction virtual void run() override; private: - MemoryGroup _memory_group; /**< Function's memory group */ - std::unique_ptr _sobel; /**< Pointer to Sobel kernel. */ - CLGradientKernel _gradient; /**< Gradient kernel. */ - CLFillBorderKernel _border_mag_gradient; /**< Fill border on magnitude tensor kernel */ - CLEdgeNonMaxSuppressionKernel _non_max_suppr; /**< Non-Maxima suppression kernel. */ - CLEdgeTraceKernel _edge_trace; /**< Edge tracing kernel. */ - CLImage _gx; /**< Source tensor - Gx component. */ - CLImage _gy; /**< Source tensor - Gy component. */ - CLImage _mag; /**< Source tensor - Magnitude. */ - CLImage _phase; /**< Source tensor - Phase. */ - CLImage _nonmax; /**< Source tensor - Non-Maxima suppressed. */ - CLImage _visited, _recorded, _l1_list_counter, _l1_stack; /**< Temporary tensors */ - ICLTensor *_output; /**< Output tensor provided by the user. */ + MemoryGroup _memory_group; /**< Function's memory group */ + std::unique_ptr _sobel; /**< Pointer to Sobel kernel. */ + std::unique_ptr _gradient; /**< Gradient kernel. */ + std::unique_ptr _border_mag_gradient; /**< Fill border on magnitude tensor kernel */ + std::unique_ptr _non_max_suppr; /**< Non-Maxima suppression kernel. */ + std::unique_ptr _edge_trace; /**< Edge tracing kernel. */ + CLImage _gx; /**< Source tensor - Gx component. */ + CLImage _gy; /**< Source tensor - Gy component. */ + CLImage _mag; /**< Source tensor - Magnitude. */ + CLImage _phase; /**< Source tensor - Phase. */ + CLImage _nonmax; /**< Source tensor - Non-Maxima suppressed. */ + CLImage _visited, _recorded, _l1_list_counter, _l1_stack; /**< Temporary tensors */ + ICLTensor *_output; /**< Output tensor provided by the user. */ }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CLCANNYEDGE_H */ diff --git a/arm_compute/runtime/CL/functions/CLCast.h b/arm_compute/runtime/CL/functions/CLCast.h index 592368d135..bd333d4e72 100644 --- a/arm_compute/runtime/CL/functions/CLCast.h +++ b/arm_compute/runtime/CL/functions/CLCast.h @@ -31,7 +31,9 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLDepthConvertLayerKernel. */ class CLCast : public ICLSimpleFunction diff --git a/arm_compute/runtime/CL/functions/CLChannelCombine.h b/arm_compute/runtime/CL/functions/CLChannelCombine.h index 4e3d10cc10..2a36d3f742 100644 --- a/arm_compute/runtime/CL/functions/CLChannelCombine.h +++ b/arm_compute/runtime/CL/functions/CLChannelCombine.h @@ -28,11 +28,16 @@ namespace arm_compute { +class CLCompileContext; class ICLMultiImage; class ICLTensor; using ICLImage = ICLTensor; -/** Basic function to run @ref CLChannelCombineKernel to perform channel combination. */ +/** Basic function to run @ref CLChannelCombineKernel to perform channel combination. + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * +*/ class CLChannelCombine : public ICLSimpleFunction { public: diff --git a/arm_compute/runtime/CL/functions/CLChannelExtract.h b/arm_compute/runtime/CL/functions/CLChannelExtract.h index cf042b4519..6cd24648ba 100644 --- a/arm_compute/runtime/CL/functions/CLChannelExtract.h +++ b/arm_compute/runtime/CL/functions/CLChannelExtract.h @@ -29,11 +29,16 @@ namespace arm_compute { +class CLCompileContext; class ICLMultiImage; class ICLTensor; using ICLImage = ICLTensor; -/** Basic function to run @ref CLChannelExtractKernel to perform channel extraction. 
*/ +/** Basic function to run @ref CLChannelExtractKernel to perform channel extraction. + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * +*/ class CLChannelExtract : public ICLSimpleFunction { public: diff --git a/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h b/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h index e0bb3d01c9..54cf59f59a 100644 --- a/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h +++ b/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h @@ -24,11 +24,14 @@ #ifndef ARM_COMPUTE_CLCHANNELSHUFFLELAYER_H #define ARM_COMPUTE_CLCHANNELSHUFFLELAYER_H +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLChannelShuffleLayerKernel * diff --git a/arm_compute/runtime/CL/functions/CLColorConvert.h b/arm_compute/runtime/CL/functions/CLColorConvert.h index e4017c2686..f30621e911 100644 --- a/arm_compute/runtime/CL/functions/CLColorConvert.h +++ b/arm_compute/runtime/CL/functions/CLColorConvert.h @@ -28,6 +28,7 @@ namespace arm_compute { +class CLCompileContext; class ICLMultiImage; class ICLTensor; using ICLImage = ICLTensor; @@ -35,6 +36,9 @@ using ICLImage = ICLTensor; /** Basic function to run @ref CLColorConvertKernel * * @note The function performs color convert between images. + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class CLColorConvert : public ICLSimpleFunction { diff --git a/arm_compute/runtime/CL/functions/CLComparison.h b/arm_compute/runtime/CL/functions/CLComparison.h index c6d61e45f2..8cc3e96ec5 100644 --- a/arm_compute/runtime/CL/functions/CLComparison.h +++ b/arm_compute/runtime/CL/functions/CLComparison.h @@ -30,7 +30,9 @@ namespace arm_compute { // Forward declarations +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLComparisonKernel */ class CLComparison : public ICLSimpleFunction diff --git a/arm_compute/runtime/CL/functions/CLComputeAllAnchors.h b/arm_compute/runtime/CL/functions/CLComputeAllAnchors.h index a2f1a4eb66..d6a2ab423d 100644 --- a/arm_compute/runtime/CL/functions/CLComputeAllAnchors.h +++ b/arm_compute/runtime/CL/functions/CLComputeAllAnchors.h @@ -24,12 +24,15 @@ #ifndef ARM_COMPUTE_CLCOMPUTEALLANCHORS_H #define ARM_COMPUTE_CLCOMPUTEALLANCHORS_H -#include "arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h" +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; +class ComputeAnchorsInfo; /** Basic function to run @ref CLComputeAllAnchorsKernel. 
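These removals are announced through Doxygen @deprecated tags only, which documentation tools surface but compilers ignore. If build-time diagnostics were also wanted, C++14's [[deprecated]] attribute could complement the tags; a sketch, not part of this patch (OldFilter is a hypothetical stand-in):

```cpp
// Sketch only: pairing a Doxygen @deprecated tag with the C++14 [[deprecated]]
// attribute would also warn at every call site of the retiring CV functions.
class [[deprecated("Scheduled for removal in 21.05; no replacement planned")]] OldFilter
{
public:
    void configure()
    {
    }
};

int main()
{
    OldFilter f; // typically triggers a -Wdeprecated-declarations warning here
    f.configure();
    return 0;
}
```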
* diff --git a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h index f535c8ea97..5e7003a112 100644 --- a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h +++ b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h @@ -27,7 +27,6 @@ #include "arm_compute/runtime/CL/ICLOperator.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" #include @@ -36,7 +35,9 @@ namespace arm_compute { // Forward declarations +class CLCompileContext; class ICLTensor; +class ICLKernel; class ITensorInfo; class Status; diff --git a/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h b/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h index 9298be2e53..75a3d3213e 100644 --- a/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h +++ b/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h @@ -24,14 +24,17 @@ #ifndef ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTS_H #define ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTS_H -#include "arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" #include "arm_compute/runtime/ITransformWeights.h" namespace arm_compute { +class CLCompileContext; +class CLConvertFullyConnectedWeightsKernel; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLConvertFullyConnectedWeightsKernel. */ class CLConvertFullyConnectedWeights : public ICLSimpleFunction diff --git a/arm_compute/runtime/CL/functions/CLConvolution.h b/arm_compute/runtime/CL/functions/CLConvolution.h index c06ad0d969..44346767f3 100644 --- a/arm_compute/runtime/CL/functions/CLConvolution.h +++ b/arm_compute/runtime/CL/functions/CLConvolution.h @@ -24,8 +24,6 @@ #ifndef ARM_COMPUTE_CLCONVOLUTION_H #define ARM_COMPUTE_CLCONVOLUTION_H -#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h" -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" @@ -38,6 +36,13 @@ namespace arm_compute { +template +class CLConvolutionKernel; +template +class CLSeparableConvolutionHorKernel; +template +class CLSeparableConvolutionVertKernel; +class CLFillBorderKernel; class ICLTensor; /** Basic function to execute convolution of size 3x3. This function calls the following OpenCL kernels: @@ -45,6 +50,8 @@ class ICLTensor; * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE) * -# @ref CLConvolution3x3Kernel * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class CLConvolution3x3 : public ICLSimpleFunction { @@ -78,6 +85,8 @@ class CLConvolution3x3 : public ICLSimpleFunction * -# @ref CLConvolutionKernel or
* @ref CLSeparableConvolutionHorKernel and @ref CLSeparableConvolutionVertKernel (if convolution matrix is separable) * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ template class CLConvolutionSquare : public IFunction @@ -85,6 +94,16 @@ class CLConvolutionSquare : public IFunction public: /** Default constructor */ CLConvolutionSquare(std::shared_ptr memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLConvolutionSquare(const CLConvolutionSquare &) = delete; + /** Default move constructor */ + CLConvolutionSquare(CLConvolutionSquare &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLConvolutionSquare &operator=(const CLConvolutionSquare &) = delete; + /** Default move assignment operator */ + CLConvolutionSquare &operator=(CLConvolutionSquare &&) = default; + /** Default destructor */ + ~CLConvolutionSquare(); /** Initialize the function's source, destination, conv and border_mode. * * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED) @@ -111,13 +130,13 @@ class CLConvolutionSquare : public IFunction void run() override; private: - MemoryGroup _memory_group; /**< Function's memory group */ - CLTensor _tmp; /**< temporary buffer for output of horizontal pass */ - bool _is_separable; /**< true if the convolution can be separated */ - CLSeparableConvolutionHorKernel _kernel_hor; /**< kernel for horizontal pass of separated convolution */ - CLSeparableConvolutionVertKernel _kernel_vert; /**< kernel for vertical pass of separated convolution */ - CLConvolutionKernel _kernel; /**< kernel for non-separated convolution **/ - CLFillBorderKernel _border_handler; /**< kernel for border handling */ + MemoryGroup _memory_group; /**< Function's memory group */ + CLTensor _tmp; /**< temporary buffer for output of horizontal pass */ + bool _is_separable; /**< true if the convolution can be separated */ + std::unique_ptr> _kernel_hor; /**< kernel for horizontal pass of separated convolution */ + std::unique_ptr> _kernel_vert; /**< kernel for vertical pass of separated convolution */ + std::unique_ptr> _kernel; /**< kernel for non-separated convolution **/ + std::unique_ptr _border_handler; /**< kernel for border handling */ }; /** Basic function to run 5x5 convolution. */ @@ -133,6 +152,9 @@ using CLConvolution9x9 = CLConvolutionSquare<9>; * -# @ref CLConvolutionRectangleKernel or
 *
 * @note Convolution rectangle should have dimensions of 3, 5, 7, 9
+ *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
 */
class CLConvolutionRectangle : public ICLSimpleFunction
{
diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
index ac36523682..d1de721193 100644
--- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
@@ -73,6 +73,16 @@ class CLConvolutionLayer : public IFunction
 public:
     /** Default constructor */
     CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Default Destructor */
+    ~CLConvolutionLayer();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLConvolutionLayer(const CLConvolutionLayer &) = delete;
+    /** Default move constructor */
+    CLConvolutionLayer(CLConvolutionLayer &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLConvolutionLayer &operator=(const CLConvolutionLayer &) = delete;
+    /** Default move assignment operator */
+    CLConvolutionLayer &operator=(CLConvolutionLayer &&) = default;
     /** Set the input and output tensors.
      *
      * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
diff --git a/arm_compute/runtime/CL/functions/CLCopy.h b/arm_compute/runtime/CL/functions/CLCopy.h
index c20d75eea8..f1a091df84 100644
--- a/arm_compute/runtime/CL/functions/CLCopy.h
+++ b/arm_compute/runtime/CL/functions/CLCopy.h
@@ -31,7 +31,9 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
+class ITensorInfo;
 
 class CLCopy : public ICLSimpleFunction
 {
diff --git a/arm_compute/runtime/CL/functions/CLCropResize.h b/arm_compute/runtime/CL/functions/CLCropResize.h
index e940928b90..e781cfe61f 100644
--- a/arm_compute/runtime/CL/functions/CLCropResize.h
+++ b/arm_compute/runtime/CL/functions/CLCropResize.h
@@ -25,9 +25,7 @@
 #define ARM_COMPUTE_CL_CROP_RESIZE_H
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
-#include "arm_compute/core/CL/kernels/CLCropKernel.h"
-#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
+
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLScale.h"
 
@@ -37,7 +35,11 @@
 namespace arm_compute
 {
 // Forward Declarations
+class CLCompileContext;
+class CLCopyKernel;
+class CLCropKernel;
 class ITensor;
+class ITensorInfo;
 
 /** Function to perform cropping and resizing */
 class CLCropResize : public IFunction
@@ -54,7 +56,7 @@ class CLCropResize : public IFunction
     /** Allow instances of this class to be moved */
     CLCropResize &operator=(CLCropResize &&) = default;
     /** Default destructor */
-    virtual ~CLCropResize() = default;
+    ~CLCropResize();
 
     /** Configure kernel
      *
diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
index 19a44f7b93..3ebc858d32 100644
--- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
+++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
@@ -24,17 +24,20 @@
 #ifndef ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLE_H
 #define ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLE_H
 
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
-#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
 
 namespace arm_compute
 {
 // Forward declarations
+class CLDeconvolutionLayerUpsampleKernel;
+class CLCompileContext;
+class CLMemsetKernel;
 class ICLTensor;
+class ITensorInfo;
 
 /** Basic function to execute deconvolution upsample on OpenCL. This function calls the following OpenCL kernels and functions:
  *
@@ -55,7 +58,7 @@ class CLDeconvolutionLayerUpsample : public IFunction
     /** Allow instances of this class to be moved */
     CLDeconvolutionLayerUpsample &operator=(CLDeconvolutionLayerUpsample &&) = default;
     /** Default destructor */
-    virtual ~CLDeconvolutionLayerUpsample() = default;
+    ~CLDeconvolutionLayerUpsample();
 
     /** Initialize the function's source, destination, interpolation type and border_mode.
      *
@@ -86,9 +89,9 @@ class CLDeconvolutionLayerUpsample : public IFunction
     void run() override;
 
 private:
-    CLDeconvolutionLayerUpsampleKernel _upsample;
-    CLMemsetKernel                     _memset;
-    ICLTensor                         *_output;
+    std::unique_ptr<CLDeconvolutionLayerUpsampleKernel> _upsample;
+    std::unique_ptr<CLMemsetKernel>                     _memset;
+    ICLTensor                                          *_output;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLE_H */
diff --git a/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h b/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h
index d125584c97..b0f297aec5 100644
--- a/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h
@@ -31,7 +31,9 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
+class ITensorInfo;
 
 /** Basic function to run @ref CLDepthConvertLayerKernel. */
 class CLDepthConvertLayer : public ICLSimpleFunction
diff --git a/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h b/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h
index 5e197cb9b8..a0aa288dbf 100644
--- a/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h
@@ -29,7 +29,9 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
+class ITensorInfo;
 
 /** Basic function to run @ref CLDepthToSpaceLayerKernel.
 */
 class CLDepthToSpaceLayer : public ICLSimpleFunction
diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
index 570b6ca38f..8e594bc09f 100644
--- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
@@ -24,12 +24,6 @@
 #ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTION_H
 #define ARM_COMPUTE_CLDEPTHWISECONVOLUTION_H
 
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h"
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLPermute.h"
@@ -38,6 +32,11 @@
 namespace arm_compute
 {
+class CLCompileContext;
+class CLFillBorderKernel;
+class CLDepthwiseConvolutionLayerNativeKernel;
+class CLDepthwiseConvolutionLayerReshapeWeightsKernel;
+class ICLDepthwiseConvolutionLayer3x3Kernel;
 class ICLTensor;
 
 /** Function to execute a depthwise convolution
@@ -55,13 +54,15 @@ class CLDepthwiseConvolutionLayer : public IFunction
     CLDepthwiseConvolutionLayer &operator=(const CLDepthwiseConvolutionLayer &) = delete;
     /** Default move assignment operator */
     CLDepthwiseConvolutionLayer &operator=(CLDepthwiseConvolutionLayer &&) = default;
+    /** Default destructor */
+    ~CLDepthwiseConvolutionLayer();
     /** Initialize the function's source, destination, weights and convolution information.
      *
-     * @param[in, out] input Source tensor. Data type supported: QASYMM8/FP16/FP32. Data layout supported: NHWC, NCHW
+     * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP16/FP32. Data layout supported: NHWC, NCHW
      * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
-     *            Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
+     *            Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
      * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
-     *            Data type supported: Same as @p input, S32 when input is QASYMM8.
+     *            Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
      * @param[out] output Destination tensor. Data type supported: same as @p input.
      * @param[in] conv_info Padding and stride information to use for the convolution.
      * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
@@ -73,11 +74,11 @@ class CLDepthwiseConvolutionLayer : public IFunction
     /** Initialize the function's source, destination, weights and convolution information.
      *
      * @param[in] compile_context The compile context to be used.
-     * @param[in, out] input Source tensor. Data type supported: QASYMM8/FP16/FP32. Data layout supported: NHWC, NCHW
+     * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP16/FP32. Data layout supported: NHWC, NCHW
      * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
-     *            Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
+     *            Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
      * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
-     *            Data type supported: Same as @p input, S32 when input is QASYMM8.
+     *            Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
      * @param[out] output Destination tensor. Data type supported: same as @p input.
      * @param[in] conv_info Padding and stride information to use for the convolution.
     * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
@@ -89,11 +90,11 @@ class CLDepthwiseConvolutionLayer : public IFunction
     /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer
      *
-     * @param[in] input Source tensor info. Data type supported: QASYMM8/FP16/FP32. Data layout supported: NHWC, NCHW
+     * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/FP16/FP32. Data layout supported: NHWC, NCHW
      * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
-     *            Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
+     *            Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
      * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
-     *            Data type supported: Same as @p input, S32 when input is QASYMM8.
+     *            Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
      * @param[in] output Destination tensor. Data type supported: same as @p input.
      * @param[in] conv_info Padding and stride information to use for the convolution.
      * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
@@ -211,25 +212,25 @@ class CLDepthwiseConvolutionLayer : public IFunction
     };
 
 private:
-    MemoryGroup _memory_group;
-    std::unique_ptr<ICLDepthwiseConvolutionLayer3x3Kernel> _kernel;
-    CLFillBorderKernel _border_handler;
-    CLPermute _permute_input_to_nchw;
-    CLPermute _permute_weights_to_nchw;
-    CLPermute _permute_output_to_nhwc;
-    CLDepthwiseConvolutionLayerReshapeWeightsKernel _reshape_weights;
-    CLTensor _permuted_input;
-    CLTensor _permuted_weights;
-    CLTensor _permuted_output;
-    CLTensor _output_multipliers;
-    CLTensor _output_shifts;
-    const ITensor *_original_weights;
-    const ITensor *_input;
-    const ITensor *_output;
-    bool _needs_permute;
-    bool _needs_weights_reshape;
-    bool _is_prepared;
-    bool _is_quantized;
+    MemoryGroup                                                      _memory_group;
+    std::unique_ptr<ICLDepthwiseConvolutionLayer3x3Kernel>           _kernel;
+    std::unique_ptr<CLFillBorderKernel>                              _border_handler;
+    CLPermute                                                        _permute_input_to_nchw;
+    CLPermute                                                        _permute_weights_to_nchw;
+    CLPermute                                                        _permute_output_to_nhwc;
+    std::unique_ptr<CLDepthwiseConvolutionLayerReshapeWeightsKernel> _reshape_weights;
+    CLTensor                                                         _permuted_input;
+    CLTensor                                                         _permuted_weights;
+    CLTensor                                                         _permuted_output;
+    CLTensor                                                         _output_multipliers;
+    CLTensor                                                         _output_shifts;
+    const ITensor                                                   *_original_weights;
+    const ITensor                                                   *_input;
+    const ITensor                                                   *_output;
+    bool                                                             _needs_permute;
+    bool                                                             _needs_weights_reshape;
+    bool                                                             _is_prepared;
+    bool                                                             _is_quantized;
 };
 
 /** Basic function to execute a generic depthwise convolution. This function calls the following OpenCL kernels:
@@ -313,10 +314,10 @@ class CLDepthwiseConvolutionLayer : public IFunction
 private:
     MemoryGroup _memory_group;
 
-    CLDepthwiseConvolutionLayerNativeKernel _dwc_native_kernel;
-    CLPermute                               _permute_input_to_nhwc;
-    CLPermute                               _permute_weights_to_nhwc;
-    CLPermute                               _permute_output_to_nchw;
+    std::unique_ptr<CLDepthwiseConvolutionLayerNativeKernel> _dwc_native_kernel;
+    CLPermute                                                _permute_input_to_nhwc;
+    CLPermute                                                _permute_weights_to_nhwc;
+    CLPermute                                                _permute_output_to_nchw;
 
     CLTensor _permuted_input;
     CLTensor _permuted_weights;
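
// A hedged usage sketch for the depthwise convolution function above; the
// shapes, data type and pad/stride values are illustrative assumptions, not
// taken from this patch:
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"

using namespace arm_compute;

void run_depthwise_example()
{
    CLScheduler::get().default_init(); // create OpenCL context and queue

    CLTensor input, weights, biases, output;
    input.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));  // [W, H, IFM]
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32));  // [kernel_x, kernel_y, IFM]
    biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));           // [IFM]
    output.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32)); // same spatial size with pad 1

    CLDepthwiseConvolutionLayer dwc;
    dwc.configure(&input, &weights, &biases, &output, PadStrideInfo(1, 1, 1, 1)); // stride 1, pad 1

    input.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill input/weights/biases ...
    dwc.run();
    CLScheduler::get().sync();
}
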
diff --git a/arm_compute/runtime/CL/functions/CLDequantizationLayer.h b/arm_compute/runtime/CL/functions/CLDequantizationLayer.h
index 88ed915421..b2cf3356f4 100644
--- a/arm_compute/runtime/CL/functions/CLDequantizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDequantizationLayer.h
@@ -31,7 +31,9 @@
 namespace arm_compute
 {
 // Forward declarations
+class CLCompileContext;
 class ICLTensor;
+class ITensorInfo;
 
 /** Basic function to run @ref CLDequantizationLayerKernel that dequantizes an input tensor */
 class CLDequantizationLayer : public ICLSimpleFunction
diff --git a/arm_compute/runtime/CL/functions/CLDerivative.h b/arm_compute/runtime/CL/functions/CLDerivative.h
index 1aba6a9f6c..8918dac0ea 100644
--- a/arm_compute/runtime/CL/functions/CLDerivative.h
+++ b/arm_compute/runtime/CL/functions/CLDerivative.h
@@ -31,6 +31,7 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
 
 /** Basic function to execute first order derivative operator. This function calls the following CL kernels:
@@ -38,6 +39,8 @@ class ICLTensor;
 * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
 * -# @ref CLDerivativeKernel
 *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
 */
 class CLDerivative : public ICLSimpleFunction
 {
diff --git a/arm_compute/runtime/CL/functions/CLDilate.h b/arm_compute/runtime/CL/functions/CLDilate.h
index adb9cf4e6c..e15621b5a4 100644
--- a/arm_compute/runtime/CL/functions/CLDilate.h
+++ b/arm_compute/runtime/CL/functions/CLDilate.h
@@ -31,13 +31,16 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
 
 /** Basic function to execute dilate. This function calls the following OpenCL kernels:
 *
 * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
 * -# @ref CLDilateKernel
-*
+ *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
 */
 class CLDilate : public ICLSimpleFunction
 {
diff --git a/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h
index 8107fa24f3..0afc9d3f38 100644
--- a/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h
@@ -24,8 +24,6 @@
 #ifndef ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYER_H
 #define ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYER_H
 
-#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
 #include "arm_compute/runtime/IFunction.h"
@@ -34,7 +32,11 @@
 namespace arm_compute
 {
+class CLCompileContext;
+class CLDirectConvolutionLayerKernel;
+class CLFillBorderKernel;
 class ICLTensor;
+class ITensorInfo;
 
 /** Basic function to execute direct convolution function:
 */
@@ -43,6 +45,12 @@ class CLDirectConvolutionLayer : public IFunction
 public:
     /** Default constructor */
     CLDirectConvolutionLayer();
+    /** Prevent instances of this class from being copied */
+    CLDirectConvolutionLayer(const CLDirectConvolutionLayer &) = delete;
+    /** Prevent instances of this class from being copied */
+    CLDirectConvolutionLayer &operator=(const CLDirectConvolutionLayer &) = delete;
+    /** Default destructor */
+    ~CLDirectConvolutionLayer();
     /** Set the input and output tensors.
      *
      * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -95,9 +103,9 @@ class CLDirectConvolutionLayer : public IFunction
     void run() override;
 
 private:
-    CLDirectConvolutionLayerKernel _direct_conv_kernel;
-    CLFillBorderKernel             _input_border_handler;
-    CLActivationLayer              _activationlayer_function;
+    std::unique_ptr<CLDirectConvolutionLayerKernel> _direct_conv_kernel;
+    std::unique_ptr<CLFillBorderKernel>             _input_border_handler;
+    CLActivationLayer                               _activationlayer_function;
 
     bool _is_activationlayer_enabled;
 };
diff --git a/arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h b/arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h
index 5208bfe404..72b5b7dee8 100644
--- a/arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h
+++ b/arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h
@@ -29,7 +29,9 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
+class ITensorInfo;
 
 /** Basic function to perform inverse square root on an input tensor.
 */
 class CLRsqrtLayer : public IFunction
diff --git a/arm_compute/runtime/CL/functions/CLElementwiseOperations.h b/arm_compute/runtime/CL/functions/CLElementwiseOperations.h
index 2d9d43863d..55c5fb3455 100644
--- a/arm_compute/runtime/CL/functions/CLElementwiseOperations.h
+++ b/arm_compute/runtime/CL/functions/CLElementwiseOperations.h
@@ -24,13 +24,14 @@
 #ifndef ARM_COMPUTE_CLELEMENTWISEOPERATIONS_H
 #define ARM_COMPUTE_CLELEMENTWISEOPERATIONS_H
 
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
 #include "arm_compute/runtime/CL/ICLOperator.h"
 #include "arm_compute/runtime/IFunction.h"
 
 namespace arm_compute
 {
 class ICLTensor;
+class CLCompileContext;
+class ITensorInfo;
 
 namespace experimental
 {
@@ -99,9 +100,6 @@ class CLArithmeticAddition : public ICLOperator
 
     // Inherited methods overridden:
     void run(ITensorPack &tensors) override;
-
-private:
-    CLFillBorderKernel _border_handler;
 };
 
 /** Basic function to run @ref CLSaturatedArithmeticOperationKernel for subtraction
@@ -169,9 +167,6 @@ class CLArithmeticSubtraction : public ICLOperator
 
     // Inherited methods overridden:
     void run(ITensorPack &tensors) override;
-
-private:
-    CLFillBorderKernel _border_handler;
 };
 
 /** Basic function to run @ref CLSaturatedArithmeticOperationKernel for division
@@ -208,9 +203,6 @@ class CLArithmeticDivision : public ICLOperator
 
     // Inherited methods overridden:
     void run(ITensorPack &tensors) override;
-
-private:
-    CLFillBorderKernel _border_handler;
 };
 
 /** Basic function to run @ref CLArithmeticOperationKernel for max
@@ -247,9 +239,6 @@ class CLElementwiseMax : public ICLOperator
 
     // Inherited methods overridden:
     void run(ITensorPack &tensors) override;
-
-private:
-    CLFillBorderKernel _border_handler;
 };
 
 /** Basic function to run @ref CLArithmeticOperationKernel for min
@@ -286,9 +275,6 @@ class CLElementwiseMin : public ICLOperator
 
     // Inherited methods overridden:
     void run(ITensorPack &tensors) override;
-
-private:
-    CLFillBorderKernel _border_handler;
 };
 
 /** Basic function to run @ref CLArithmeticOperationKernel for squared difference
@@ -325,9 +311,6 @@ class CLElementwiseSquaredDiff : public ICLOperator
 
     // Inherited methods overridden:
     void run(ITensorPack &tensors) override;
-
-private:
-    CLFillBorderKernel _border_handler;
 };
 
 /** Basic function to run @ref CLArithmeticOperationKernel for power
@@ -364,9 +347,6 @@ class CLElementwisePower : public ICLOperator
 
     // Inherited methods overridden:
     void run(ITensorPack &tensors) override;
-
-private:
-    CLFillBorderKernel _border_handler;
 };
 } // namespace experimental
diff --git a/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h b/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h
index 883f330b33..41479e3f22 100644
--- a/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h
+++ b/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h
@@ -24,16 +24,19 @@
 #ifndef ARM_COMPUTE_CLEQUALIZEHISTOGRAM_H
 #define ARM_COMPUTE_CLEQUALIZEHISTOGRAM_H
 
-#include "arm_compute/core/CL/kernels/CLHistogramKernel.h"
-#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h"
 #include "arm_compute/runtime/CL/CLDistribution1D.h"
 #include "arm_compute/runtime/CL/CLLut.h"
 #include "arm_compute/runtime/IFunction.h"
 
 #include <cstdint>
+#include <memory>
 
 namespace arm_compute
 {
+class CLCompileContext;
+class CLHistogramKernel;
+class CLHistogramBorderKernel;
+class CLTableLookupKernel;
 class ICLTensor;
 using ICLImage = ICLTensor;
 
@@ -42,12 +45,20 @@ using ICLImage = ICLTensor;
 * -# @ref CLHistogramKernel
 * -# @ref CLTableLookupKernel
 *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
 */
 class CLEqualizeHistogram : public IFunction
 {
 public:
     /** Default Constructor. */
     CLEqualizeHistogram();
+    /** Prevent instances of this class from being copied */
+    CLEqualizeHistogram(const CLEqualizeHistogram &) = delete;
+    /** Prevent instances of this class from being copied */
+    CLEqualizeHistogram &operator=(const CLEqualizeHistogram &) = delete;
+    /** Default destructor */
+    ~CLEqualizeHistogram();
     /** Initialise the kernel's inputs.
      *
      * @param[in] input Input image. Data types supported: U8.
@@ -66,14 +77,14 @@ class CLEqualizeHistogram : public IFunction
     void run() override;
 
 private:
-    CLHistogramKernel       _histogram_kernel;        /**< Kernel that calculates the histogram of input. */
-    CLHistogramBorderKernel _border_histogram_kernel; /**< Kernel that calculates the histogram on the borders. */
-    CLTableLookupKernel     _map_histogram_kernel;    /**< Kernel that maps the input to output using the lut. */
-    CLDistribution1D        _hist;                    /**< Distribution that holds the histogram of the input image. */
-    CLDistribution1D        _cum_dist;                /**< Distribution that holds the cummulative distribution of the input histogram. */
-    CLLut                   _cd_lut;                  /**< Holds the equalization lookuptable. */
-    static const uint32_t   max_range = 256;          /**< Histogram range of the internal histograms. */
-    static const uint32_t   nr_bins   = 256;          /**< Histogram bins of the internal histograms. */
+    std::unique_ptr<CLHistogramKernel>       _histogram_kernel;        /**< Kernel that calculates the histogram of input. */
+    std::unique_ptr<CLHistogramBorderKernel> _border_histogram_kernel; /**< Kernel that calculates the histogram on the borders. */
+    std::unique_ptr<CLTableLookupKernel>     _map_histogram_kernel;    /**< Kernel that maps the input to output using the lut. */
+    CLDistribution1D                         _hist;                    /**< Distribution that holds the histogram of the input image. */
+    CLDistribution1D                         _cum_dist;                /**< Distribution that holds the cumulative distribution of the input histogram. */
+    CLLut                                    _cd_lut;                  /**< Holds the equalization lookup table. */
+    static const uint32_t                    max_range = 256;          /**< Histogram range of the internal histograms. */
+    static const uint32_t                    nr_bins   = 256;          /**< Histogram bins of the internal histograms. */
 };
 }
 #endif /*ARM_COMPUTE_CLEQUALIZEHISTOGRAM_H */
diff --git a/arm_compute/runtime/CL/functions/CLErode.h b/arm_compute/runtime/CL/functions/CLErode.h
index f8f1c72bc0..bd66ed983b 100644
--- a/arm_compute/runtime/CL/functions/CLErode.h
+++ b/arm_compute/runtime/CL/functions/CLErode.h
@@ -31,13 +31,16 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
 
 /** Basic function to execute erode. This function calls the following OpenCL kernels:
 *
 * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
 * -# @ref CLErodeKernel
-*
+ *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
 */
 class CLErode : public ICLSimpleFunction
 {
diff --git a/arm_compute/runtime/CL/functions/CLFFT1D.h b/arm_compute/runtime/CL/functions/CLFFT1D.h
index a6a35ab320..e88ee7650d 100644
--- a/arm_compute/runtime/CL/functions/CLFFT1D.h
+++ b/arm_compute/runtime/CL/functions/CLFFT1D.h
@@ -26,9 +26,6 @@
 
 #include "arm_compute/runtime/IFunction.h"
 
-#include "arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h"
-#include "arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h"
-#include "arm_compute/core/CL/kernels/CLFFTScaleKernel.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/FunctionDescriptors.h"
 #include "arm_compute/runtime/MemoryGroup.h"
@@ -36,6 +33,9 @@
 namespace arm_compute
 {
 // Forward declaration
+class CLFFTDigitReverseKernel;
+class CLFFTRadixStageKernel;
+class CLFFTScaleKernel;
 class ICLTensor;
 
 /** Basic function to execute one dimensional FFT. This function calls the following OpenCL kernels:
@@ -49,6 +49,16 @@ class CLFFT1D : public IFunction
 public:
     /** Default Constructor */
     CLFFT1D(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied */
+    CLFFT1D(const CLFFT1D &) = delete;
+    /** Prevent instances of this class from being copied */
+    CLFFT1D &operator=(const CLFFT1D &) = delete;
+    /** Default move constructor */
+    CLFFT1D(CLFFT1D &&) = default;
+    /** Default move assignment operator */
+    CLFFT1D &operator=(CLFFT1D &&) = default;
+    /** Default destructor */
+    ~CLFFT1D();
     /** Initialise the function's source, destinations and border mode.
      *
      * @param[in] input Source tensor. Data types supported: F32.
@@ -78,14 +88,14 @@ class CLFFT1D : public IFunction
     void run() override;
 
 protected:
-    MemoryGroup                        _memory_group;
-    CLFFTDigitReverseKernel            _digit_reverse_kernel;
-    std::vector<CLFFTRadixStageKernel> _fft_kernels;
-    CLFFTScaleKernel                   _scale_kernel;
-    CLTensor                           _digit_reversed_input;
-    CLTensor                           _digit_reverse_indices;
-    unsigned int                       _num_ffts;
-    bool                               _run_scale;
+    MemoryGroup                                         _memory_group;
+    std::unique_ptr<CLFFTDigitReverseKernel>            _digit_reverse_kernel;
+    std::vector<std::unique_ptr<CLFFTRadixStageKernel>> _fft_kernels;
+    std::unique_ptr<CLFFTScaleKernel>                   _scale_kernel;
+    CLTensor                                            _digit_reversed_input;
+    CLTensor                                            _digit_reverse_indices;
+    unsigned int                                        _num_ffts;
+    bool                                                _run_scale;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_CLFFT1D_H */
diff --git a/arm_compute/runtime/CL/functions/CLFFT2D.h b/arm_compute/runtime/CL/functions/CLFFT2D.h
index 9ceebeaa32..c54127f209 100644
--- a/arm_compute/runtime/CL/functions/CLFFT2D.h
+++ b/arm_compute/runtime/CL/functions/CLFFT2D.h
@@ -46,6 +46,16 @@ class CLFFT2D : public IFunction
 public:
     /** Default Constructor */
     CLFFT2D(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied */
+    CLFFT2D(const CLFFT2D &) = delete;
+    /** Prevent instances of this class from being copied */
+    CLFFT2D &operator=(const CLFFT2D &) = delete;
+    /** Default move constructor */
+    CLFFT2D(CLFFT2D &&) = default;
+    /** Default move assignment operator */
+    CLFFT2D &operator=(CLFFT2D &&) = default;
+    /** Default destructor */
+    ~CLFFT2D();
     /** Initialise the function's source, destinations and border mode.
      *
      * @param[in] input Source tensor. Data types supported: F32.
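
// A hedged sketch of driving the 1D FFT above on a complex input, where the
// two tensor channels hold the real and imaginary parts. The length and the
// FFT1DInfo field values are illustrative assumptions:
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFFT1D.h"
#include "arm_compute/runtime/FunctionDescriptors.h"

using namespace arm_compute;

void run_fft1d_example()
{
    CLScheduler::get().default_init();

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(128U), 2, DataType::F32)); // 2 channels = complex
    dst.allocator()->init(TensorInfo(TensorShape(128U), 2, DataType::F32));

    FFT1DInfo fft_info{};
    fft_info.axis      = 0;                      // transform along the first dimension
    fft_info.direction = FFTDirection::Forward;

    CLFFT1D fft;
    fft.configure(&src, &dst, fft_info);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src ...
    fft.run();
    CLScheduler::get().sync();
}
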
diff --git a/arm_compute/runtime/CL/functions/CLFastCorners.h b/arm_compute/runtime/CL/functions/CLFastCorners.h
index 698cc67995..608fdf8002 100644
--- a/arm_compute/runtime/CL/functions/CLFastCorners.h
+++ b/arm_compute/runtime/CL/functions/CLFastCorners.h
@@ -25,7 +25,6 @@
 #define ARM_COMPUTE_CLFASTCORNERS_H
 
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/runtime/CL/CLArray.h"
@@ -40,6 +39,8 @@
 namespace arm_compute
 {
+class CLFastCornersKernel;
+class CLCopyToArrayKernel;
 class ICLTensor;
 using ICLImage = ICLTensor;
 
@@ -49,6 +50,8 @@ using ICLImage = ICLTensor;
 * -# @ref CLNonMaximaSuppression3x3Kernel (executed if nonmax_suppression == true)
 * -# @ref CLCopyToArrayKernel
 *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
 */
 class CLFastCorners : public IFunction
 {
@@ -59,6 +62,8 @@ class CLFastCorners : public IFunction
     CLFastCorners(const CLFastCorners &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     const CLFastCorners &operator=(const CLFastCorners &) = delete;
+    /** Default destructor */
+    ~CLFastCorners();
     /** Initialize the function's source, destination, conv and border_mode.
      *
      * @param[in] input Source image. Data types supported: U8.
@@ -88,18 +93,18 @@ class CLFastCorners : public IFunction
     void run() override;
 
 private:
-    MemoryGroup               _memory_group;
-    CLFastCornersKernel       _fast_corners_kernel;
-    CLNonMaximaSuppression3x3 _suppr_func;
-    CLCopyToArrayKernel       _copy_array_kernel;
-    CLImage                   _output;
-    CLImage                   _suppr;
-    Window                    _win;
-    bool                      _non_max;
-    unsigned int             *_num_corners;
-    cl::Buffer                _num_buffer;
-    ICLKeyPointArray         *_corners;
-    uint8_t                   _constant_border_value;
+    MemoryGroup                          _memory_group;
+    std::unique_ptr<CLFastCornersKernel> _fast_corners_kernel;
+    CLNonMaximaSuppression3x3            _suppr_func;
+    std::unique_ptr<CLCopyToArrayKernel> _copy_array_kernel;
+    CLImage                              _output;
+    CLImage                              _suppr;
+    Window                               _win;
+    bool                                 _non_max;
+    unsigned int                        *_num_corners;
+    cl::Buffer                           _num_buffer;
+    ICLKeyPointArray                    *_corners;
+    uint8_t                              _constant_border_value;
 };
 }
 #endif /*ARM_COMPUTE_CLFASTCORNERS_H */
diff --git a/arm_compute/runtime/CL/functions/CLFill.h b/arm_compute/runtime/CL/functions/CLFill.h
index b79b234158..fef8324432 100644
--- a/arm_compute/runtime/CL/functions/CLFill.h
+++ b/arm_compute/runtime/CL/functions/CLFill.h
@@ -30,6 +30,7 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
 
 /** Function to run @ref CLMemsetKernel to fill a tensor with a scalar value */
diff --git a/arm_compute/runtime/CL/functions/CLFillBorder.h b/arm_compute/runtime/CL/functions/CLFillBorder.h
index 18bc20e654..a4ad82dfd4 100644
--- a/arm_compute/runtime/CL/functions/CLFillBorder.h
+++ b/arm_compute/runtime/CL/functions/CLFillBorder.h
@@ -30,6 +30,7 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
 
 /** Basic function to run @ref CLFillBorderKernel */
diff --git a/arm_compute/runtime/CL/functions/CLFlattenLayer.h b/arm_compute/runtime/CL/functions/CLFlattenLayer.h
index b8139c2260..f5f4ff554f 100644
--- a/arm_compute/runtime/CL/functions/CLFlattenLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFlattenLayer.h
@@ -29,7 +29,9 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
+class ITensorInfo;
 
 /** Basic function to execute flatten. This function calls the following OpenCL kernel:
  *
diff --git a/arm_compute/runtime/CL/functions/CLFloor.h b/arm_compute/runtime/CL/functions/CLFloor.h
index 93c3639f89..85d7071194 100644
--- a/arm_compute/runtime/CL/functions/CLFloor.h
+++ b/arm_compute/runtime/CL/functions/CLFloor.h
@@ -30,7 +30,9 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
+class ITensorInfo;
 
 /** Basic function to run @ref CLFloorKernel */
 class CLFloor : public ICLSimpleFunction
diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
index 29788742d7..3f17e4a921 100644
--- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
@@ -26,7 +26,6 @@
 
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 
-#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h"
 #include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
diff --git a/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h b/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h
index de6d5617c2..e35905fcf1 100644
--- a/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h
+++ b/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h
@@ -24,14 +24,18 @@
 #ifndef ARM_COMPUTE_CLFUSEBATCHNORMALIZATION_H
 #define ARM_COMPUTE_CLFUSEBATCHNORMALIZATION_H
 
-#include "arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"
 
+#include <memory>
+
 namespace arm_compute
 {
 // Forward declarations
+class CLCompileContext;
+class CLFuseBatchNormalizationKernel;
 class ICLTensor;
+class ITensorInfo;
 
 /** Basic function to fuse the batch normalization node to a preceding convolution node */
 class CLFuseBatchNormalization : public IFunction
@@ -48,7 +52,7 @@ class CLFuseBatchNormalization : public IFunction
     /** Allow instances of this class to be moved */
     CLFuseBatchNormalization &operator=(CLFuseBatchNormalization &&) = default;
     /** Default destructor */
-    ~CLFuseBatchNormalization() = default;
+    ~CLFuseBatchNormalization();
     /** Set the input and output tensors.
      *
     * @param[in] input_weights Input weights tensor for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
@@ -112,7 +116,7 @@ class CLFuseBatchNormalization : public IFunction
     void run() override;
 
 private:
-    CLFuseBatchNormalizationKernel _fuse_bn_kernel;
+    std::unique_ptr<CLFuseBatchNormalizationKernel> _fuse_bn_kernel;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_CLFUSEBATCHNORMALIZATION_H */
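
// What the batch-normalization fusion above computes, per output channel.
// This is the standard folding identity, shown here as a plain C++ sketch
// (not library code):
//
//   w_fused = w * gamma / sqrt(var + epsilon)
//   b_fused = (b - mean) * gamma / sqrt(var + epsilon) + beta
//
#include <cmath>

struct BnParams
{
    float mean, var, gamma, beta, epsilon;
};

// Fold one channel's batch-norm parameters into the convolution weight and bias.
inline void fuse_bn(float &w, float &b, const BnParams &p)
{
    const float scale = p.gamma / std::sqrt(p.var + p.epsilon);
    w = w * scale;
    b = (b - p.mean) * scale + p.beta;
}
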
diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h
index 8e4d3906d1..3d645bdbff 100644
--- a/arm_compute/runtime/CL/functions/CLGEMM.h
+++ b/arm_compute/runtime/CL/functions/CLGEMM.h
@@ -24,11 +24,6 @@
 #ifndef ARM_COMPUTE_CLGEMM_H
 #define ARM_COMPUTE_CLGEMM_H
 
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTypes.h"
 #include "arm_compute/runtime/IFunction.h"
@@ -36,9 +31,18 @@
 #include "arm_compute/runtime/IWeightsManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 
+#include <memory>
+
 namespace arm_compute
 {
+class CLCompileContext;
+class CLGEMMReshapeRHSMatrixKernel;
+class CLGEMMMatrixMultiplyKernel;
+class CLGEMMMatrixMultiplyReshapedKernel;
+class CLGEMMMatrixMultiplyReshapedOnlyRHSKernel;
+class CLGEMMReshapeLHSMatrixKernel;
 class ICLTensor;
+class ITensorInfo;
 
 namespace weights_transformations
 {
@@ -46,41 +50,36 @@ namespace weights_transformations
 class CLGEMMReshapeRHSMatrixKernelManaged : public ITransformWeights
 {
 public:
+    /** Default constructor */
+    CLGEMMReshapeRHSMatrixKernelManaged();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMReshapeRHSMatrixKernelManaged(const CLGEMMReshapeRHSMatrixKernelManaged &) = delete;
+    /** Default move constructor */
+    CLGEMMReshapeRHSMatrixKernelManaged(CLGEMMReshapeRHSMatrixKernelManaged &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMReshapeRHSMatrixKernelManaged &operator=(const CLGEMMReshapeRHSMatrixKernelManaged &) = delete;
+    /** Default move assignment operator */
+    CLGEMMReshapeRHSMatrixKernelManaged &operator=(CLGEMMReshapeRHSMatrixKernelManaged &&) = default;
+    /** Default destructor */
+    ~CLGEMMReshapeRHSMatrixKernelManaged();
 
     //Inherited method override
-    void run() override
-    {
-        _output.allocator()->allocate();
-        CLScheduler::get().enqueue(_kernel, false);
-        _reshape_run = true;
-    }
+    void run() override;
 
     //Inherited method override
-    void release() override
-    {
-        _output.allocator()->free();
-    }
+    void release() override;
 
     //Inherited method override
-    ICLTensor *get_weights() override
-    {
-        return &_output;
-    }
+    ICLTensor *get_weights() override;
 
     //Inherited method override
-    uint32_t uid() override
-    {
-        return _uid;
-    }
+    uint32_t uid() override;
 
     /** Configures the @ref CLGEMMReshapeRHSMatrixKernel kernel
      *
     * @param[in] input Input tensor. Data types supported: All
     * @param[in] info RHS matrix information to be used for reshaping.
      */
-    void configure(const ICLTensor *input, GEMMRHSMatrixInfo info)
-    {
-        configure(CLKernelLibrary::get().get_compile_context(), input, info);
-    }
+    void configure(const ICLTensor *input, GEMMRHSMatrixInfo info);
 
     /** Configures the @ref CLGEMMReshapeRHSMatrixKernel kernel
      *
     * @param[in] compile_context The compile context to be used.
     * @param[in] input Input tensor. Data types supported: All
     * @param[in] info RHS matrix information to be used for reshaping.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, GEMMRHSMatrixInfo info)
-    {
-        _kernel.configure(compile_context, input, &_output, info);
-    }
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, GEMMRHSMatrixInfo info);
 
 private:
-    static constexpr uint32_t    _uid = 0x15;
-    CLTensor                     _output{};
-    CLGEMMReshapeRHSMatrixKernel _kernel{};
+    static constexpr uint32_t                     _uid{ 0x15 };
+    CLTensor                                      _output{};
+    std::unique_ptr<CLGEMMReshapeRHSMatrixKernel> _kernel;
 };
 } // namespace weights_transformations
 
@@ -126,6 +122,8 @@ class CLGEMM : public IFunction
     CLGEMM &operator=(const CLGEMM &) = delete;
     /** Default move assignment operator */
     CLGEMM &operator=(CLGEMM &&) = default;
+    /** Default destructor */
+    ~CLGEMM();
     /** Initialise the kernel's inputs and output
      *
     * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
@@ -185,7 +183,7 @@
     void prepare() override;
 
 private:
-    static CLGEMMKernelType select_gemm_kernel(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run);
+    static CLGEMMKernelType select_gemm_kernel(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type, bool reshape_b_only_on_first_run);
 
     void configure_native_v1(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info);
     void configure_reshaped_v1(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info);
@@ -198,20 +196,23 @@ class CLGEMM : public IFunction
     static Status validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
     static Status validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
 
-    MemoryGroup                                                  _memory_group;
-    IWeightsManager                                             *_weights_manager;
-    CLGEMMMatrixMultiplyKernel                                   _mm_kernel;
-    CLGEMMReshapeLHSMatrixKernel                                 _reshape_lhs_kernel;
-    CLGEMMReshapeRHSMatrixKernel                                 _reshape_rhs_kernel;
-    weights_transformations::CLGEMMReshapeRHSMatrixKernelManaged _reshape_rhs_kernel_managed;
-    CLGEMMMatrixMultiplyReshapedKernel                           _mm_reshaped_kernel;
-    CLGEMMMatrixMultiplyReshapedOnlyRHSKernel                    _mm_reshaped_only_rhs_kernel;
-    CLTensor                                                     _tmp_a;
-    CLTensor                                                     _tmp_b;
-    const ICLTensor                                             *_original_b;
-    bool                                                         _reshape_b_only_on_first_run;
-    bool                                                         _is_prepared;
-    CLGEMMKernelType                                             _gemm_kernel_type;
+    MemoryGroup                                                                   _memory_group;
+    IWeightsManager                                                              *_weights_manager;
+    std::unique_ptr<CLGEMMMatrixMultiplyKernel>                                   _mm_kernel;
+    std::unique_ptr<CLGEMMReshapeLHSMatrixKernel>                                 _reshape_lhs_kernel;
+    std::unique_ptr<CLGEMMReshapeRHSMatrixKernel>                                 _reshape_rhs_kernel;
+    std::unique_ptr<weights_transformations::CLGEMMReshapeRHSMatrixKernelManaged> _reshape_rhs_kernel_managed;
+    std::unique_ptr<CLGEMMMatrixMultiplyReshapedKernel>                           _mm_reshaped_kernel;
+    std::unique_ptr<CLGEMMMatrixMultiplyReshapedOnlyRHSKernel>                    _mm_reshaped_only_rhs_kernel;
+    std::unique_ptr<CLGEMMMatrixMultiplyReshapedOnlyRHSKernel>                    _mm_reshaped_only_rhs_fallback_kernel;
+    CLTensor                                                                      _tmp_a;
+    CLTensor                                                                      _tmp_b;
+    const ICLTensor                                                              *_original_b;
+    const ICLTensor                                                              *_lhs;
+    ICLTensor                                                                    *_dst;
+    bool                                                                          _reshape_b_only_on_first_run;
+    bool                                                                          _is_prepared;
+    CLGEMMKernelType                                                              _gemm_kernel_type;
 };
 } // namespace arm_compute
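
// A hedged usage sketch for CLGEMM above (alpha * A * B + beta * C); matrix
// dimensions are illustrative assumptions. In ACL's layout, TensorShape(x, y)
// is (width, height), so A with M rows and K columns is TensorShape(K, M):
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"

using namespace arm_compute;

void run_gemm_example()
{
    CLScheduler::get().default_init();

    CLTensor a, b, dst;
    a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32));   // A: M=32 x K=64
    b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::F32));   // B: K=64 x N=16
    dst.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32)); // D: M=32 x N=16

    CLGEMM gemm;
    gemm.configure(&a, &b, nullptr, &dst, 1.0f, 0.0f); // no C matrix; alpha = 1, beta = 0

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill a and b ...
    gemm.run();
    CLScheduler::get().sync();
}
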
diff --git a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
index 277b27f690..4dbd0f828a 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
@@ -26,9 +26,7 @@
 
 #include "arm_compute/runtime/IFunction.h"
 
-#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
-#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
-#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
@@ -43,6 +41,9 @@
 namespace arm_compute
 {
+class CLCol2ImKernel;
+class CLIm2ColKernel;
+class CLWeightsReshapeKernel;
 class ICLTensor;
 
 /** Function to reshape and transpose the weights. This function calls the following kernels:
@@ -53,6 +54,16 @@ class CLConvolutionLayerReshapeWeights : public IFunction
 public:
     /** Constructor */
     CLConvolutionLayerReshapeWeights();
+    /** Prevent instances of this class from being copied */
+    CLConvolutionLayerReshapeWeights(const CLConvolutionLayerReshapeWeights &) = delete;
+    /** Prevent instances of this class from being copied */
+    CLConvolutionLayerReshapeWeights &operator=(const CLConvolutionLayerReshapeWeights &) = delete;
+    /** Default move constructor */
+    CLConvolutionLayerReshapeWeights(CLConvolutionLayerReshapeWeights &&) = default;
+    /** Default move assignment operator */
+    CLConvolutionLayerReshapeWeights &operator=(CLConvolutionLayerReshapeWeights &&) = default;
+    /** Default destructor */
+    ~CLConvolutionLayerReshapeWeights();
     /** Set the input and output tensors.
      *
     * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
@@ -87,7 +98,7 @@ class CLConvolutionLayerReshapeWeights : public IFunction
     void run() override;
 
 private:
-    CLWeightsReshapeKernel _weights_reshape_kernel;
+    std::unique_ptr<CLWeightsReshapeKernel> _weights_reshape_kernel;
 };
 
 namespace weights_transformations
 {
@@ -158,8 +169,8 @@ class CLConvolutionLayerReshapeWeightsTransform : public ITransformWeights
 *
 * -# @ref CLIm2ColKernel
 * -# @ref CLGEMM (if the data type is FP32 or FP16)
-* -# @ref CLGEMMLowpMatrixMultiplyCore (if the data type is QASYMM8)
-* -# @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if the data type is QASYMM8)
+* -# @ref CLGEMMLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED)
+* -# @ref CLGEMMLowpOutputStage with QUANTIZE_DOWN_FIXEDPOINT type of quantization (if the data type is QASYMM8/QASYMM8_SIGNED)
 * -# @ref CLCol2ImKernel (if NCHW data layout)
 */
 class CLGEMMConvolutionLayer : public IFunction
@@ -179,6 +190,8 @@ class CLGEMMConvolutionLayer : public IFunction
     CLGEMMConvolutionLayer &operator=(const CLGEMMConvolutionLayer &) = delete;
     /** Default move assignment operator */
     CLGEMMConvolutionLayer &operator=(CLGEMMConvolutionLayer &&) = default;
+    /** Default destructor */
+    ~CLGEMMConvolutionLayer();
     /** Set the input and output tensors.
      *
     * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -288,10 +301,10 @@ class CLGEMMConvolutionLayer : public IFunction
     IWeightsManager *_weights_manager;
     CLConvolutionLayerReshapeWeights _reshape_weights;
     weights_transformations::CLConvolutionLayerReshapeWeightsTransform _reshape_weights_managed;
-    CLIm2ColKernel _im2col_kernel;
+    std::unique_ptr<CLIm2ColKernel> _im2col_kernel;
     CLGEMM _mm_gemm;
     CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
-    CLCol2ImKernel _col2im_kernel;
+    std::unique_ptr<CLCol2ImKernel> _col2im_kernel;
     CLActivationLayer _activationlayer_function;
 
     const ICLTensor *_original_weights;
diff --git a/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h
index 1fedeff444..32af0f9427 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h
@@ -24,7 +24,6 @@
 #ifndef ARM_COMPUTE_CLGEMMDECONVOLUTIONLAYER_H
 #define ARM_COMPUTE_CLGEMMDECONVOLUTIONLAYER_H
 
-#include "arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
 #include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
@@ -40,6 +39,7 @@
 namespace arm_compute
 {
+class CLDeconvolutionReshapeOutputKernel;
 class ICLTensor;
 
 /** Function to run the deconvolution layer through a call to GEMM.
 *
@@ -89,6 +89,8 @@ class CLGEMMDeconvolutionLayer : public IFunction
     CLGEMMDeconvolutionLayer &operator=(const CLGEMMDeconvolutionLayer &) = delete;
     /** Default move assignment operator */
     CLGEMMDeconvolutionLayer &operator=(CLGEMMDeconvolutionLayer &&) = default;
+    /** Default destructor */
+    ~CLGEMMDeconvolutionLayer();
     /** Set the input, weights, biases and output tensors.
      *
     * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs.
@@ -130,15 +132,15 @@ class CLGEMMDeconvolutionLayer : public IFunction
 private:
     MemoryGroup _memory_group;
 
-    CLGEMM                             _mm_gemm;
-    CLGEMMLowpMatrixMultiplyCore       _mm_gemmlowp;
-    CLGEMMLowpOutputStage              _gemmlowp_output_stage;
-    CLPermute                          _permute_input_to_nhwc;
-    CLPermute                          _permute_weights_to_nhwc;
-    CLReshapeLayer                     _reshape_weights;
-    CLTranspose                        _transpose_weights;
-    CLDeconvolutionReshapeOutputKernel _deconv_reshape;
-    CLSlice                            _slice_gemm;
+    CLGEMM                                              _mm_gemm;
+    CLGEMMLowpMatrixMultiplyCore                        _mm_gemmlowp;
+    CLGEMMLowpOutputStage                               _gemmlowp_output_stage;
+    CLPermute                                           _permute_input_to_nhwc;
+    CLPermute                                           _permute_weights_to_nhwc;
+    CLReshapeLayer                                      _reshape_weights;
+    CLTranspose                                         _transpose_weights;
+    std::unique_ptr<CLDeconvolutionReshapeOutputKernel> _deconv_reshape;
+    CLSlice                                             _slice_gemm;
 
     CLTensor _gemmlowp_final;
     CLTensor _reshaped_weights;
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
index 57b1e30df5..4cc8899690 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
@@ -24,21 +24,24 @@
 #ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H
 #define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H
 
-#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 
 namespace arm_compute
 {
+class CLCompileContext;
 class IMemoryManager;
 class ICLTensor;
+class ITensorInfo;
+class CLDepthConvertLayerKernel;
+class CLGEMMLowpMatrixMultiplyNativeKernel;
+class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel;
+class CLGEMMLowpOffsetContributionKernel;
+class CLGEMMLowpOffsetContributionOutputStageKernel;
+class CLGEMMLowpMatrixAReductionKernel;
+class CLGEMMLowpMatrixBReductionKernel;
+class CLGEMMReshapeRHSMatrixKernel;
 
 /** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. */
 class CLGEMMLowpMatrixMultiplyCore : public IFunction
@@ -54,6 +57,8 @@ class CLGEMMLowpMatrixMultiplyCore : public IFunction
     CLGEMMLowpMatrixMultiplyCore &operator=(const CLGEMMLowpMatrixMultiplyCore &) = delete;
     /** Default move assignment operator */
     CLGEMMLowpMatrixMultiplyCore &operator=(CLGEMMLowpMatrixMultiplyCore &&) = default;
+    /** Default destructor */
+    ~CLGEMMLowpMatrixMultiplyCore();
     /** Initialise the kernel's inputs, output
      *
     * @note GEMMLowp: low precision GEMM kernel. [A * B + C]
@@ -112,14 +117,14 @@ class CLGEMMLowpMatrixMultiplyCore : public IFunction
     MemoryGroup _memory_group;
 
     // Kernels used
-    CLDepthConvertLayerKernel                     _weights_to_qasymm8;
-    CLGEMMLowpMatrixMultiplyNativeKernel          _mm_native_kernel;
-    CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel _mm_reshaped_only_rhs_kernel;
-    CLGEMMReshapeRHSMatrixKernel                  _mtx_b_reshape_kernel;
-    CLGEMMLowpMatrixAReductionKernel              _mtx_a_reduction_kernel;
-    CLGEMMLowpMatrixBReductionKernel              _mtx_b_reduction_kernel;
-    CLGEMMLowpOffsetContributionKernel            _offset_contribution_kernel;
-    CLGEMMLowpOffsetContributionOutputStageKernel _offset_contribution_output_stage_kernel;
+    std::unique_ptr<CLDepthConvertLayerKernel>                     _weights_to_qasymm8;
+    std::unique_ptr<CLGEMMLowpMatrixMultiplyNativeKernel>          _mm_native_kernel;
+    std::unique_ptr<CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel> _mm_reshaped_only_rhs_kernel;
+    std::unique_ptr<CLGEMMReshapeRHSMatrixKernel>                  _mtx_b_reshape_kernel;
+    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel>              _mtx_a_reduction_kernel;
+    std::unique_ptr<CLGEMMLowpMatrixBReductionKernel>              _mtx_b_reduction_kernel;
+    std::unique_ptr<CLGEMMLowpOffsetContributionKernel>            _offset_contribution_kernel;
+    std::unique_ptr<CLGEMMLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
 
     // Temporary tensors
     CLTensor _qasymm8_weights;
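
// A hedged sketch of the low-precision path above: QASYMM8 inputs accumulate
// into S32. The shapes and quantization parameters are illustrative
// assumptions, not library defaults:
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"

using namespace arm_compute;

void run_gemmlowp_example()
{
    CLScheduler::get().default_init();

    TensorInfo a_info(TensorShape(64U, 32U), 1, DataType::QASYMM8);
    a_info.set_quantization_info(QuantizationInfo(0.5f, 10)); // scale, zero-point
    TensorInfo b_info(TensorShape(16U, 64U), 1, DataType::QASYMM8);
    b_info.set_quantization_info(QuantizationInfo(0.25f, 3));

    CLTensor a, b, acc;
    a.allocator()->init(a_info);
    b.allocator()->init(b_info);
    acc.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::S32)); // int32 accumulators

    CLGEMMLowpMatrixMultiplyCore gemmlowp;
    gemmlowp.configure(&a, &b, nullptr, &acc);

    a.allocator()->allocate();
    b.allocator()->allocate();
    acc.allocator()->allocate();

    // ... fill a and b ...
    gemmlowp.run();
    CLScheduler::get().sync();
}
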
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
index c6e95888e5..a4edab9b8f 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
@@ -24,8 +24,11 @@
 #ifndef ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H
 #define ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H
 
+#include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 
+#include <memory>
+
 /** This file contains all available output stages for GEMMLowp on OpenCL.
  *
  * In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyCore),
@@ -36,7 +39,11 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ITensor;
+class ICLTensor;
+class ITensorInfo;
+struct GEMMLowpOutputStageInfo;
 
 /** Basic function to execute CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint on OpenCL.
 *
@@ -61,7 +68,7 @@ class ITensor;
 *
 * This function calls the following OpenCL kernels:
 *
-* -# @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+* -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
 *
 * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
 * after the result is shifted right by result_shift
@@ -139,7 +146,7 @@ class CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint : public ICLSimpleFunc
 *
 * This function calls the following OpenCL kernels:
 *
-* -# @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
+* -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
 *
 * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
 * after the result is shifted right by result_shift
@@ -217,7 +224,7 @@ class CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint : public ICLSimpleFunct
 *
 * This function calls the following NEON kernels:
 *
-* -# @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
+* -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
 *
 * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
 * after the result is shifted right by result_shift
@@ -274,9 +281,7 @@ class CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint : public ICLSimpleFunc
 *
 * -# @ref CLGEMMLowpQuantizeDownInt32ScaleKernel
 * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
-* -# @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
-* -# @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
-* -# @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
+* -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
 */
 class CLGEMMLowpOutputStage : public ICLSimpleFunction
 {
@@ -300,7 +305,7 @@ class CLGEMMLowpOutputStage : public ICLSimpleFunction
     * @param[in] info GEMMLowp output stage metadata.
      */
     void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel
      *
     * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
    * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
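
// A hedged sketch of configuring the generic output stage above with the
// QUANTIZE_DOWN_FIXEDPOINT type. The multiplier, shift, offset and clamp
// values are illustrative assumptions; real values come from the model's
// quantization parameters:
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"

using namespace arm_compute;

void configure_output_stage(CLGEMMLowpOutputStage &stage, const CLTensor *acc_s32, const CLTensor *bias_s32, CLTensor *dst_q8)
{
    GEMMLowpOutputStageInfo info{};
    info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    info.gemmlowp_multiplier = 1073741824;          // fixed-point multiplier (example value)
    info.gemmlowp_shift      = 1;                   // rounding right shift (example value)
    info.gemmlowp_offset     = 10;                  // destination zero-point (example value)
    info.gemmlowp_min_bound  = 0;                   // optional ReLU-style clamp
    info.gemmlowp_max_bound  = 255;
    info.output_data_type    = DataType::QASYMM8;

    stage.configure(acc_s32, bias_s32, dst_q8, info); // S32 accumulators -> QASYMM8
}
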
diff --git a/arm_compute/runtime/CL/functions/CLGather.h b/arm_compute/runtime/CL/functions/CLGather.h
index e87a120ba1..9c659be6fc 100644
--- a/arm_compute/runtime/CL/functions/CLGather.h
+++ b/arm_compute/runtime/CL/functions/CLGather.h
@@ -25,11 +25,14 @@
 #ifndef ARM_COMPUTE_CLGATHER_H
 #define ARM_COMPUTE_CLGATHER_H
 
+#include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
+class ITensorInfo;
 
 /** Basic function to run @ref CLGatherKernel */
 class CLGather : public ICLSimpleFunction
diff --git a/arm_compute/runtime/CL/functions/CLGaussian3x3.h b/arm_compute/runtime/CL/functions/CLGaussian3x3.h
index 9fe3e9bb00..20ce2b4bea 100644
--- a/arm_compute/runtime/CL/functions/CLGaussian3x3.h
+++ b/arm_compute/runtime/CL/functions/CLGaussian3x3.h
@@ -31,6 +31,7 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
 
 /** Basic function to execute gaussian filter 3x3. This function calls the following OpenCL kernels:
 *
 * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
 * -# @ref CLGaussian3x3Kernel
 *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
 */
 class CLGaussian3x3 : public ICLSimpleFunction
 {
diff --git a/arm_compute/runtime/CL/functions/CLGaussian5x5.h b/arm_compute/runtime/CL/functions/CLGaussian5x5.h
index fb369d750b..d08cef21c3 100644
--- a/arm_compute/runtime/CL/functions/CLGaussian5x5.h
+++ b/arm_compute/runtime/CL/functions/CLGaussian5x5.h
@@ -24,8 +24,6 @@
 #ifndef ARM_COMPUTE_CLGAUSSIAN5X5_H
 #define ARM_COMPUTE_CLGAUSSIAN5X5_H
 
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/IFunction.h"
@@ -37,6 +35,10 @@
 namespace arm_compute
 {
+class CLCompileContext;
+class CLFillBorderKernel;
+class CLGaussian5x5HorKernel;
+class CLGaussian5x5VertKernel;
 class ICLTensor;
 
 /** Basic function to execute gaussian filter 5x5. This function calls the following OpenCL kernels:
@@ -45,6 +47,8 @@ class ICLTensor;
 * -# @ref CLGaussian5x5HorKernel
 * -# @ref CLGaussian5x5VertKernel
 *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
 */
 class CLGaussian5x5 : public IFunction
 {
@@ -54,6 +58,16 @@ class CLGaussian5x5 : public IFunction
     * @param[in] memory_manager (Optional) Memory manager.
      */
     CLGaussian5x5(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied */
+    CLGaussian5x5(const CLGaussian5x5 &) = delete;
+    /** Default move constructor */
+    CLGaussian5x5(CLGaussian5x5 &&) = default;
+    /** Prevent instances of this class from being copied */
+    CLGaussian5x5 &operator=(const CLGaussian5x5 &) = delete;
+    /** Default move assignment operator */
+    CLGaussian5x5 &operator=(CLGaussian5x5 &&) = default;
+    /** Default destructor */
+    ~CLGaussian5x5();
     /** Initialise the function's source, destinations and border mode.
      *
     * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
@@ -76,11 +90,11 @@ class CLGaussian5x5 : public IFunction
     void run() override;
 
 protected:
-    MemoryGroup             _memory_group;   /**< Function's memory group */
-    CLGaussian5x5HorKernel  _kernel_hor;     /**< Horizontal pass kernel */
-    CLGaussian5x5VertKernel _kernel_vert;    /**< Vertical pass kernel */
-    CLFillBorderKernel      _border_handler; /**< Kernel to handle image borders */
-    CLImage                 _tmp;            /**< Temporary buffer */
+    MemoryGroup                              _memory_group;   /**< Function's memory group */
+    std::unique_ptr<CLGaussian5x5HorKernel>  _kernel_hor;     /**< Horizontal pass kernel */
+    std::unique_ptr<CLGaussian5x5VertKernel> _kernel_vert;    /**< Vertical pass kernel */
+    std::unique_ptr<CLFillBorderKernel>      _border_handler; /**< Kernel to handle image borders */
+    CLImage                                  _tmp;            /**< Temporary buffer */
 };
 }
 #endif /*ARM_COMPUTE_CLGAUSSIAN5X5_H */
diff --git a/arm_compute/runtime/CL/functions/CLGaussianPyramid.h b/arm_compute/runtime/CL/functions/CLGaussianPyramid.h
index 70f324be11..70be6738a6 100644
--- a/arm_compute/runtime/CL/functions/CLGaussianPyramid.h
+++ b/arm_compute/runtime/CL/functions/CLGaussianPyramid.h
@@ -24,9 +24,6 @@
 #ifndef ARM_COMPUTE_CLGAUSSIANPYRAMID_H
 #define ARM_COMPUTE_CLGAUSSIANPYRAMID_H
 
-#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h"
-
-#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
 #include "arm_compute/core/IPyramid.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLPyramid.h"
@@ -38,9 +35,18 @@
 namespace arm_compute
 {
+class CLCompileContext;
+class CLFillBorderKernel;
 class ICLTensor;
+class CLGaussianPyramidHorKernel;
+class CLGaussianPyramidVertKernel;
+class CLScaleKernel;
 
-/** Common interface for all Gaussian pyramid functions */
+/** Common interface for all Gaussian pyramid functions
+ *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
+*/
 class CLGaussianPyramid : public IFunction
 {
 public:
@@ -55,7 +61,7 @@ class CLGaussianPyramid : public IFunction
     /** Allow instances of this class to be moved */
     CLGaussianPyramid &operator=(CLGaussianPyramid &&) = default;
     /** Default destructor */
-    virtual ~CLGaussianPyramid() = default;
+    ~CLGaussianPyramid();
     /** Initialise the function's source, destinations and border mode.
      *
     * @param[in, out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
@@ -93,6 +99,12 @@ class CLGaussianPyramidHalf : public CLGaussianPyramid
 public:
     /** Constructor */
     CLGaussianPyramidHalf();
+    /** Prevent instances of this class from being copied */
+    CLGaussianPyramidHalf(const CLGaussianPyramidHalf &) = delete;
+    /** Prevent instances of this class from being copied */
+    CLGaussianPyramidHalf &operator=(const CLGaussianPyramidHalf &) = delete;
+    /** Default destructor */
+    ~CLGaussianPyramidHalf();
 
     // Inherited methods overridden:
     void configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override;
@@ -100,10 +112,10 @@ class CLGaussianPyramidHalf : public CLGaussianPyramid
     void run() override;
 
 private:
-    std::vector<CLFillBorderKernel>          _horizontal_border_handler;
-    std::vector<CLFillBorderKernel>          _vertical_border_handler;
-    std::vector<CLGaussianPyramidHorKernel>  _horizontal_reduction;
-    std::vector<CLGaussianPyramidVertKernel> _vertical_reduction;
+    std::vector<std::unique_ptr<CLFillBorderKernel>>          _horizontal_border_handler;
+    std::vector<std::unique_ptr<CLFillBorderKernel>>          _vertical_border_handler;
+    std::vector<std::unique_ptr<CLGaussianPyramidHorKernel>>  _horizontal_reduction;
+    std::vector<std::unique_ptr<CLGaussianPyramidVertKernel>> _vertical_reduction;
 };
 
 /** Basic function to execute gaussian pyramid with ORB scale factor. This function calls the following OpenCL kernels and functions:
@@ -124,8 +136,8 @@ class CLGaussianPyramidOrb : public CLGaussianPyramid
     void run() override;
 
 private:
-    std::vector<CLGaussian5x5>  _gauss5x5;
-    std::vector<CLScaleKernel>  _scale_nearest;
+    std::vector<CLGaussian5x5>                  _gauss5x5;
+    std::vector<std::unique_ptr<CLScaleKernel>> _scale_nearest;
 };
 }
 #endif /*ARM_COMPUTE_CLGAUSSIANPYRAMID_H */
      *
@@ -130,16 +137,16 @@ class CLGenerateProposalsLayer : public IFunction
     MemoryGroup _memory_group;
 
     // OpenCL kernels
-    CLPermuteKernel              _permute_deltas_kernel;
-    CLReshapeLayer               _flatten_deltas;
-    CLPermuteKernel              _permute_scores_kernel;
-    CLReshapeLayer               _flatten_scores;
-    CLComputeAllAnchorsKernel    _compute_anchors_kernel;
-    CLBoundingBoxTransformKernel _bounding_box_kernel;
-    CLPadLayerKernel             _pad_kernel;
-    CLDequantizationLayerKernel  _dequantize_anchors;
-    CLDequantizationLayerKernel  _dequantize_deltas;
-    CLQuantizationLayerKernel    _quantize_all_proposals;
+    std::unique_ptr<CLPermuteKernel>              _permute_deltas_kernel;
+    CLReshapeLayer                                _flatten_deltas;
+    std::unique_ptr<CLPermuteKernel>              _permute_scores_kernel;
+    CLReshapeLayer                                _flatten_scores;
+    std::unique_ptr<CLComputeAllAnchorsKernel>    _compute_anchors_kernel;
+    std::unique_ptr<CLBoundingBoxTransformKernel> _bounding_box_kernel;
+    std::unique_ptr<CLPadLayerKernel>             _pad_kernel;
+    std::unique_ptr<CLDequantizationLayerKernel>  _dequantize_anchors;
+    std::unique_ptr<CLDequantizationLayerKernel>  _dequantize_deltas;
+    std::unique_ptr<CLQuantizationLayerKernel>    _quantize_all_proposals;
 
     // CPP functions
     CPPBoxWithNonMaximaSuppressionLimit _cpp_nms;
diff --git a/arm_compute/runtime/CL/functions/CLHOGDescriptor.h b/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
index dad7e6edf8..87bcd7f49e 100644
--- a/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
+++ b/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
@@ -24,7 +24,6 @@
 #ifndef ARM_COMPUTE_CLHOGDESCRIPTOR_H
 #define ARM_COMPUTE_CLHOGDESCRIPTOR_H
 
-#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
@@ -37,18 +36,28 @@
 namespace arm_compute
 {
 class IHOG;
+class CLHOGOrientationBinningKernel;
+class CLHOGBlockNormalizationKernel;
 /** Basic function to calculate HOG descriptor. This function calls the following OpenCL kernels:
  *
  * -# @ref CLHOGGradient
  * -# @ref CLHOGOrientationBinningKernel
  * -# @ref CLHOGBlockNormalizationKernel
  *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
  */
 class CLHOGDescriptor : public IFunction
 {
 public:
     /** Default constructor */
     CLHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied */
+    CLHOGDescriptor(const CLHOGDescriptor &) = delete;
+    /** Prevent instances of this class from being copied */
+    CLHOGDescriptor &operator=(const CLHOGDescriptor &) = delete;
+    /** Default destructor */
+    ~CLHOGDescriptor();
     /** Initialise the function's source, destination, HOG data-object and border mode
      *
      * @param[in, out] input Input tensor. Data type supported: U8
@@ -75,13 +84,13 @@ class CLHOGDescriptor : public IFunction
     void run() override;
 
 private:
-    MemoryGroup                   _memory_group;
-    CLHOGGradient                 _gradient;
-    CLHOGOrientationBinningKernel _orient_bin;
-    CLHOGBlockNormalizationKernel _block_norm;
-    CLTensor                      _mag;
-    CLTensor                      _phase;
-    CLTensor                      _hog_space;
+    MemoryGroup                                    _memory_group;
+    CLHOGGradient                                  _gradient;
+    std::unique_ptr<CLHOGOrientationBinningKernel> _orient_bin;
+    std::unique_ptr<CLHOGBlockNormalizationKernel> _block_norm;
+    CLTensor                                       _mag;
+    CLTensor                                       _phase;
+    CLTensor                                       _hog_space;
 };
 }
diff --git a/arm_compute/runtime/CL/functions/CLHOGDetector.h b/arm_compute/runtime/CL/functions/CLHOGDetector.h
index 6697b5c24d..539a521797 100644
--- a/arm_compute/runtime/CL/functions/CLHOGDetector.h
+++ b/arm_compute/runtime/CL/functions/CLHOGDetector.h
@@ -24,17 +24,26 @@
 #ifndef ARM_COMPUTE_CLHOGDETECTOR_H
 #define ARM_COMPUTE_CLHOGDETECTOR_H
 
+#include "arm_compute/core/CL/ICLArray.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h"
 #include "arm_compute/core/IHOG.h"
 #include "arm_compute/runtime/IFunction.h"
 
+#include <memory>
+
 namespace arm_compute
 {
+class CLCompileContext;
+class CLHOGDetectorKernel;
+class ICLTensor;
+class ICLHOG;
+
 /** Basic function to execute HOG detector based on linear SVM. This function calls the following OpenCL kernel:
  *
  * -# @ref CLHOGDetectorKernel
  *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
  */
 class CLHOGDetector : public IFunction
 {
@@ -50,7 +59,7 @@ class CLHOGDetector : public IFunction
     /** Allow instances of this class to be moved */
     CLHOGDetector &operator=(CLHOGDetector &&) = default;
     /** Default destructor */
-    ~CLHOGDetector() = default;
+    ~CLHOGDetector();
     /** Initialise the kernel's input, output, HOG data object, detection window stride, threshold and index class
      *
      * @attention The function does not reset the number of values in @ref IDetectionWindowArray so it is caller's responsibility to clear it.
@@ -78,16 +87,16 @@ class CLHOGDetector : public IFunction
      * @param[in] idx_class        (Optional) Index of the class used for evaluating which class the detection window belongs to
      */
     void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride,
-                   float threshold = 0.0f,
+                   float  threshold = 0.0f,
                    size_t idx_class = 0);
 
     // Inherited methods overridden:
     void run() override;
 
 private:
-    CLHOGDetectorKernel      _hog_detector_kernel;
-    ICLDetectionWindowArray *_detection_windows;
-    cl::Buffer               _num_detection_windows;
+    std::unique_ptr<CLHOGDetectorKernel> _hog_detector_kernel;
+    ICLDetectionWindowArray             *_detection_windows;
+    cl::Buffer                           _num_detection_windows;
 };
 }
diff --git a/arm_compute/runtime/CL/functions/CLHOGGradient.h b/arm_compute/runtime/CL/functions/CLHOGGradient.h
index b0589027e7..569490f333 100644
--- a/arm_compute/runtime/CL/functions/CLHOGGradient.h
+++ b/arm_compute/runtime/CL/functions/CLHOGGradient.h
@@ -24,9 +24,6 @@
 #ifndef ARM_COMPUTE_CLHOGGRADIENT_H
 #define ARM_COMPUTE_CLHOGGRADIENT_H
 
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLDerivative.h"
@@ -39,11 +36,16 @@
 namespace arm_compute
 {
+class CLCompileContext;
+class CLMagnitudePhaseKernel;
+class ITensorInfo;
 /** Basic function to calculate the gradient for HOG. This function calls the following OpenCL kernels:
  *
  * -# @ref CLDerivative
  * -# @ref CLMagnitudePhaseKernel
  *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
  */
 class CLHOGGradient : public IFunction
 {
@@ -79,11 +81,11 @@ class CLHOGGradient : public IFunction
     void run() override;
 
 private:
-    MemoryGroup            _memory_group;
-    CLDerivative           _derivative;
-    CLMagnitudePhaseKernel _mag_phase;
-    CLTensor               _gx;
-    CLTensor               _gy;
+    MemoryGroup                             _memory_group;
+    CLDerivative                            _derivative;
+    std::unique_ptr<CLMagnitudePhaseKernel> _mag_phase;
+    CLTensor                                _gx;
+    CLTensor                                _gy;
 };
 }
 #endif /*ARM_COMPUTE_CLHOGGRADIENT_H */
diff --git a/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h b/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
index e7631c2c5a..b9a51653f2 100644
--- a/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
+++ b/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
@@ -26,7 +26,6 @@
 
 #include "arm_compute/core/CL/ICLArray.h"
 #include "arm_compute/core/CL/ICLMultiHOG.h"
-#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
 #include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
@@ -39,6 +38,9 @@
 namespace arm_compute
 {
+class CLCompileContext;
+class CLHOGOrientationBinningKernel;
+class CLHOGBlockNormalizationKernel;
 /** Basic function to detect multiple objects (or the same object at different scales) on the same input image using HOG. This function calls the following kernels:
  *
  * -# @ref CLHOGGradient
@@ -52,6 +54,9 @@ namespace arm_compute
        -# Normalization type
        -# L2 hysteresis threshold if the normalization type is L2HYS_NORM
  *
+ *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
  */
 class CLHOGMultiDetection : public IFunction
 {
@@ -62,6 +67,8 @@ class CLHOGMultiDetection : public IFunction
     CLHOGMultiDetection(const CLHOGMultiDetection &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     CLHOGMultiDetection &operator=(const CLHOGMultiDetection &) = delete;
+    /** Default destructor */
+    ~CLHOGMultiDetection();
     /** Initialise the function's source, destination, detection window strides, border mode, threshold and non-maxima suppression
      *
      * @param[in, out] input Input tensor. Data type supported: U8
@@ -110,22 +117,22 @@ class CLHOGMultiDetection : public IFunction
     void run() override;
 
 private:
-    MemoryGroup                                  _memory_group;
-    CLHOGGradient                                _gradient_kernel;
-    std::vector<CLHOGOrientationBinningKernel>   _orient_bin_kernel;
-    std::vector<CLHOGBlockNormalizationKernel>   _block_norm_kernel;
-    std::vector<CLHOGDetector>                   _hog_detect_kernel;
-    CPPDetectionWindowNonMaximaSuppressionKernel _non_maxima_kernel;
-    std::vector<CLTensor>                        _hog_space;
-    std::vector<CLTensor>                        _hog_norm_space;
-    ICLDetectionWindowArray                     *_detection_windows;
-    CLTensor                                     _mag;
-    CLTensor                                     _phase;
-    bool                                         _non_maxima_suppression;
-    size_t                                       _num_orient_bin_kernel;
-    size_t                                       _num_block_norm_kernel;
-    size_t                                       _num_hog_detect_kernel;
+    MemoryGroup                                                 _memory_group;
+    CLHOGGradient                                               _gradient_kernel;
+    std::vector<std::unique_ptr<CLHOGOrientationBinningKernel>> _orient_bin_kernel;
+    std::vector<std::unique_ptr<CLHOGBlockNormalizationKernel>> _block_norm_kernel;
+    std::vector<CLHOGDetector>                                  _hog_detect_kernel;
+    CPPDetectionWindowNonMaximaSuppressionKernel                _non_maxima_kernel;
+    std::vector<CLTensor>                                       _hog_space;
+    std::vector<CLTensor>                                       _hog_norm_space;
+    ICLDetectionWindowArray                                    *_detection_windows;
+    CLTensor                                                    _mag;
+    CLTensor                                                    _phase;
+    bool                                                        _non_maxima_suppression;
+    size_t                                                      _num_orient_bin_kernel;
+    size_t                                                      _num_block_norm_kernel;
+    size_t                                                      _num_hog_detect_kernel;
 };
 }
-#endif /* ARM_COMPUTE_CLHOGMULTIDETECTION_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CLHOGMULTIDETECTION_H */
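Where a function owns one kernel instance per scale or per pyramid level, as CLHOGMultiDetection does above, the same hiding pattern is applied element-wise: the header keeps a vector of unique_ptr to the forward-declared kernel type, and the .cpp (where the type is complete) fills it. A hedged sketch with a hypothetical num_scales count, not this header's exact implementation:

    // In the .cpp, after including the kernel header from src/core:
    _orient_bin_kernel.clear();
    for(size_t i = 0; i < num_scales; ++i)
    {
        // allocate one kernel per scale; each is configured for its
        // own HOG data object before run() enqueues them
        _orient_bin_kernel.emplace_back(std::make_unique<CLHOGOrientationBinningKernel>());
    }

(std::make_unique, or the library's own make_unique helper, both work here since the element type is complete inside the .cpp.)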
diff --git a/arm_compute/runtime/CL/functions/CLHarrisCorners.h b/arm_compute/runtime/CL/functions/CLHarrisCorners.h
index 90d8c8873f..fc25c218ad 100644
--- a/arm_compute/runtime/CL/functions/CLHarrisCorners.h
+++ b/arm_compute/runtime/CL/functions/CLHarrisCorners.h
@@ -24,23 +24,23 @@
 #ifndef ARM_COMPUTE_CLHARRISCORNERS_H
 #define ARM_COMPUTE_CLHARRISCORNERS_H
 
-#include "arm_compute/runtime/IFunction.h"
-
 #include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
-#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
+#include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h"
+#include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h"
+#include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
-
 #include <cstdint>
 #include <memory>
 
 namespace arm_compute
 {
+class CLCompileContext;
+class CLHarrisScoreKernel;
+class CLFillBorderKernel;
 class ICLTensor;
 using ICLImage = ICLTensor;
@@ -56,6 +56,9 @@ using ICLImage = ICLTensor;
  * -# @ref CLNonMaximaSuppression3x3
  * -# @ref CPPCornerCandidatesKernel
  * -# @ref CPPSortEuclideanDistanceKernel
+ *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
  */
 class CLHarrisCorners : public IFunction
 {
@@ -66,6 +69,8 @@ class CLHarrisCorners : public IFunction
     CLHarrisCorners(const CLHarrisCorners &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     const CLHarrisCorners &operator=(const CLHarrisCorners &) = delete;
+    /** Default destructor */
+    ~CLHarrisCorners();
     /** Initialize the function's source, destination, conv and border_mode.
      *
      * @param[in,out] input Source image. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
@@ -104,21 +109,21 @@ class CLHarrisCorners : public IFunction
     void run() override;
 
 private:
-    MemoryGroup                    _memory_group;          /**< Function's memory group */
-    std::unique_ptr<IFunction>     _sobel;                 /**< Sobel function */
-    CLHarrisScoreKernel            _harris_score;          /**< Harris score kernel */
-    CLNonMaximaSuppression3x3      _non_max_suppr;         /**< Non-maxima suppression function */
-    CPPCornerCandidatesKernel      _candidates;            /**< Sort kernel */
-    CPPSortEuclideanDistanceKernel _sort_euclidean;        /**< Euclidean distance kernel */
-    CLFillBorderKernel             _border_gx;             /**< Border handler before running harris score */
-    CLFillBorderKernel             _border_gy;             /**< Border handler before running harris score */
-    CLImage                        _gx;                    /**< Source image - Gx component */
-    CLImage                        _gy;                    /**< Source image - Gy component */
-    CLImage                        _score;                 /**< Source image - Harris score */
-    CLImage                        _nonmax;                /**< Source image - Non-Maxima suppressed image */
-    std::vector<InternalKeypoint>  _corners_list;          /**< Array of InternalKeypoint. It stores the potential corner candidates */
-    int32_t                        _num_corner_candidates; /**< Number of potential corner candidates */
-    ICLKeyPointArray              *_corners;               /**< Output corners array */
+    MemoryGroup                          _memory_group;          /**< Function's memory group */
+    std::unique_ptr<IFunction>           _sobel;                 /**< Sobel function */
+    std::unique_ptr<CLHarrisScoreKernel> _harris_score;          /**< Harris score kernel */
+    CLNonMaximaSuppression3x3            _non_max_suppr;         /**< Non-maxima suppression function */
+    CPPCornerCandidatesKernel            _candidates;            /**< Sort kernel */
+    CPPSortEuclideanDistanceKernel       _sort_euclidean;        /**< Euclidean distance kernel */
+    std::unique_ptr<CLFillBorderKernel>  _border_gx;             /**< Border handler before running harris score */
+    std::unique_ptr<CLFillBorderKernel>  _border_gy;             /**< Border handler before running harris score */
+    CLImage                              _gx;                    /**< Source image - Gx component */
+    CLImage                              _gy;                    /**< Source image - Gy component */
+    CLImage                              _score;                 /**< Source image - Harris score */
+    CLImage                              _nonmax;                /**< Source image - Non-Maxima suppressed image */
+    std::vector<InternalKeypoint>        _corners_list;          /**< Array of InternalKeypoint. It stores the potential corner candidates */
+    int32_t                              _num_corner_candidates; /**< Number of potential corner candidates */
+    ICLKeyPointArray                    *_corners;               /**< Output corners array */
 };
 }
 #endif /*ARM_COMPUTE_CLHARRISCORNERS_H */
diff --git a/arm_compute/runtime/CL/functions/CLHistogram.h b/arm_compute/runtime/CL/functions/CLHistogram.h
index 7fdb8a9022..b45a79e10e 100644
--- a/arm_compute/runtime/CL/functions/CLHistogram.h
+++ b/arm_compute/runtime/CL/functions/CLHistogram.h
@@ -24,8 +24,8 @@
 #ifndef ARM_COMPUTE_CLHISTOGRAM_H
 #define ARM_COMPUTE_CLHISTOGRAM_H
 
-#include "arm_compute/core/CL/kernels/CLHistogramKernel.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "src/core/CL/kernels/CLHistogramKernel.h"
 
 namespace arm_compute
 {
@@ -37,6 +37,8 @@ class ICLTensor;
  * -# @ref CLHistogramKernel
  * -# @ref CLHistogramBorderKernel
  *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
  */
 class CLHistogram : public IFunction
 {
diff --git a/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h
index d7aa11cbc8..d41f3fedf6 100644
--- a/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h
@@ -24,11 +24,14 @@
 #ifndef ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYER_H
 #define ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYER_H
 
+#include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
+class ITensorInfo;
 
 /** Basic function to perform a Instance normalization.
  *
diff --git a/arm_compute/runtime/CL/functions/CLIntegralImage.h b/arm_compute/runtime/CL/functions/CLIntegralImage.h
index 6b10ede650..b6c98dc9ab 100644
--- a/arm_compute/runtime/CL/functions/CLIntegralImage.h
+++ b/arm_compute/runtime/CL/functions/CLIntegralImage.h
@@ -24,11 +24,15 @@
 #ifndef ARM_COMPUTE_CLINTEGRALIMAGE_H
 #define ARM_COMPUTE_CLINTEGRALIMAGE_H
 
-#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h"
 #include "arm_compute/runtime/IFunction.h"
 
+#include <memory>
+
 namespace arm_compute
 {
+class CLCompileContext;
+class CLIntegralImageHorKernel;
+class CLIntegralImageVertKernel;
 class ICLTensor;
 
 /** Basic function to execute integral image. This function calls the following OpenCL kernels:
@@ -36,12 +40,20 @@ class ICLTensor;
  * -# @ref CLIntegralImageHorKernel
  * -# @ref CLIntegralImageVertKernel
  *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
  */
 class CLIntegralImage : public IFunction
 {
 public:
     /** Default Constructor. */
     CLIntegralImage();
+    /** Prevent instances of this class from being copied */
+    CLIntegralImage(const CLIntegralImage &) = delete;
+    /** Prevent instances of this class from being copied */
+    CLIntegralImage &operator=(const CLIntegralImage &) = delete;
+    /** Default destructor */
+    ~CLIntegralImage();
     /** Initialise the function's source, destinations and border mode.
      *
      * @param[in] input Source tensor. Data types supported: U8.
@@ -60,8 +72,8 @@ class CLIntegralImage : public IFunction
     void run() override;
 
 protected:
-    CLIntegralImageHorKernel  _integral_hor;  /**< Integral Image Horizontal kernel */
-    CLIntegralImageVertKernel _integral_vert; /**< Integral Image Vertical kernel */
+    std::unique_ptr<CLIntegralImageHorKernel>  _integral_hor;  /**< Integral Image Horizontal kernel */
+    std::unique_ptr<CLIntegralImageVertKernel> _integral_vert; /**< Integral Image Vertical kernel */
 };
 }
 #endif /*ARM_COMPUTE_CLINTEGRALIMAGE_H */
diff --git a/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h
index bc79101d9d..401d249eb4 100644
--- a/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h
+++ b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h
@@ -24,7 +24,6 @@
 #ifndef ARM_COMPUTE_CLL2NORMALIZELAYER_H
 #define ARM_COMPUTE_CLL2NORMALIZELAYER_H
 
-#include "arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
@@ -37,7 +36,10 @@
 namespace arm_compute
 {
+class CLCompileContext;
+class CLL2NormalizeLayerKernel;
 class ICLTensor;
+class ITensorInfo;
 
 /** Basic function to perform a L2 normalization on a given axis.
  *
@@ -50,6 +52,16 @@ class CLL2NormalizeLayer : public IFunction
 public:
     /** Constructor */
     CLL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Default Destructor */
+    ~CLL2NormalizeLayer();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLL2NormalizeLayer(const CLL2NormalizeLayer &) = delete;
+    /** Default move constructor */
+    CLL2NormalizeLayer(CLL2NormalizeLayer &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLL2NormalizeLayer &operator=(const CLL2NormalizeLayer &) = delete;
+    /** Default move assignment operator */
+    CLL2NormalizeLayer &operator=(CLL2NormalizeLayer &&) = default;
     /** Set the input and output tensors.
      *
@@ -84,10 +96,10 @@ class CLL2NormalizeLayer : public IFunction
     void run() override;
 
 private:
-    MemoryGroup              _memory_group;
-    CLReductionOperation     _reduce_func;
-    CLL2NormalizeLayerKernel _normalize_kernel;
-    CLTensor                 _sumsq;
+    MemoryGroup                               _memory_group;
+    CLReductionOperation                      _reduce_func;
+    std::unique_ptr<CLL2NormalizeLayerKernel> _normalize_kernel;
+    CLTensor                                  _sumsq;
 };
 }
 #endif /*ARM_COMPUTE_CLL2NORMALIZELAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayer.h b/arm_compute/runtime/CL/functions/CLLSTMLayer.h
index 1a8b33463d..017f26aa1e 100644
--- a/arm_compute/runtime/CL/functions/CLLSTMLayer.h
+++ b/arm_compute/runtime/CL/functions/CLLSTMLayer.h
@@ -26,8 +26,6 @@
 
 #include "arm_compute/runtime/IFunction.h"
 
-#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
-#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
@@ -45,6 +43,10 @@
 namespace arm_compute
 {
+class CLCompileContext;
+class CLCopyKernel;
+class CLMemsetKernel;
+class CLTransposeKernel;
 class ICLTensor;
 
 /** This function performs a single time step in a Long Short-Term Memory (LSTM) layer.
@@ -55,6 +57,16 @@ class CLLSTMLayer : public IFunction
 public:
     /** Default constructor */
     CLLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied */
+    CLLSTMLayer(const CLLSTMLayer &) = delete;
+    /** Prevent instances of this class from being copied */
+    CLLSTMLayer &operator=(const CLLSTMLayer &) = delete;
+    /** Prevent instances of this class to be moved */
+    CLLSTMLayer(CLLSTMLayer &&) = delete;
+    /** Prevent instances of this class to be moved */
+    CLLSTMLayer &operator=(CLLSTMLayer &&) = delete;
+    /** Default destructor */
+    ~CLLSTMLayer();
     /** Initialize function's tensors.
      *
      * @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: F16/F32.
@@ -200,90 +212,90 @@ class CLLSTMLayer : public IFunction
     void prepare() override;
 
 private:
-    MemoryGroup                    _memory_group;
-    CLFullyConnectedLayer          _fully_connected_input_gate;
-    CLArithmeticAddition           _accum_input_gate1;
-    CLArithmeticSubtraction        _subtract_input_gate;
-    CLPixelWiseMultiplication      _pixelwise_mul_input_gate;
-    CLActivationLayer              _activation_input_gate;
-    CLFullyConnectedLayer          _fully_connected_forget_gate;
-    CLArithmeticAddition           _accum_forget_gate1;
-    CLPixelWiseMultiplication      _pixelwise_mul_forget_gate;
-    CLActivationLayer              _activation_forget_gate;
-    CLFullyConnectedLayer          _fully_connected_cell_state;
-    CLGEMM                         _gemm_cell_state1;
-    CLTransposeKernel              _transpose_cell_state;
-    CLArithmeticAddition           _accum_cell_state1;
-    CLArithmeticAddition           _accum_cell_state2;
-    CLPixelWiseMultiplication      _pixelwise_mul_cell_state1;
-    CLActivationLayer              _activation_cell_state;
-    CLActivationLayer              _cell_clip;
-    CLPixelWiseMultiplication      _pixelwise_mul_cell_state2;
-    CLFullyConnectedLayer          _fully_connected_output;
-    CLPixelWiseMultiplication      _pixelwise_mul_output_state1;
-    CLArithmeticAddition           _accum_output1;
-    CLActivationLayer              _activation_output;
-    CLActivationLayer              _activation_output_state;
-    CLPixelWiseMultiplication      _pixelwise_mul_output_state2;
-    CLFullyConnectedLayer          _fully_connected_output_state;
-    CLActivationLayer              _projection_clip;
-    CLCopyKernel                   _copy_cell_state;
-    CLCopyKernel                   _copy_output;
-    CLConcatenateLayer             _concat_scratch_buffer;
-    CLConcatenateLayer             _concat_inputs_forget_gate;
-    CLConcatenateLayer             _concat_weights_forget_gate;
-    CLConcatenateLayer             _concat_weights_input_gate;
-    CLConcatenateLayer             _concat_weights_output;
-    CLMemsetKernel                 _ones_memset_kernel;
-    CLMeanStdDevNormalizationLayer _mean_std_norm_input_gate;
-    CLPixelWiseMultiplication      _pixelwise_mul_input_gate_coeff;
-    CLArithmeticAddition           _accum_input_gate_bias;
-    CLMeanStdDevNormalizationLayer _mean_std_norm_forget_gate;
-    CLPixelWiseMultiplication      _pixelwise_mul_forget_gate_coeff;
-    CLArithmeticAddition           _accum_forget_gate_bias;
-    CLMeanStdDevNormalizationLayer _mean_std_norm_cell_gate;
-    CLPixelWiseMultiplication      _pixelwise_mul_cell_gate_coeff;
-    CLArithmeticAddition           _accum_cell_gate_bias;
-    CLMeanStdDevNormalizationLayer _mean_std_norm_output_gate;
-    CLPixelWiseMultiplication      _pixelwise_mul_output_gate_coeff;
-    CLArithmeticAddition           _accum_output_gate_bias;
-    CLTensor                       _input_gate_out1;
-    CLTensor                       _input_gate_out2;
-    CLTensor                       _input_gate_out3;
-    CLTensor                       _input_gate_out4;
-    CLTensor                       _forget_gate_out1;
-    CLTensor                       _forget_gate_out2;
-    CLTensor                       _forget_gate_out3;
-    CLTensor                       _forget_gate_out4;
-    CLTensor                       _forget_gate_out5;
-    CLTensor                       _forget_gate_out6;
-    CLTensor                       _cell_state_out1;
-    CLTensor                       _cell_state_out2;
-    CLTensor                       _cell_state_out3;
-    CLTensor                       _cell_state_out4;
-    CLTensor                       _cell_state_out5;
-    CLTensor                       _output1;
-    CLTensor                       _output2;
-    CLTensor                       _output3;
-    CLTensor                       _output4;
-    CLTensor                       _cell_state_activation;
-    CLTensor                       _output_state1;
-    CLTensor                       _ones;
-    CLTensor                       _input_layer_norm_out1;
-    CLTensor                       _input_layer_norm_out2;
-    CLTensor                       _forget_layer_norm_out1;
-    CLTensor                       _forget_layer_norm_out2;
-    CLTensor                       _cell_layer_norm_out1;
-    CLTensor                       _cell_layer_norm_out2;
-    CLTensor                       _output_layer_norm_out1;
-    CLTensor                       _output_layer_norm_out2;
-    bool                           _run_peephole_opt;
-    bool                           _run_cifg_opt;
-    bool                           _perform_cell_clipping;
-    bool                           _has_projection_weights;
-    bool                           _perform_projection_clipping;
-    bool                           _is_prepared;
-    bool                           _is_layer_norm_lstm;
+    MemoryGroup                        _memory_group;
+    CLFullyConnectedLayer              _fully_connected_input_gate;
+    CLArithmeticAddition               _accum_input_gate1;
+    CLArithmeticSubtraction            _subtract_input_gate;
+    CLPixelWiseMultiplication          _pixelwise_mul_input_gate;
+    CLActivationLayer                  _activation_input_gate;
+    CLFullyConnectedLayer              _fully_connected_forget_gate;
+    CLArithmeticAddition               _accum_forget_gate1;
+    CLPixelWiseMultiplication          _pixelwise_mul_forget_gate;
+    CLActivationLayer                  _activation_forget_gate;
+    CLFullyConnectedLayer              _fully_connected_cell_state;
+    CLGEMM                             _gemm_cell_state1;
+    std::unique_ptr<CLTransposeKernel> _transpose_cell_state;
+    CLArithmeticAddition               _accum_cell_state1;
+    CLArithmeticAddition               _accum_cell_state2;
+    CLPixelWiseMultiplication          _pixelwise_mul_cell_state1;
+    CLActivationLayer                  _activation_cell_state;
+    CLActivationLayer                  _cell_clip;
+    CLPixelWiseMultiplication          _pixelwise_mul_cell_state2;
+    CLFullyConnectedLayer              _fully_connected_output;
+    CLPixelWiseMultiplication          _pixelwise_mul_output_state1;
+    CLArithmeticAddition               _accum_output1;
+    CLActivationLayer                  _activation_output;
+    CLActivationLayer                  _activation_output_state;
+    CLPixelWiseMultiplication          _pixelwise_mul_output_state2;
+    CLFullyConnectedLayer              _fully_connected_output_state;
+    CLActivationLayer                  _projection_clip;
+    std::unique_ptr<CLCopyKernel>      _copy_cell_state;
+    std::unique_ptr<CLCopyKernel>      _copy_output;
+    CLConcatenateLayer                 _concat_scratch_buffer;
+    CLConcatenateLayer                 _concat_inputs_forget_gate;
+    CLConcatenateLayer                 _concat_weights_forget_gate;
+    CLConcatenateLayer                 _concat_weights_input_gate;
+    CLConcatenateLayer                 _concat_weights_output;
+    std::unique_ptr<CLMemsetKernel>    _ones_memset_kernel;
+    CLMeanStdDevNormalizationLayer     _mean_std_norm_input_gate;
+    CLPixelWiseMultiplication          _pixelwise_mul_input_gate_coeff;
+    CLArithmeticAddition               _accum_input_gate_bias;
+    CLMeanStdDevNormalizationLayer     _mean_std_norm_forget_gate;
+    CLPixelWiseMultiplication          _pixelwise_mul_forget_gate_coeff;
+    CLArithmeticAddition               _accum_forget_gate_bias;
+    CLMeanStdDevNormalizationLayer     _mean_std_norm_cell_gate;
+    CLPixelWiseMultiplication          _pixelwise_mul_cell_gate_coeff;
+    CLArithmeticAddition               _accum_cell_gate_bias;
+    CLMeanStdDevNormalizationLayer     _mean_std_norm_output_gate;
+    CLPixelWiseMultiplication          _pixelwise_mul_output_gate_coeff;
+    CLArithmeticAddition               _accum_output_gate_bias;
+    CLTensor                           _input_gate_out1;
+    CLTensor                           _input_gate_out2;
+    CLTensor                           _input_gate_out3;
+    CLTensor                           _input_gate_out4;
+    CLTensor                           _forget_gate_out1;
+    CLTensor                           _forget_gate_out2;
+    CLTensor                           _forget_gate_out3;
+    CLTensor                           _forget_gate_out4;
+    CLTensor                           _forget_gate_out5;
+    CLTensor                           _forget_gate_out6;
+    CLTensor                           _cell_state_out1;
+    CLTensor                           _cell_state_out2;
+    CLTensor                           _cell_state_out3;
+    CLTensor                           _cell_state_out4;
+    CLTensor                           _cell_state_out5;
+    CLTensor                           _output1;
+    CLTensor                           _output2;
+    CLTensor                           _output3;
+    CLTensor                           _output4;
+    CLTensor                           _cell_state_activation;
+    CLTensor                           _output_state1;
+    CLTensor                           _ones;
+    CLTensor                           _input_layer_norm_out1;
+    CLTensor                           _input_layer_norm_out2;
+    CLTensor                           _forget_layer_norm_out1;
+    CLTensor                           _forget_layer_norm_out2;
+    CLTensor                           _cell_layer_norm_out1;
+    CLTensor                           _cell_layer_norm_out2;
+    CLTensor                           _output_layer_norm_out1;
+    CLTensor                           _output_layer_norm_out2;
+    bool                               _run_peephole_opt;
+    bool                               _run_cifg_opt;
+    bool                               _perform_cell_clipping;
+    bool                               _has_projection_weights;
+    bool                               _perform_projection_clipping;
+    bool                               _is_prepared;
+    bool                               _is_layer_norm_lstm;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLLSTMLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h b/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h
index e1a8b25cfb..875b714edd 100644
--- a/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h
+++ b/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h
@@ -50,6 +50,9 @@ class ICLTensor;
  * difference between the two tensors is the corresponding level L(i) of the Laplacian pyramid.
  * L(i) = I(i) - Gaussian5x5(I(i))
  * Level 0 has always the same first two dimensions as the input tensor.
+ *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
  */
 class CLLaplacianPyramid : public IFunction
 {
diff --git a/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h b/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h
index 4ccc1a43e2..c780b56dd8 100644
--- a/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h
+++ b/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h
@@ -56,6 +56,9 @@ using ICLImage = ICLTensor;
  *  I(i-1) = upsample(I(i) + L(i))
  *
  *  output = I(0) + L(0)
+ *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
  */
 class CLLaplacianReconstruct : public IFunction
 {
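The two Laplacian hunks above carry the pyramid algebra in their doxygen: decomposition is L(i) = I(i) - Gaussian5x5(I(i)), and reconstruction inverts it level by level. A compact C++-style sketch of the reconstruction recurrence; Image, upsample(), add(), the level array L and the stored low-pass residual are placeholders for CLScale/CLArithmeticAddition-style operations, not ACL API:

    // n = number of pyramid levels; L[0..n-1] are the Laplacian levels and
    // "residual" is the low-pass image kept alongside the pyramid.
    Image img = residual;               // I(n-1)
    for(int i = n - 1; i > 0; --i)
    {
        img = upsample(add(img, L[i])); // I(i-1) = upsample(I(i) + L(i))
    }
    Image output = add(img, L[0]);      // output = I(0) + L(0), no final upsample

The last addition is deliberately outside the loop: the documented recurrence only upsamples between levels, while the level-0 sum is the final output.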
diff --git a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
index 59d0db663c..3bbf9f2c30 100644
--- a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
@@ -26,10 +26,6 @@
 
 #include "arm_compute/runtime/IFunction.h"
 
-#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
-#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
-#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
-#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/IMemoryManager.h"
@@ -39,7 +35,13 @@
 namespace arm_compute
 {
+class CLCompileContext;
+class CLCol2ImKernel;
+class CLIm2ColKernel;
+class CLWeightsReshapeKernel;
+class CLLocallyConnectedMatrixMultiplyKernel;
 class ICLTensor;
+class ITensorInfo;
 
 /** Basic function to compute the locally connected layer. This function calls the following OpenCL kernels:
  *
@@ -72,6 +74,7 @@ class CLLocallyConnectedLayer : public IFunction
      *                       Data types supported: Same as @p input.
      * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
      */
+    ARM_COMPUTE_DEPRECATED_REL(20.11)
     void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
     /** Set the input and output tensors.
      *
@@ -85,6 +88,7 @@ class CLLocallyConnectedLayer : public IFunction
      *                             Data types supported: Same as @p input.
      * @param[in]  conv_info       Contains padding and stride information described in @ref PadStrideInfo.
      */
+    ARM_COMPUTE_DEPRECATED_REL(20.11)
     void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLLocallyConnectedLayer
      *
@@ -106,16 +110,16 @@ class CLLocallyConnectedLayer : public IFunction
     void prepare() override;
 
 private:
-    MemoryGroup                            _memory_group;
-    CLIm2ColKernel                         _input_im2col_kernel;
-    CLWeightsReshapeKernel                 _weights_reshape_kernel;
-    CLLocallyConnectedMatrixMultiplyKernel _mm_kernel;
-    CLCol2ImKernel                         _output_col2im_kernel;
-    CLTensor                               _input_im2col_reshaped;
-    CLTensor                               _weights_reshaped;
-    CLTensor                               _gemm_output;
-    bool                                   _is_prepared;
-    const ICLTensor                       *_original_weights;
+    MemoryGroup                                             _memory_group;
+    std::unique_ptr<CLIm2ColKernel>                         _input_im2col_kernel;
+    std::unique_ptr<CLWeightsReshapeKernel>                 _weights_reshape_kernel;
+    std::unique_ptr<CLLocallyConnectedMatrixMultiplyKernel> _mm_kernel;
+    std::unique_ptr<CLCol2ImKernel>                         _output_col2im_kernel;
+    CLTensor                                                _input_im2col_reshaped;
+    CLTensor                                                _weights_reshaped;
+    CLTensor                                                _gemm_output;
+    bool                                                    _is_prepared;
+    const ICLTensor                                        *_original_weights;
 };
 }
 #endif /* ARM_COMPUTE_CLLOCALLYCONNECTEDLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLLogicalAnd.h b/arm_compute/runtime/CL/functions/CLLogicalAnd.h
new file mode 100644
index 0000000000..1a6ccf35a5
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLLogicalAnd.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLLOGICALAND_H
+#define ARM_COMPUTE_CLLOGICALAND_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/ICLOperator.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class CLCompileContext;
+class ICLTensor;
+class ITensorInfo;
+
+namespace experimental
+{
+class CLLogicalAnd : public ICLOperator
+{
+public:
+    /** Default Constructor */
+    CLLogicalAnd() = default;
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] input1          First tensor input. Data types supported: U8.
+     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] input2          Second tensor input. Data types supported: U8.
+     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     output          Output tensor. Data types supported: U8.
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLLogicalBinaryKernel
+     *
+     * @param[in] input1 First tensor input info. Data types supported: U8.
+     * @param[in] input2 Second tensor input info. Data types supported: U8.
+     * @param[in] output Output tensor info. Data types supported: U8.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+};
+} // namespace experimental
+
+/** Basic function to run @ref CLLogicalBinaryKernel.
+ *
+ * @note The tensor data type for the inputs must be U8.
+ * @note The function performs a logical AND operation using the two input tensors.
+ */
+class CLLogicalAnd : public IFunction
+{
+public:
+    /** Default Constructor */
+    CLLogicalAnd();
+    /** Default Destructor */
+    ~CLLogicalAnd();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLogicalAnd(const CLLogicalAnd &) = delete;
+    /** Default move constructor */
+    CLLogicalAnd(CLLogicalAnd &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLogicalAnd &operator=(const CLLogicalAnd &) = delete;
+    /** Default move assignment operator */
+    CLLogicalAnd &operator=(CLLogicalAnd &&);
+    /** Initialize the function
+     *
+     * @param[in]  input1 Input tensor. Data types supported: U8.
+     * @param[in]  input2 Input tensor. Data types supported: U8.
+     * @param[out] output Output tensor. Data types supported: U8.
+     */
+    void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output);
+    /** Initialize the function
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input1          Input tensor. Data types supported: U8.
+     * @param[in]  input2          Input tensor. Data types supported: U8.
+     * @param[out] output          Output tensor. Data types supported: U8.
+     */
+    void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * @param[in] input1 First tensor input info. Data types supported: U8.
+     * @param[in] input2 Second tensor input info. Data types supported: U8.
+     * @param[in] output Output tensor info. Data types supported: U8.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLLOGICALAND_H */
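The new CLLogicalAnd header pairs an IFunction facade (hiding its state behind a pimpl `struct Impl`) with an experimental operator. A hedged usage sketch of the facade path; tensor shape setup and allocation are abbreviated and this snippet is illustrative, not code from the patch:

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLLogicalAnd.h"

    using namespace arm_compute;

    void logical_and_example(CLTensor &a, CLTensor &b, CLTensor &out)
    {
        // a, b, out are U8 tensors with matching (or broadcastable) shapes,
        // already initialised and allocated by the caller; the CLScheduler
        // is assumed to have been default_init()-ed.
        CLLogicalAnd land;
        land.configure(&a, &b, &out); // validate() can be called on the TensorInfos first
        land.run();                   // enqueues the logical-AND kernel
        CLScheduler::get().sync();    // optionally wait for the result
    }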
diff --git a/arm_compute/runtime/CL/functions/CLLogicalNot.h b/arm_compute/runtime/CL/functions/CLLogicalNot.h
new file mode 100644
index 0000000000..4fdf39ed70
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLLogicalNot.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLLOGICALNOT_H
+#define ARM_COMPUTE_CLLOGICALNOT_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/ICLOperator.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class CLCompileContext;
+class ICLTensor;
+class ITensorInfo;
+
+namespace experimental
+{
+class CLLogicalNot : public ICLOperator
+{
+public:
+    /** Default Constructor */
+    CLLogicalNot() = default;
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] input           Tensor input. Data types supported: U8.
+     * @param[out]     output          Output tensor. Data types supported: U8.
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLElementWiseUnaryLayerKernel
+     *
+     * @param[in] input  Tensor input info. Data types supported: U8.
+     * @param[in] output Output tensor info. Data types supported: U8.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+};
+} // namespace experimental
+
+/** Basic function to do logical NOT operation
+ *
+ * @note The tensor data type for the inputs must be U8.
+ * @note The function performs a logical NOT operation on input tensor.
+ */
+class CLLogicalNot : public IFunction
+{
+public:
+    /** Default Constructor */
+    CLLogicalNot();
+    /** Default Destructor */
+    ~CLLogicalNot();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLogicalNot(const CLLogicalNot &) = delete;
+    /** Default move constructor */
+    CLLogicalNot(CLLogicalNot &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLogicalNot &operator=(const CLLogicalNot &) = delete;
+    /** Default move assignment operator */
+    CLLogicalNot &operator=(CLLogicalNot &&);
+    /** Initialize the function
+     *
+     * @param[in]  input  Input tensor. Data types supported: U8.
+     * @param[out] output Output tensor. Data types supported: U8.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output);
+    /** Initialize the function
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor. Data types supported: U8.
+     * @param[out] output          Output tensor. Data types supported: U8.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * @param[in] input  Tensor input info. Data types supported: U8.
+     * @param[in] output Output tensor info. Data types supported: U8.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLLOGICALNOT_H */
diff --git a/arm_compute/runtime/CL/functions/CLLogicalOr.h b/arm_compute/runtime/CL/functions/CLLogicalOr.h
new file mode 100644
index 0000000000..a50551e1dd
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLLogicalOr.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLLOGICALOR_H
+#define ARM_COMPUTE_CLLOGICALOR_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/ICLOperator.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class CLCompileContext;
+class ICLTensor;
+class ITensorInfo;
+
+namespace experimental
+{
+class CLLogicalOr : public ICLOperator
+{
+public:
+    /** Default Constructor */
+    CLLogicalOr() = default;
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] input1          First tensor input. Data types supported: U8.
+     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] input2          Second tensor input. Data types supported: U8.
+     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     output          Output tensor. Data types supported: U8.
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLLogicalBinaryKernel
+     *
+     * @param[in] input1 First tensor input info. Data types supported: U8.
+     * @param[in] input2 Second tensor input info. Data types supported: U8.
+     * @param[in] output Output tensor info. Data types supported: U8.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+};
+} // namespace experimental
+
+/** Basic function to run @ref CLLogicalBinaryKernel.
+ *
+ * @note The tensor data type for the inputs must be U8.
+ * @note The function performs a logical OR operation using the two input tensors.
+ */
+class CLLogicalOr : public IFunction
+{
+public:
+    /** Default Constructor */
+    CLLogicalOr();
+    /** Default Destructor */
+    ~CLLogicalOr();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLogicalOr(const CLLogicalOr &) = delete;
+    /** Default move constructor */
+    CLLogicalOr(CLLogicalOr &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLogicalOr &operator=(const CLLogicalOr &) = delete;
+    /** Default move assignment operator */
+    CLLogicalOr &operator=(CLLogicalOr &&);
+    /** Initialize the function
+     *
+     * @param[in]  input1 Input tensor. Data types supported: U8.
+     * @param[in]  input2 Input tensor. Data types supported: U8.
+     * @param[out] output Output tensor. Data types supported: U8.
+     */
+    void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output);
+    /** Initialize the function
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input1          Input tensor. Data types supported: U8.
+     * @param[in]  input2          Input tensor. Data types supported: U8.
+     * @param[out] output          Output tensor. Data types supported: U8.
+     */
+    void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * @param[in] input1 First tensor input info. Data types supported: U8.
+     * @param[in] input2 Second tensor input info. Data types supported: U8.
+     * @param[in] output Output tensor info. Data types supported: U8.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLLOGICALOR_H */
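All three logical headers also expose an experimental::ICLOperator variant whose run() takes an ITensorPack rather than storing tensor pointers at configure time, so one configured operator can be reused across different tensors. A hedged sketch, assuming the experimental tensor-pack IDs (ACL_SRC_0, ACL_SRC_1, ACL_DST) and the CLKernelLibrary compile-context accessor behave as in this release:

    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/experimental/Types.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLLogicalOr.h"

    using namespace arm_compute;

    void logical_or_operator_example(CLTensor &a, CLTensor &b, CLTensor &out)
    {
        // Configure once against the TensorInfos only...
        experimental::CLLogicalOr op;
        op.configure(CLKernelLibrary::get().get_compile_context(), a.info(), b.info(), out.info());

        // ...then bind concrete tensors at run time through an ITensorPack.
        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC_0, &a);
        pack.add_tensor(TensorType::ACL_SRC_1, &b);
        pack.add_tensor(TensorType::ACL_DST, &out);
        op.run(pack);
    }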
diff --git a/arm_compute/runtime/CL/functions/CLMagnitude.h b/arm_compute/runtime/CL/functions/CLMagnitude.h
index ad7cc778e5..4ed1414613 100644
--- a/arm_compute/runtime/CL/functions/CLMagnitude.h
+++ b/arm_compute/runtime/CL/functions/CLMagnitude.h
@@ -29,9 +29,14 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
 
-/** Basic function to run @ref CLMagnitudePhaseKernel. */
+/** Basic function to run @ref CLMagnitudePhaseKernel.
+ *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
+*/
 class CLMagnitude : public ICLSimpleFunction
 {
 public:
diff --git a/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h b/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h
index 5c8548f9e0..693862fb89 100644
--- a/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h
+++ b/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h
@@ -24,14 +24,19 @@
 #ifndef ARM_COMPUTE_CLMAXUNPOOLINGLAYER_H
 #define ARM_COMPUTE_CLMAXUNPOOLINGLAYER_H
 
+#include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/IFunction.h"
 
-#include "arm_compute/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
+#include <memory>
 
 namespace arm_compute
 {
-class ITensor;
+class CLCompileContext;
+class ICLTensor;
+class ITensorInfo;
+class CLMaxUnpoolingLayerKernel;
+class CLMemsetKernel;
+struct PoolingLayerInfo;
 
 /** Function to perform MaxUnpooling. This function calls the following OpenCL kernels:
  *
@@ -43,6 +48,12 @@ class CLMaxUnpoolingLayer : public IFunction
 public:
     /** Constructor */
     CLMaxUnpoolingLayer();
+    /** Prevent instances of this class from being copied */
+    CLMaxUnpoolingLayer(const CLMaxUnpoolingLayer &) = delete;
+    /** Prevent instances of this class from being copied */
+    CLMaxUnpoolingLayer &operator=(const CLMaxUnpoolingLayer &) = delete;
+    /** Default destructor */
+    ~CLMaxUnpoolingLayer();
     /** Set the input and output tensors.
      *
      * @note Output shape must be equal to the shape of the original input to pool.
@@ -88,8 +99,8 @@ class CLMaxUnpoolingLayer : public IFunction
     void run() override;
 
 private:
-    CLMemsetKernel            _memset_kernel;
-    CLMaxUnpoolingLayerKernel _unpooling_layer_kernel;
+    std::unique_ptr<CLMemsetKernel>            _memset_kernel;
+    std::unique_ptr<CLMaxUnpoolingLayerKernel> _unpooling_layer_kernel;
 };
 }
 #endif /* ARM_COMPUTE_CLMAXUNPOOLINGLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLMeanStdDev.h b/arm_compute/runtime/CL/functions/CLMeanStdDev.h
index be192a7c11..d9ced1393e 100644
--- a/arm_compute/runtime/CL/functions/CLMeanStdDev.h
+++ b/arm_compute/runtime/CL/functions/CLMeanStdDev.h
@@ -25,15 +25,20 @@
 #define ARM_COMPUTE_CLMEANSTDDEV_H
 
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
 #include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 
+#include <memory>
+
 namespace arm_compute
 {
+class CLCompileContext;
+class ICLTensor;
+class ITensorInfo;
+class CLFillBorderKernel;
+class CLMeanStdDevKernel;
 /** Basic function to execute mean and standard deviation by calling @ref CLMeanStdDevKernel */
 class CLMeanStdDev : public IFunction
 {
@@ -49,7 +54,7 @@ class CLMeanStdDev : public IFunction
     /** Allow instances of this class to be moved */
     CLMeanStdDev &operator=(CLMeanStdDev &&) = default;
     /** Default destructor */
-    ~CLMeanStdDev() = default;
+    ~CLMeanStdDev();
     /** Initialise the kernel's inputs and outputs.
      *
     * @param[in, out] input Input image. Data types supported: U8/F16/F32. (Written to only for border filling)
@@ -83,20 +88,20 @@ class CLMeanStdDev : public IFunction
     void run_float();
     void run_int();
 
-    MemoryGroup          _memory_group;               /**< Function's memory group */
-    DataType             _data_type;                  /**< Input data type. */
-    unsigned int         _num_pixels;                 /**< Number of image's pixels. */
-    bool                 _run_stddev;                 /**< Flag for knowing if we should run stddev reduction function. */
-    CLReductionOperation _reduction_operation_mean;   /**< Reduction operation function for computing mean value. */
-    CLReductionOperation _reduction_operation_stddev; /**< Reduction operation function for computing standard deviation. */
-    CLTensor             _reduction_output_mean;      /**< Reduction operation output tensor for mean value. */
-    CLTensor             _reduction_output_stddev;    /**< Reduction operation output tensor for standard deviation value. */
-    float               *_mean;                       /**< Pointer that holds the mean value. */
-    float               *_stddev;                     /**< Pointer that holds the standard deviation value. */
-    CLMeanStdDevKernel   _mean_stddev_kernel;         /**< Kernel that standard deviation calculation. */
-    CLFillBorderKernel   _fill_border_kernel;         /**< Kernel that fills the border with zeroes. */
-    cl::Buffer           _global_sum;                 /**< Variable that holds the global sum among calls in order to ease reduction */
-    cl::Buffer           _global_sum_squared;         /**< Variable that holds the global sum of squared values among calls in order to ease reduction */
+    MemoryGroup                          _memory_group;               /**< Function's memory group */
+    DataType                             _data_type;                  /**< Input data type. */
+    unsigned int                         _num_pixels;                 /**< Number of image's pixels. */
+    bool                                 _run_stddev;                 /**< Flag for knowing if we should run stddev reduction function. */
+    CLReductionOperation                 _reduction_operation_mean;   /**< Reduction operation function for computing mean value. */
+    CLReductionOperation                 _reduction_operation_stddev; /**< Reduction operation function for computing standard deviation. */
+    CLTensor                             _reduction_output_mean;      /**< Reduction operation output tensor for mean value. */
+    CLTensor                             _reduction_output_stddev;    /**< Reduction operation output tensor for standard deviation value. */
+    float                               *_mean;                       /**< Pointer that holds the mean value. */
+    float                               *_stddev;                     /**< Pointer that holds the standard deviation value. */
+    std::unique_ptr<CLMeanStdDevKernel>  _mean_stddev_kernel;         /**< Kernel that standard deviation calculation. */
+    std::unique_ptr<CLFillBorderKernel>  _fill_border_kernel;         /**< Kernel that fills the border with zeroes. */
+    cl::Buffer                           _global_sum;                 /**< Variable that holds the global sum among calls in order to ease reduction */
+    cl::Buffer                           _global_sum_squared;         /**< Variable that holds the global sum of squared values among calls in order to ease reduction */
 };
 }
 #endif /*ARM_COMPUTE_CLMEANSTDDEV_H */
diff --git a/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h
index 1627de1ae8..cfe59eac09 100644
--- a/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h
@@ -29,7 +29,9 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
+class ITensorInfo;
 
 /** Basic function to execute mean and standard deviation normalization by calling @ref CLMeanStdDevNormalizationKernel */
 class CLMeanStdDevNormalizationLayer : public ICLSimpleFunction
diff --git a/arm_compute/runtime/CL/functions/CLMedian3x3.h b/arm_compute/runtime/CL/functions/CLMedian3x3.h
index 7f67f958c1..1fe318e851 100644
--- a/arm_compute/runtime/CL/functions/CLMedian3x3.h
+++ b/arm_compute/runtime/CL/functions/CLMedian3x3.h
@@ -31,6 +31,7 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
 
 /** Basic function to execute median filter. This function calls the following OpenCL kernels:
@@ -38,6 +39,8 @@ class ICLTensor;
  * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
  * -# @ref CLMedian3x3Kernel
  *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
  */
 class CLMedian3x3 : public ICLSimpleFunction
 {
diff --git a/arm_compute/runtime/CL/functions/CLMinMaxLocation.h b/arm_compute/runtime/CL/functions/CLMinMaxLocation.h
index 04926f7bd0..77c381f64d 100644
--- a/arm_compute/runtime/CL/functions/CLMinMaxLocation.h
+++ b/arm_compute/runtime/CL/functions/CLMinMaxLocation.h
@@ -24,12 +24,16 @@
 #ifndef ARM_COMPUTE_CLMINMAXLOCATION_H
 #define ARM_COMPUTE_CLMINMAXLOCATION_H
 
-#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h"
 #include "arm_compute/runtime/CL/CLArray.h"
 #include "arm_compute/runtime/IFunction.h"
 
+#include <memory>
+
 namespace arm_compute
 {
+class CLCompileContext;
+class CLMinMaxKernel;
+class CLMinMaxLocationKernel;
 class ICLTensor;
 using ICLImage = ICLTensor;
 
@@ -37,6 +41,9 @@ using ICLImage = ICLTensor;
  *
  * -# @ref CLMinMaxKernel
  * -# @ref CLMinMaxLocationKernel
+ *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
  */
 class CLMinMaxLocation : public IFunction
 {
@@ -51,6 +58,8 @@ class CLMinMaxLocation : public IFunction
     CLMinMaxLocation(CLMinMaxLocation &&) = default;
     /** Allow instances of this class to be moved */
     CLMinMaxLocation &operator=(CLMinMaxLocation &&) = default;
+    /** Default destructor */
+    ~CLMinMaxLocation();
     /** Initialise the kernel's inputs and outputs.
      *
      * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size.
@@ -87,16 +96,16 @@ class CLMinMaxLocation : public IFunction
     void run() override;
 
 private:
-    CLMinMaxKernel         _min_max_kernel;     /**< Kernel that performs min/max */
-    CLMinMaxLocationKernel _min_max_loc_kernel; /**< Kernel that counts min/max occurrences and identifies their positions */
-    cl::Buffer             _min_max_vals;       /**< Buffer to collect min, max values */
-    cl::Buffer             _min_max_count_vals; /**< Buffer to collect min, max values */
-    void                  *_min;                /**< Minimum value. */
-    void                  *_max;                /**< Maximum value. */
-    uint32_t              *_min_count;          /**< Minimum value occurrences. */
-    uint32_t              *_max_count;          /**< Maximum value occurrences. */
-    CLCoordinates2DArray  *_min_loc;            /**< Minimum value occurrences coordinates. */
-    CLCoordinates2DArray  *_max_loc;            /**< Maximum value occurrences coordinates. */
+    std::unique_ptr<CLMinMaxKernel>         _min_max_kernel;     /**< Kernel that performs min/max */
+    std::unique_ptr<CLMinMaxLocationKernel> _min_max_loc_kernel; /**< Kernel that counts min/max occurrences and identifies their positions */
+    cl::Buffer                              _min_max_vals;       /**< Buffer to collect min, max values */
+    cl::Buffer                              _min_max_count_vals; /**< Buffer to collect min, max values */
+    void                                   *_min;                /**< Minimum value. */
+    void                                   *_max;                /**< Maximum value. */
+    uint32_t                               *_min_count;          /**< Minimum value occurrences. */
+    uint32_t                               *_max_count;          /**< Maximum value occurrences. */
+    CLCoordinates2DArray                   *_min_loc;            /**< Minimum value occurrences coordinates. */
+    CLCoordinates2DArray                   *_max_loc;            /**< Maximum value occurrences coordinates. */
 };
 }
 #endif /*ARM_COMPUTE_CLMINMAXLOCATION_H */
diff --git a/arm_compute/runtime/CL/functions/CLNonLinearFilter.h b/arm_compute/runtime/CL/functions/CLNonLinearFilter.h
index 8b7e350e09..3d0947db05 100644
--- a/arm_compute/runtime/CL/functions/CLNonLinearFilter.h
+++ b/arm_compute/runtime/CL/functions/CLNonLinearFilter.h
@@ -31,6 +31,7 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;

 /** Basic function to execute non linear filter. This function calls the following OpenCL kernels:
@@ -39,6 +40,9 @@ class ICLTensor;
  * -# @ref CLNonLinearFilterKernel
  *
  * @note Supported mask dimensions squares of sizes 3, 5
+ *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
  */
 class CLNonLinearFilter : public ICLSimpleFunction
 {
diff --git a/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h b/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h
index 556de1c64c..60dad42814 100644
--- a/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h
+++ b/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h
@@ -29,12 +29,16 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;

 /** Basic function to execute non-maxima suppression over a 3x3 window. This function calls the following CL kernels:
  *
  * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
  * -# @ref CLNonMaximaSuppression3x3Kernel
+ *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
  */
 class CLNonMaximaSuppression3x3 : public ICLSimpleFunction
 {
diff --git a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
index a2d46b368f..389b21e5c8 100644
--- a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
@@ -24,18 +24,19 @@
 #ifndef ARM_COMPUTE_CLNORMALIZATIONLAYER_H
 #define ARM_COMPUTE_CLNORMALIZATIONLAYER_H

-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
+#include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"

-#include "arm_compute/core/Types.h"
+#include <memory>

 namespace arm_compute
 {
+class CLCompileContext;
+class CLFillBorderKernel;
+class CLNormalizationLayerKernel;
 class ICLTensor;
+class ITensorInfo;

 /** Basic function to compute a normalization layer. This function calls the following CL kernels:
  *
@@ -48,6 +49,16 @@ class CLNormalizationLayer : public IFunction
 public:
     /** Default constructor */
     CLNormalizationLayer();
+    /** Prevent instances of this class from being copied */
+    CLNormalizationLayer(const CLNormalizationLayer &) = delete;
+    /** Prevent instances of this class from being copied */
+    CLNormalizationLayer &operator=(const CLNormalizationLayer &) = delete;
+    /** Prevent instances of this class from being moved */
+    CLNormalizationLayer(CLNormalizationLayer &&) = delete;
+    /** Prevent instances of this class from being moved */
+    CLNormalizationLayer &operator=(CLNormalizationLayer &&) = delete;
+    /** Default destructor */
+    ~CLNormalizationLayer();
     /** Set the input and output tensors.
      *
      * @param[in, out] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
@@ -85,8 +96,8 @@ class CLNormalizationLayer : public IFunction
     void run() override;

 private:
-    CLNormalizationLayerKernel _norm_kernel;    /**< Normalization layer kernel to run */
-    CLFillBorderKernel         _border_handler; /**< Kernel to handle borders */
+    std::unique_ptr<CLNormalizationLayerKernel> _norm_kernel;    /**< Normalization layer kernel to run */
+    std::unique_ptr<CLFillBorderKernel>         _border_handler; /**< Kernel to handle borders */
 };
 }
 #endif /* ARM_COMPUTE_CLNORMALIZATIONLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h b/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h
index cf4a9b6497..de5155c65a 100644
--- a/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h
+++ b/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h
@@ -31,7 +31,9 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
+class ITensorInfo;

 /** Basic function to run @ref CLNormalizePlanarYUVLayerKernel
  *
diff --git a/arm_compute/runtime/CL/functions/CLOpticalFlow.h b/arm_compute/runtime/CL/functions/CLOpticalFlow.h
index adce6748c8..5c555f5709 100644
--- a/arm_compute/runtime/CL/functions/CLOpticalFlow.h
+++ b/arm_compute/runtime/CL/functions/CLOpticalFlow.h
@@ -24,8 +24,6 @@
 #ifndef ARM_COMPUTE_CLOPTICALFLOW_H
 #define ARM_COMPUTE_CLOPTICALFLOW_H

-#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
-
 #include "arm_compute/core/IArray.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLArray.h"
@@ -41,7 +39,12 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class CLPyramid;
+class CLLKTrackerInitKernel;
+class CLLKTrackerStage0Kernel;
+class CLLKTrackerStage1Kernel;
+class CLLKTrackerFinalizeKernel;

 /** OpenCL Array of Internal Keypoints */
 using CLLKInternalKeypointArray = CLArray<CLLKInternalKeypoint>;
@@ -57,6 +60,9 @@ using CLOldValueArray = CLArray<CLOldValue>;
  * -# @ref CLLKTrackerStage0Kernel
  * -# @ref CLLKTrackerStage1Kernel
  * -# @ref CLLKTrackerFinalizeKernel
+ *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
  */
 class CLOpticalFlow : public IFunction
 {
@@ -71,6 +77,8 @@ class CLOpticalFlow : public IFunction
     CLOpticalFlow(CLOpticalFlow &&) = default;
     /** Allow instances of this class to be moved */
     CLOpticalFlow &operator=(CLOpticalFlow &&) = default;
+    /** Default destructor */
+    ~CLOpticalFlow();
     /** Initialise the function input and output
      *
      * @param[in]  old_pyramid           Pointer to the pyramid for the old tensor. Data types supported U8
@@ -117,22 +125,22 @@ class CLOpticalFlow : public IFunction
     void run() override;

 private:
-    MemoryGroup                                _memory_group;
-    std::vector<CLLKTrackerInitKernel>         _tracker_init_kernel;
-    std::vector<CLLKTrackerStage0Kernel>       _tracker_stage0_kernel;
-    std::vector<CLLKTrackerStage1Kernel>       _tracker_stage1_kernel;
-    CLLKTrackerFinalizeKernel                  _tracker_finalize_kernel;
-    std::vector<CLScharr3x3>                   _func_scharr;
-    std::vector<CLTensor>                      _scharr_gx;
-    std::vector<CLTensor>                      _scharr_gy;
-    const ICLKeyPointArray                    *_old_points;
-    const ICLKeyPointArray                    *_new_points_estimates;
-    ICLKeyPointArray                          *_new_points;
-    std::unique_ptr<CLLKInternalKeypointArray> _old_points_internal;
-    std::unique_ptr<CLLKInternalKeypointArray> _new_points_internal;
-    std::unique_ptr<CLCoefficientTableArray>   _coefficient_table;
-    std::unique_ptr<CLOldValueArray>           _old_values;
-    size_t                                     _num_levels;
+    MemoryGroup                                           _memory_group;
+    std::vector<std::unique_ptr<CLLKTrackerInitKernel>>   _tracker_init_kernel;
+    std::vector<std::unique_ptr<CLLKTrackerStage0Kernel>> _tracker_stage0_kernel;
+    std::vector<std::unique_ptr<CLLKTrackerStage1Kernel>> _tracker_stage1_kernel;
+    std::unique_ptr<CLLKTrackerFinalizeKernel>            _tracker_finalize_kernel;
+    std::vector<CLScharr3x3>                              _func_scharr;
+    std::vector<CLTensor>                                 _scharr_gx;
+    std::vector<CLTensor>                                 _scharr_gy;
+    const ICLKeyPointArray                               *_old_points;
+    const ICLKeyPointArray                               *_new_points_estimates;
+    ICLKeyPointArray                                     *_new_points;
+    std::unique_ptr<CLLKInternalKeypointArray>            _old_points_internal;
+    std::unique_ptr<CLLKInternalKeypointArray>            _new_points_internal;
+    std::unique_ptr<CLCoefficientTableArray>              _coefficient_table;
+    std::unique_ptr<CLOldValueArray>                      _old_values;
+    size_t                                                _num_levels;
 };
 }
 #endif /*ARM_COMPUTE_CLOPTICALFLOW_H */
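The CLOpticalFlow members above switch from vectors of kernels to vectors of std::unique_ptr, one entry per pyramid level. A self-contained sketch of that ownership layout, with a stand-in Kernel type rather than the real CLLKTrackerInitKernel API:

#include <cstddef>
#include <memory>
#include <vector>

// Stand-in for a per-level tracker kernel (illustration only).
struct Kernel
{
    void configure(std::size_t level) { (void)level; }
};

int main()
{
    const std::size_t num_levels = 4; // pyramid depth

    // Mirrors std::vector<std::unique_ptr<CLLKTrackerInitKernel>>: the vector
    // is resized per configuration, and the kernels need not be copyable.
    std::vector<std::unique_ptr<Kernel>> init_kernels;
    init_kernels.reserve(num_levels);
    for(std::size_t level = 0; level < num_levels; ++level)
    {
        init_kernels.emplace_back(std::make_unique<Kernel>());
        init_kernels[level]->configure(level);
    }
}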
diff --git a/arm_compute/runtime/CL/functions/CLPReluLayer.h b/arm_compute/runtime/CL/functions/CLPReluLayer.h
index 84743508df..ab32bccc24 100644
--- a/arm_compute/runtime/CL/functions/CLPReluLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPReluLayer.h
@@ -24,13 +24,14 @@
 #ifndef ARM_COMPUTE_CLPRELULAYER_H
 #define ARM_COMPUTE_CLPRELULAYER_H

-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
 #include "arm_compute/runtime/CL/ICLOperator.h"
 #include "arm_compute/runtime/IFunction.h"

 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
+class ITensorInfo;

 namespace experimental
 {
@@ -65,9 +66,6 @@ class CLPReluLayer : public ICLOperator

     // Inherited methods overridden:
     void run(ITensorPack &tensors) override;
-
-private:
-    CLFillBorderKernel _border_handler;
 };
 } // namespace experimental

diff --git a/arm_compute/runtime/CL/functions/CLPadLayer.h b/arm_compute/runtime/CL/functions/CLPadLayer.h
index e3a923f81c..2bbde30fc2 100644
--- a/arm_compute/runtime/CL/functions/CLPadLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPadLayer.h
@@ -24,13 +24,15 @@
 #ifndef ARM_COMPUTE_CLPADLAYER_H
 #define ARM_COMPUTE_CLPADLAYER_H

-#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
-#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
+#include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/IFunction.h"

 namespace arm_compute
 {
+class CLCompileContext;
+class CLPadLayerKernel;
+class CLCopyKernel;
 class ICLTensor;

 /** Basic function to pad a tensor. This function calls the following OpenCL functions/kernels:
@@ -51,6 +53,8 @@ class CLPadLayer : public IFunction
     CLPadLayer &operator=(const CLPadLayer &) = delete;
     /** Default move assignment operator */
     CLPadLayer &operator=(CLPadLayer &&) = default;
+    /** Default destructor */
+    ~CLPadLayer();
     /** Initialize the function
      *
@@ -95,9 +99,9 @@ class CLPadLayer : public IFunction
 private:
     void configure_reflect_mode(ICLTensor *input, ICLTensor *output);

-    CLPadLayerKernel _pad_kernel;
-    CLCopyKernel     _copy_kernel;
-    bool             _perform_pad;
+    std::unique_ptr<CLPadLayerKernel> _pad_kernel;
+    std::unique_ptr<CLCopyKernel>     _copy_kernel;
+    bool                              _perform_pad;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_PADLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLPermute.h b/arm_compute/runtime/CL/functions/CLPermute.h
index abc23eff0c..50e81da7c4 100644
--- a/arm_compute/runtime/CL/functions/CLPermute.h
+++ b/arm_compute/runtime/CL/functions/CLPermute.h
@@ -31,7 +31,9 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
+class ITensorInfo;

 /** Basic function to execute an @ref CLPermuteKernel. */
 class CLPermute : public ICLSimpleFunction
diff --git a/arm_compute/runtime/CL/functions/CLPhase.h b/arm_compute/runtime/CL/functions/CLPhase.h
index 2731a08a52..7c76c234fe 100644
--- a/arm_compute/runtime/CL/functions/CLPhase.h
+++ b/arm_compute/runtime/CL/functions/CLPhase.h
@@ -29,9 +29,14 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;

-/** Basic function to execute an @ref CLMagnitudePhaseKernel. */
+/** Basic function to execute an @ref CLMagnitudePhaseKernel.
+ *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
+*/
 class CLPhase : public ICLSimpleFunction
 {
 public:
diff --git a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
index 2066012306..6432cd040d 100644
--- a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
+++ b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
@@ -24,14 +24,16 @@
 #ifndef ARM_COMPUTE_CLPIXELWISEMULTIPLICATION_H
 #define ARM_COMPUTE_CLPIXELWISEMULTIPLICATION_H

-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
 #include "arm_compute/runtime/CL/ICLOperator.h"
 #include "arm_compute/runtime/IFunction.h"

 namespace arm_compute
 {
 // Forward declaration
+class CLCompileContext;
+class CLFillBorderKernel;
 class ICLTensor;
+class ITensorInfo;

 namespace experimental
 {
@@ -106,7 +108,7 @@ class CLPixelWiseMultiplication : public ICLOperator
     void run(ITensorPack &tensors) override;

 private:
-    CLFillBorderKernel _border_handler;
+    std::unique_ptr<CLFillBorderKernel> _border_handler;
 };

 /** Basic function to run @ref CLComplexPixelWiseMultiplicationKernel. */
@@ -139,7 +141,7 @@ class CLComplexPixelWiseMultiplication : public ICLOperator
     void run(ITensorPack &tensors) override;

 private:
-    CLFillBorderKernel _border_handler;
+    std::unique_ptr<CLFillBorderKernel> _border_handler;
 };
 } // namespace experimental

diff --git a/arm_compute/runtime/CL/functions/CLPoolingLayer.h b/arm_compute/runtime/CL/functions/CLPoolingLayer.h
index 96dacf9322..ef1f426c22 100644
--- a/arm_compute/runtime/CL/functions/CLPoolingLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPoolingLayer.h
@@ -31,7 +31,9 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
+class ITensorInfo;

 /** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenCL kernels:
  *
diff --git a/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h b/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h
index 9a78e77307..9129bfd064 100644
--- a/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h
@@ -24,13 +24,16 @@
 #ifndef ARM_COMPUTE_CLPRIORBOXLAYER_H
 #define ARM_COMPUTE_CLPRIORBOXLAYER_H

-#include "arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h"
+#include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"

 namespace arm_compute
 {
+class CLCompileContext;
+class CLPriorBoxLayerKernel;
 class ICLTensor;
+class ITensorInfo;

 /** Basic function to run @ref CLPriorBoxLayerKernel. */
 class CLPriorBoxLayer : public ICLSimpleFunction
diff --git a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
index 53f337bc61..a8f9221b3d 100644
--- a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
+++ b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
@@ -24,9 +24,6 @@
 #ifndef ARM_COMPUTE_CLQLSTMLAYER_H
 #define ARM_COMPUTE_CLQLSTMLAYER_H

-#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
-#include "arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
 #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
@@ -40,7 +37,12 @@
 namespace arm_compute
 {
 // Forward declarations
+class CLCompileContext;
+class CLCopyKernel;
 class ICLTensor;
+class CLGEMMLowpMatrixAReductionKernel;
+class CLQLSTMLayerNormalizationKernel;
+class ITensorInfo;

 /** Basic function to run @ref CLQLSTMLayer
  *
@@ -68,6 +70,8 @@ class CLQLSTMLayer : public IFunction
     CLQLSTMLayer &operator=(const CLQLSTMLayer &) = delete;
     /** Default move assignment operator */
     CLQLSTMLayer &operator=(CLQLSTMLayer &&) = default;
+    /** Default destructor */
+    ~CLQLSTMLayer();
     /** Initialize function's tensors.
      *
      * @param[in]  input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8_SIGNED.
@@ -113,7 +117,7 @@ class CLQLSTMLayer : public IFunction
                    const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
                    const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
                    const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
-                   ICLTensor *cell_state_in, const ICLTensor *output_state_in,
+                   ICLTensor *cell_state_in, ICLTensor *output_state_in,
                    ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
                    const LSTMParams<ICLTensor> &lstm_params);

@@ -163,7 +167,7 @@ class CLQLSTMLayer : public IFunction
                    const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
                    const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
                    const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
-                   ICLTensor *cell_state_in, const ICLTensor *output_state_in,
+                   ICLTensor *cell_state_in, ICLTensor *output_state_in,
                    ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
                    const LSTMParams<ICLTensor> &lstm_params);

@@ -285,72 +289,72 @@ class CLQLSTMLayer : public IFunction
     };

     // Functions used
-    CLTranspose                      _transpose_input_to_forget_weights{};
-    CLTranspose                      _transpose_input_to_cell_weights{};
-    CLTranspose                      _transpose_input_to_output_weights{};
-    CLTranspose                      _transpose_input_to_input_weights{};
-    CLTranspose                      _transpose_recurrent_to_forget_weights{};
-    CLTranspose                      _transpose_recurrent_to_cell_weights{};
-    CLTranspose                      _transpose_recurrent_to_output_weights{};
-    CLTranspose                      _transpose_recurrent_to_input_weights{};
-    CLTranspose                      _transpose_projection_weights{};
-    CLGEMMLowpMatrixAReductionKernel _input_to_input_reduction{};
-    CLGEMMLowpMatrixAReductionKernel _recurrent_to_input_reduction{};
-    CLGEMMLowpMatrixAReductionKernel _input_to_forget_reduction{};
-    CLGEMMLowpMatrixAReductionKernel _recurrent_to_forget_reduction{};
-    CLGEMMLowpMatrixAReductionKernel _input_to_cell_reduction{};
-    CLGEMMLowpMatrixAReductionKernel _recurrent_to_cell_reduction{};
-    CLGEMMLowpMatrixAReductionKernel _input_to_output_reduction{};
-    CLGEMMLowpMatrixAReductionKernel _recurrent_to_output_reduction{};
-    CLGEMMLowpMatrixAReductionKernel _projection_reduction{};
-    CLArithmeticAddition             _projection_bias_add{};
-    CLGEMMLowpMatrixMultiplyCore     _mm_input_to_forget{};
-    CLGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_forget{};
-    CLPixelWiseMultiplication        _pixelwise_mul_cell_to_forget{};
-    CLGEMMLowpOutputStage            _input_to_forget_outstage{};
-    CLGEMMLowpOutputStage            _recurrent_to_forget_outstage{};
-    CLGEMMLowpOutputStage            _cell_to_forget_outstage{};
-    CLArithmeticAddition             _accumulate_input_recurrent_forget{};
-    CLArithmeticAddition             _accumulate_cell_forget{};
-    CLActivationLayer                _forget_gate_sigmoid{};
-    CLGEMMLowpMatrixMultiplyCore     _mm_input_to_cell{};
-    CLGEMMLowpOutputStage            _input_to_cell_outstage{};
-    CLGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_cell{};
-    CLGEMMLowpOutputStage            _recurrent_to_cell_outstage{};
-    CLArithmeticAddition             _accumulate_input_recurrent_modulation{};
-    CLActivationLayer                _cell_gate_tanh{};
-    CLArithmeticSubtraction          _input_gate_sub{};
-    CLGEMMLowpMatrixMultiplyCore     _mm_input_to_input{};
-    CLGEMMLowpOutputStage            _input_to_input_outstage{};
-    CLGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_input{};
-    CLGEMMLowpOutputStage            _recurrent_to_input_outstage{};
-    CLArithmeticAddition             _accumulate_input_recurrent_input{};
-    CLPixelWiseMultiplication        _pixelwise_mul_cell_to_input{};
-    CLGEMMLowpOutputStage            _cell_to_input_outstage{};
-    CLArithmeticAddition             _accumulate_cell_input{};
-    CLActivationLayer                _input_gate_sigmoid{};
-    CLPixelWiseMultiplication        _pixelwise_mul_forget_cell{};
-    CLPixelWiseMultiplication        _pixelwise_mul_input_cell{};
-    CLArithmeticAddition             _add_forget_cell{};
-    CLActivationLayer                _cell_clip{};
-    CLGEMMLowpMatrixMultiplyCore     _mm_input_to_output{};
-    CLGEMMLowpOutputStage            _input_to_output_outstage{};
-    CLGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_output{};
-    CLGEMMLowpOutputStage            _recurrent_to_output_outstage{};
-    CLArithmeticAddition             _accumulate_input_recurrent_output{};
-    CLPixelWiseMultiplication        _pixelwise_mul_cell_to_output{};
-    CLGEMMLowpOutputStage            _cell_to_output_outstage{};
-    CLArithmeticAddition             _accumulate_cell_to_output{};
-    CLActivationLayer                _output_gate_sigmoid{};
-    CLActivationLayer                _hidden_tanh{};
-    CLPixelWiseMultiplication        _pixelwise_mul_hidden{};
-    CLGEMMLowpOutputStage            _hidden_outstage{};
-    CLGEMMLowpMatrixMultiplyCore     _mm_projection{};
-    CLGEMMLowpOutputStage            _projection_outstage{};
-    CLArithmeticAddition             _accumulate_projection{};
-    CLActivationLayer                _projection_clip{};
-    std::array<CLQLSTMLayerNormalizationKernel, _layer_norm_count> _layer_norms{ {} };
-    CLCopyKernel                     _copy_output{};
+    CLTranspose                                       _transpose_input_to_forget_weights{};
+    CLTranspose                                       _transpose_input_to_cell_weights{};
+    CLTranspose                                       _transpose_input_to_output_weights{};
+    CLTranspose                                       _transpose_input_to_input_weights{};
+    CLTranspose                                       _transpose_recurrent_to_forget_weights{};
+    CLTranspose                                       _transpose_recurrent_to_cell_weights{};
+    CLTranspose                                       _transpose_recurrent_to_output_weights{};
+    CLTranspose                                       _transpose_recurrent_to_input_weights{};
+    CLTranspose                                       _transpose_projection_weights{};
+    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_input_reduction;
+    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_input_reduction;
+    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_forget_reduction;
+    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_forget_reduction;
+    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_cell_reduction;
+    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_cell_reduction;
+    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _input_to_output_reduction;
+    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _recurrent_to_output_reduction;
+    std::unique_ptr<CLGEMMLowpMatrixAReductionKernel> _projection_reduction;
+    CLArithmeticAddition                              _projection_bias_add{};
+    CLGEMMLowpMatrixMultiplyCore                      _mm_input_to_forget{};
+    CLGEMMLowpMatrixMultiplyCore                      _mm_recurrent_to_forget{};
+    CLPixelWiseMultiplication                         _pixelwise_mul_cell_to_forget{};
+    CLGEMMLowpOutputStage                             _input_to_forget_outstage{};
+    CLGEMMLowpOutputStage                             _recurrent_to_forget_outstage{};
+    CLGEMMLowpOutputStage                             _cell_to_forget_outstage{};
+    CLArithmeticAddition                              _accumulate_input_recurrent_forget{};
+    CLArithmeticAddition                              _accumulate_cell_forget{};
+    CLActivationLayer                                 _forget_gate_sigmoid{};
+    CLGEMMLowpMatrixMultiplyCore                      _mm_input_to_cell{};
+    CLGEMMLowpOutputStage                             _input_to_cell_outstage{};
+    CLGEMMLowpMatrixMultiplyCore                      _mm_recurrent_to_cell{};
+    CLGEMMLowpOutputStage                             _recurrent_to_cell_outstage{};
+    CLArithmeticAddition                              _accumulate_input_recurrent_modulation{};
+    CLActivationLayer                                 _cell_gate_tanh{};
+    CLArithmeticSubtraction                           _input_gate_sub{};
+    CLGEMMLowpMatrixMultiplyCore                      _mm_input_to_input{};
+    CLGEMMLowpOutputStage                             _input_to_input_outstage{};
+    CLGEMMLowpMatrixMultiplyCore                      _mm_recurrent_to_input{};
+    CLGEMMLowpOutputStage                             _recurrent_to_input_outstage{};
+    CLArithmeticAddition                              _accumulate_input_recurrent_input{};
+    CLPixelWiseMultiplication                         _pixelwise_mul_cell_to_input{};
+    CLGEMMLowpOutputStage                             _cell_to_input_outstage{};
+    CLArithmeticAddition                              _accumulate_cell_input{};
+    CLActivationLayer                                 _input_gate_sigmoid{};
+    CLPixelWiseMultiplication                         _pixelwise_mul_forget_cell{};
+    CLPixelWiseMultiplication                         _pixelwise_mul_input_cell{};
+    CLArithmeticAddition                              _add_forget_cell{};
+    CLActivationLayer                                 _cell_clip{};
+    CLGEMMLowpMatrixMultiplyCore                      _mm_input_to_output{};
+    CLGEMMLowpOutputStage                             _input_to_output_outstage{};
+    CLGEMMLowpMatrixMultiplyCore                      _mm_recurrent_to_output{};
+    CLGEMMLowpOutputStage                             _recurrent_to_output_outstage{};
+    CLArithmeticAddition                              _accumulate_input_recurrent_output{};
+    CLPixelWiseMultiplication                         _pixelwise_mul_cell_to_output{};
+    CLGEMMLowpOutputStage                             _cell_to_output_outstage{};
+    CLArithmeticAddition                              _accumulate_cell_to_output{};
+    CLActivationLayer                                 _output_gate_sigmoid{};
+    CLActivationLayer                                 _hidden_tanh{};
+    CLPixelWiseMultiplication                         _pixelwise_mul_hidden{};
+    CLGEMMLowpOutputStage                             _hidden_outstage{};
+    CLGEMMLowpMatrixMultiplyCore                      _mm_projection{};
+    CLGEMMLowpOutputStage                             _projection_outstage{};
+    CLArithmeticAddition                              _accumulate_projection{};
+    CLActivationLayer                                 _projection_clip{};
+    std::array<std::unique_ptr<CLQLSTMLayerNormalizationKernel>, _layer_norm_count> _layer_norms;
+    std::unique_ptr<CLCopyKernel>                     _copy_output;

     TensorCopyKernel _projection_bias_copy{};
     TensorCopyKernel _projection_output_to_accumulate_copy{};
@@ -402,30 +406,11 @@ class CLQLSTMLayer : public IFunction

     inline CLQLSTMLayerNormalizationKernel &get_layer_norm(LayerNormGate g)
     {
-        return _layer_norms[getGateIndex(g)];
+        return *_layer_norms[getGateIndex(g)];
     }

-    inline void configure_layer_norm(LayerNormGate g, const ICLTensor *in)
-    {
-        ARM_COMPUTE_ERROR_ON(!_has_layer_norm);
-
-        CLTensor *out = &get_layer_norm_output(g);
-        _memory_group.manage(out);
-        out->allocator()->init(*(in->info()));
-
-        get_layer_norm(g).configure(in, out, get_layer_norm_weight(g), get_layer_norm_bias(g));
-    }
-
-    inline static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias)
-    {
-        // Output quantization scale will be different, but ignored here
-        // since it will be configured at configure() stage.
-        const TensorInfo out
-        {
-            in
-        };
-        return CLQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias);
-    }
+    inline void configure_layer_norm(LayerNormGate g, const ICLTensor *in);
+    inline static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias);

     // Temporary tensors
     CLTensor _input_to_forget_weights_transposed{ nullptr };
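With CLQLSTMLayerNormalizationKernel now only forward-declared, configure_layer_norm and validate_layer_norm cannot keep their inline bodies in the header; their definitions move to CLQLSTMLayer.cpp. A plausible .cpp-side form, reconstructed almost verbatim from the inline bodies removed above (the actual source may differ in detail):

void CLQLSTMLayer::configure_layer_norm(LayerNormGate g, const ICLTensor *in)
{
    ARM_COMPUTE_ERROR_ON(!_has_layer_norm);

    CLTensor *out = &get_layer_norm_output(g);
    _memory_group.manage(out);
    out->allocator()->init(*(in->info()));

    get_layer_norm(g).configure(in, out, get_layer_norm_weight(g), get_layer_norm_bias(g));
}

Status CLQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias)
{
    // Output quantization scale will be different, but ignored here
    // since it will be configured at configure() stage.
    const TensorInfo out{ in };
    return CLQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias);
}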
diff --git a/arm_compute/runtime/CL/functions/CLQuantizationLayer.h b/arm_compute/runtime/CL/functions/CLQuantizationLayer.h
index e045adf5fd..a0a27c5cb4 100644
--- a/arm_compute/runtime/CL/functions/CLQuantizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLQuantizationLayer.h
@@ -24,11 +24,14 @@
 #ifndef ARM_COMPUTE_CLQUANTIZATIONLAYER_H
 #define ARM_COMPUTE_CLQUANTIZATIONLAYER_H

+#include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"

 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
+class ITensorInfo;

 /** Basic function to simulate a quantization layer. This function calls the following CL kernels:
  *
diff --git a/arm_compute/runtime/CL/functions/CLRNNLayer.h b/arm_compute/runtime/CL/functions/CLRNNLayer.h
index 9d1cb1a724..ff3fb5449b 100644
--- a/arm_compute/runtime/CL/functions/CLRNNLayer.h
+++ b/arm_compute/runtime/CL/functions/CLRNNLayer.h
@@ -24,15 +24,17 @@
 #ifndef ARM_COMPUTE_CLRNN_LAYER_H
 #define ARM_COMPUTE_CLRNN_LAYER_H

-#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
 #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
 #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
 #include "arm_compute/runtime/CL/functions/CLGEMM.h"

+#include <memory>
+
 namespace arm_compute
 {
+class CLCopyKernel;
 class ICLTensor;

 /** Basic function to run @ref CLRNNLayer */
@@ -41,6 +43,12 @@ class CLRNNLayer : public IFunction
 public:
     /** Default constructor */
     CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied */
+    CLRNNLayer(const CLRNNLayer &) = delete;
+    /** Prevent instances of this class from being copied */
+    CLRNNLayer &operator=(const CLRNNLayer &) = delete;
+    /** Default destructor */
+    ~CLRNNLayer();
     /** Initialize the function
      *
     * @param[in]     input             Input is a 2-D tensor of shape [input_size, batch_size]. Data types supported: F16/F32
@@ -85,16 +93,16 @@ class CLRNNLayer : public IFunction
     void prepare() override;

 private:
-    MemoryGroup           _memory_group;
-    CLGEMM                _gemm_state_f;
-    CLArithmeticAddition  _add_kernel;
-    CLActivationLayer     _activation;
-    CLFullyConnectedLayer _fully_connected_kernel;
-    CLCopyKernel          _copy_kernel;
-    CLTensor              _fully_connected_out;
-    CLTensor              _gemm_output;
-    CLTensor              _add_output;
-    bool                  _is_prepared;
+    MemoryGroup                   _memory_group;
+    CLGEMM                        _gemm_state_f;
+    CLArithmeticAddition          _add_kernel;
+    CLActivationLayer             _activation;
+    CLFullyConnectedLayer         _fully_connected_kernel;
+    std::unique_ptr<CLCopyKernel> _copy_kernel;
+    CLTensor                      _fully_connected_out;
+    CLTensor                      _gemm_output;
+    CLTensor                      _add_output;
+    bool                          _is_prepared;
 };
 }
 #endif /* ARM_COMPUTE_CLRNN_LAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLROIAlignLayer.h b/arm_compute/runtime/CL/functions/CLROIAlignLayer.h
index 2e78f16d6b..b4cd5560ef 100644
--- a/arm_compute/runtime/CL/functions/CLROIAlignLayer.h
+++ b/arm_compute/runtime/CL/functions/CLROIAlignLayer.h
@@ -25,12 +25,14 @@
 #define ARM_COMPUTE_CLROIALIGNLAYER_H

 #include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"

 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
+class ROIPoolingLayerInfo;
+class ITensorInfo;

 /** Basic function to run @ref CLROIAlignLayerKernel.
  *
diff --git a/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h b/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h
index 30139274be..836575ef68 100644
--- a/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h
+++ b/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h
@@ -24,14 +24,14 @@
 #ifndef ARM_COMPUTE_CLROIPOOLINGLAYER_H
 #define ARM_COMPUTE_CLROIPOOLINGLAYER_H

-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
 #include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"

 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
+class ROIPoolingLayerInfo;

 /** Basic function to run @ref CLROIPoolingLayerKernel.
  *
diff --git a/arm_compute/runtime/CL/functions/CLRange.h b/arm_compute/runtime/CL/functions/CLRange.h
index a86cfb605d..e11e740861 100644
--- a/arm_compute/runtime/CL/functions/CLRange.h
+++ b/arm_compute/runtime/CL/functions/CLRange.h
@@ -29,7 +29,9 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
+class ITensorInfo;

 /** Basic function to run @ref CLRangeKernel
  *
diff --git a/arm_compute/runtime/CL/functions/CLReduceMean.h b/arm_compute/runtime/CL/functions/CLReduceMean.h
index 88ead9d2ea..c37ee8c5ab 100644
--- a/arm_compute/runtime/CL/functions/CLReduceMean.h
+++ b/arm_compute/runtime/CL/functions/CLReduceMean.h
@@ -25,7 +25,9 @@
 #define ARM_COMPUTE_CL_REDUCE_MEAN_H

 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
 #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
+#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
 #include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
 #include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
 #include "arm_compute/runtime/IMemoryManager.h"
@@ -82,8 +84,13 @@ class CLReduceMean : public IFunction
     std::vector<CLReductionOperation> _reduction_kernels;
     std::vector<CLTensor>             _reduced_outs;
     CLReshapeLayer                    _reshape;
+    CLDequantizationLayer             _dequant;
+    CLQuantizationLayer               _requant;
     int                               _reduction_ops;
     bool                              _keep_dims;
+    bool                              _do_requant;
+    CLTensor                          _input_no_quant;
+    CLTensor                          _output_no_quant;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CL_REDUCE_MEAN_H */
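The new _dequant/_requant functions and the _input_no_quant/_output_no_quant tensors show how CLReduceMean now handles quantized inputs: dequantize, average in float, requantize. A hypothetical caller, with the shape and quantization parameters invented for illustration:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLReduceMean.h"

using namespace arm_compute;

// Mean over the two spatial axes of a QASYMM8 tensor; the dequantize ->
// reduce -> requantize round trip happens inside the function.
void reduce_mean_example()
{
    CLScheduler::get().default_init();

    CLTensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(16U, 16U, 3U), 1, DataType::QASYMM8,
                                       QuantizationInfo(1.f / 255.f, 0)));

    const Coordinates axes(0, 1); // reduce width and height
    CLReduceMean      reduce_mean;
    reduce_mean.configure(&input, axes, /*keep_dims=*/true, &output);

    input.allocator()->allocate();
    output.allocator()->allocate();
    reduce_mean.run();
}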
diff --git a/arm_compute/runtime/CL/functions/CLReductionOperation.h b/arm_compute/runtime/CL/functions/CLReductionOperation.h
index 5d050d71d6..3fbcee6c21 100644
--- a/arm_compute/runtime/CL/functions/CLReductionOperation.h
+++ b/arm_compute/runtime/CL/functions/CLReductionOperation.h
@@ -24,8 +24,6 @@
 #ifndef ARM_COMPUTE_CLREDUCTIONOPERATION_H
 #define ARM_COMPUTE_CLREDUCTIONOPERATION_H

-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
 #include "arm_compute/runtime/IFunction.h"
@@ -37,6 +35,9 @@
 namespace arm_compute
 {
 // Forward declarations
+class CLCompileContext;
+class CLFillBorderKernel;
+class CLReductionOperationKernel;
 class ICLTensor;

 /** Perform reduction operation.
@@ -49,6 +50,16 @@ class CLReductionOperation : public IFunction
      * @param[in] memory_manager (Optional) Memory manager.
      */
     CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Default Destructor */
+    ~CLReductionOperation();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLReductionOperation(const CLReductionOperation &) = delete;
+    /** Default move constructor */
+    CLReductionOperation(CLReductionOperation &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLReductionOperation &operator=(const CLReductionOperation &) = delete;
+    /** Default move assignment operator */
+    CLReductionOperation &operator=(CLReductionOperation &&) = default;
     /** Set the input and output tensors.
      *
@@ -88,15 +99,15 @@ class CLReductionOperation : public IFunction
 private:
     ICLTensor *configure_intermediate_result_vector(ICLTensor *input, ICLTensor *output);

-    MemoryGroup                             _memory_group;
-    std::vector<CLTensor>                   _results_vector;
-    std::vector<CLReductionOperationKernel> _reduction_kernels_vector;
-    std::vector<CLFillBorderKernel>         _border_handlers_vector;
-    CLReshapeLayer                          _reshape;
-    unsigned int                            _num_of_stages;
-    unsigned int                            _reduction_axis;
-    bool                                    _is_serial;
-    bool                                    _is_reshape_required;
+    MemoryGroup                                              _memory_group;
+    std::vector<CLTensor>                                    _results_vector;
+    std::vector<std::unique_ptr<CLReductionOperationKernel>> _reduction_kernels_vector;
+    std::vector<std::unique_ptr<CLFillBorderKernel>>         _border_handlers_vector;
+    CLReshapeLayer                                           _reshape;
+    unsigned int                                             _num_of_stages;
+    unsigned int                                             _reduction_axis;
+    bool                                                     _is_serial;
+    bool                                                     _is_reshape_required;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLREDUCTIONOPERATION_H */
\ No newline at end of file
diff --git a/arm_compute/runtime/CL/functions/CLRemap.h b/arm_compute/runtime/CL/functions/CLRemap.h
index 5b110d58f4..87d5f9fec7 100644
--- a/arm_compute/runtime/CL/functions/CLRemap.h
+++ b/arm_compute/runtime/CL/functions/CLRemap.h
@@ -31,12 +31,16 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;

 /** Basic function to execute remap. This function calls the following OpenCL kernels:
  *
  * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
  * -# @ref CLRemapKernel
+ *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
  */
 class CLRemap : public ICLSimpleFunction
 {
diff --git a/arm_compute/runtime/CL/functions/CLReorgLayer.h b/arm_compute/runtime/CL/functions/CLReorgLayer.h
index a7287ce266..0840fd13fd 100644
--- a/arm_compute/runtime/CL/functions/CLReorgLayer.h
+++ b/arm_compute/runtime/CL/functions/CLReorgLayer.h
@@ -29,7 +29,9 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
+class ITensorInfo;

 class CLReorgLayer : public ICLSimpleFunction
 {
diff --git a/arm_compute/runtime/CL/functions/CLReshapeLayer.h b/arm_compute/runtime/CL/functions/CLReshapeLayer.h
index 7fc6c3b864..b4d52ec8cf 100644
--- a/arm_compute/runtime/CL/functions/CLReshapeLayer.h
+++ b/arm_compute/runtime/CL/functions/CLReshapeLayer.h
@@ -29,7 +29,9 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
+class ITensorInfo;

 /** Basic function to run @ref CLReshapeLayerKernel */
 class CLReshapeLayer : public IFunction
diff --git a/arm_compute/runtime/CL/functions/CLReverse.h b/arm_compute/runtime/CL/functions/CLReverse.h
index 6b140920e9..81fa04b1f5 100644
--- a/arm_compute/runtime/CL/functions/CLReverse.h
+++ b/arm_compute/runtime/CL/functions/CLReverse.h
@@ -24,11 +24,14 @@
 #ifndef ARM_COMPUTE_CLREVERSE_H
 #define ARM_COMPUTE_CLREVERSE_H

+#include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"

 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
+class ITensorInfo;

 /** Basic function to run @ref CLReverseKernel */
 class CLReverse : public ICLSimpleFunction
diff --git a/arm_compute/runtime/CL/functions/CLScale.h b/arm_compute/runtime/CL/functions/CLScale.h
index d776e83035..360d63ea22 100644
--- a/arm_compute/runtime/CL/functions/CLScale.h
+++ b/arm_compute/runtime/CL/functions/CLScale.h
@@ -32,7 +32,9 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;
+class ITensorInfo;

 /** Basic function to run @ref CLScaleKernel */
 class CLScale : public ICLSimpleFunction
diff --git a/arm_compute/runtime/CL/functions/CLScharr3x3.h b/arm_compute/runtime/CL/functions/CLScharr3x3.h
index 3892874f35..4c747af19e 100644
--- a/arm_compute/runtime/CL/functions/CLScharr3x3.h
+++ b/arm_compute/runtime/CL/functions/CLScharr3x3.h
@@ -31,6 +31,7 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;

 /** Basic function to execute scharr 3x3 filter. This function calls the following OpenCL kernels:
@@ -38,6 +39,8 @@ class ICLTensor;
  * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
  * -# @ref CLScharr3x3Kernel
  *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
  */
 class CLScharr3x3 : public ICLSimpleFunction
 {
diff --git a/arm_compute/runtime/CL/functions/CLSelect.h b/arm_compute/runtime/CL/functions/CLSelect.h
index a1af922303..7fd52312fb 100644
--- a/arm_compute/runtime/CL/functions/CLSelect.h
+++ b/arm_compute/runtime/CL/functions/CLSelect.h
@@ -24,14 +24,15 @@
 #ifndef ARM_COMPUTE_CLSELECT_H
 #define ARM_COMPUTE_CLSELECT_H

-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"

 namespace arm_compute
 {
 // Forward declarations
+class CLCompileContext;
 class ICLTensor;
+class ITensorInfo;

 /** Basic function to run @ref CLSelect */
 class CLSelect : public ICLSimpleFunction
diff --git a/arm_compute/runtime/CL/functions/CLSlice.h b/arm_compute/runtime/CL/functions/CLSlice.h
index 23c398cb41..f17e77236d 100644
--- a/arm_compute/runtime/CL/functions/CLSlice.h
+++ b/arm_compute/runtime/CL/functions/CLSlice.h
@@ -31,6 +31,8 @@
 namespace arm_compute
 {
 // Forward Declarations
 class ICLTensor;
+class CLCompileContext;
+class ITensorInfo;

 namespace experimental
 {
diff --git a/arm_compute/runtime/CL/functions/CLSobel3x3.h b/arm_compute/runtime/CL/functions/CLSobel3x3.h
index 25d4ed6895..1e5745374e 100644
--- a/arm_compute/runtime/CL/functions/CLSobel3x3.h
+++ b/arm_compute/runtime/CL/functions/CLSobel3x3.h
@@ -31,6 +31,7 @@
 namespace arm_compute
 {
+class CLCompileContext;
 class ICLTensor;

 /** Basic function to execute sobel 3x3 filter. This function calls the following OpenCL kernels:
@@ -38,10 +39,20 @@ class ICLTensor;
  * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
  * -# @ref CLSobel3x3Kernel
  *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
  */
 class CLSobel3x3 : public ICLSimpleFunction
 {
 public:
+    /** Default Constructor */
+    CLSobel3x3() = default;
+    /** Prevent instances of this class from being copied */
+    CLSobel3x3(const CLSobel3x3 &) = delete;
+    /** Prevent instances of this class from being copied */
+    CLSobel3x3 &operator=(const CLSobel3x3 &) = delete;
+    /** Default destructor */
+    ~CLSobel3x3();
     /** Initialise the function's source, destinations and border mode.
      *
      * @note At least one of output_x or output_y must be not NULL.
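Although flagged for removal in 21.05, CLSobel3x3 keeps its one-shot configure-and-run usage until then. A hypothetical example with a U8 source and S16 gradient outputs, requesting both outputs as the doxygen above allows (shapes invented for illustration):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"

using namespace arm_compute;

void sobel3x3_example()
{
    CLScheduler::get().default_init();

    CLTensor src, gx, gy;
    src.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
    gx.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::S16));
    gy.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::S16));

    CLSobel3x3 sobel;
    // At least one of the two gradient outputs must be non-null.
    sobel.configure(&src, &gx, &gy, BorderMode::UNDEFINED);

    src.allocator()->allocate();
    gx.allocator()->allocate();
    gy.allocator()->allocate();
    sobel.run();
}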
diff --git a/arm_compute/runtime/CL/functions/CLSobel5x5.h b/arm_compute/runtime/CL/functions/CLSobel5x5.h
index 1f91c46f7f..e791d8a9e7 100644
--- a/arm_compute/runtime/CL/functions/CLSobel5x5.h
+++ b/arm_compute/runtime/CL/functions/CLSobel5x5.h
@@ -24,8 +24,6 @@
 #ifndef ARM_COMPUTE_CLSOBEL5X5_H
 #define ARM_COMPUTE_CLSOBEL5X5_H

-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/IFunction.h"
@@ -37,6 +35,10 @@
 namespace arm_compute
 {
+class CLCompileContext;
+class CLFillBorderKernel;
+class CLSobel5x5HorKernel;
+class CLSobel5x5VertKernel;
 class ICLTensor;

 /** Basic function to execute sobel 5x5 filter. This function calls the following OpenCL kernels:
  *
@@ -45,6 +47,8 @@ class ICLTensor;
  * -# @ref CLSobel5x5HorKernel
  * -# @ref CLSobel5x5VertKernel
  *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
  */
 class CLSobel5x5 : public IFunction
 {
@@ -54,6 +58,12 @@ class CLSobel5x5 : public IFunction
      * @param[in] memory_manager (Optional) Memory manager.
      */
     CLSobel5x5(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied */
+    CLSobel5x5(const CLSobel5x5 &) = delete;
+    /** Prevent instances of this class from being copied */
+    CLSobel5x5 &operator=(const CLSobel5x5 &) = delete;
+    /** Default destructor */
+    ~CLSobel5x5();
     /** Initialise the function's source, destinations and border mode.
      *
      * @note At least one of output_x or output_y must be not NULL.
@@ -82,12 +92,12 @@ class CLSobel5x5 : public IFunction
     void run() override;

 protected:
-    MemoryGroup          _memory_group;   /**< Function's memory group */
-    CLSobel5x5HorKernel  _sobel_hor;      /**< Sobel Horizontal 5x5 kernel */
-    CLSobel5x5VertKernel _sobel_vert;     /**< Sobel Vertical 5x5 kernel */
-    CLFillBorderKernel   _border_handler; /**< Kernel to handle image borders */
-    CLImage              _tmp_x;          /**< Temporary buffer for Sobel X */
-    CLImage              _tmp_y;          /**< Temporary buffer for Sobel Y */
+    MemoryGroup                           _memory_group;   /**< Function's memory group */
+    std::unique_ptr<CLSobel5x5HorKernel>  _sobel_hor;      /**< Sobel Horizontal 5x5 kernel */
+    std::unique_ptr<CLSobel5x5VertKernel> _sobel_vert;     /**< Sobel Vertical 5x5 kernel */
+    std::unique_ptr<CLFillBorderKernel>   _border_handler; /**< Kernel to handle image borders */
+    CLImage                               _tmp_x;          /**< Temporary buffer for Sobel X */
+    CLImage                               _tmp_y;          /**< Temporary buffer for Sobel Y */
 };
 }
 #endif /*ARM_COMPUTE_CLSOBEL5X5_H */
diff --git a/arm_compute/runtime/CL/functions/CLSobel7x7.h b/arm_compute/runtime/CL/functions/CLSobel7x7.h
index 91daf64c29..65e8de55b4 100644
--- a/arm_compute/runtime/CL/functions/CLSobel7x7.h
+++ b/arm_compute/runtime/CL/functions/CLSobel7x7.h
@@ -24,8 +24,6 @@
 #ifndef ARM_COMPUTE_CLSOBEL7X7_H
 #define ARM_COMPUTE_CLSOBEL7X7_H

-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/IFunction.h"
@@ -37,6 +35,10 @@
 namespace arm_compute
 {
+class CLCompileContext;
+class CLFillBorderKernel;
+class CLSobel7x7HorKernel;
+class CLSobel7x7VertKernel;
 class ICLTensor;
 /** Basic function to execute sobel 7x7 filter. This function calls the following OpenCL kernels:
  *
@@ -45,6 +47,8 @@ class ICLTensor;
  * -# @ref CLSobel7x7HorKernel
  * -# @ref CLSobel7x7VertKernel
  *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
  */
 class CLSobel7x7 : public IFunction
 {
@@ -54,6 +58,12 @@ class CLSobel7x7 : public IFunction
      * @param[in] memory_manager (Optional) Memory manager.
      */
     CLSobel7x7(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied */
+    CLSobel7x7(const CLSobel7x7 &) = delete;
+    /** Prevent instances of this class from being copied */
+    CLSobel7x7 &operator=(const CLSobel7x7 &) = delete;
+    /** Default destructor */
+    ~CLSobel7x7();
     /** Initialise the function's source, destinations and border mode.
      *
      * @note At least one of output_x or output_y must be not NULL.
@@ -82,12 +92,12 @@ class CLSobel7x7 : public IFunction
     void run() override;

 protected:
-    MemoryGroup          _memory_group;   /**< Function's memory group */
-    CLSobel7x7HorKernel  _sobel_hor;      /**< Sobel Horizontal 7x7 kernel */
-    CLSobel7x7VertKernel _sobel_vert;     /**< Sobel Vertical 7x7 kernel */
-    CLFillBorderKernel   _border_handler; /**< Kernel to handle image borders */
-    CLImage              _tmp_x;          /**< Temporary buffer for Sobel X */
-    CLImage              _tmp_y;          /**< Temporary buffer for Sobel Y */
+    MemoryGroup                           _memory_group;   /**< Function's memory group */
+    std::unique_ptr<CLSobel7x7HorKernel>  _sobel_hor;      /**< Sobel Horizontal 7x7 kernel */
+    std::unique_ptr<CLSobel7x7VertKernel> _sobel_vert;     /**< Sobel Vertical 7x7 kernel */
+    std::unique_ptr<CLFillBorderKernel>   _border_handler; /**< Kernel to handle image borders */
+    CLImage                               _tmp_x;          /**< Temporary buffer for Sobel X */
+    CLImage                               _tmp_y;          /**< Temporary buffer for Sobel Y */
 };
 }
 #endif /*ARM_COMPUTE_CLSOBEL7X7_H */
diff --git a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
index bb01584ff4..ab10a64de4 100644
--- a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
@@ -24,10 +24,8 @@
 #ifndef ARM_COMPUTE_CLSOFTMAXLAYER_H
 #define ARM_COMPUTE_CLSOFTMAXLAYER_H

-#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
-#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
+#include "arm_compute/runtime/CL/functions/CLPermute.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
@@ -36,7 +34,11 @@
 namespace arm_compute
 {
+class CLCompileContext;
+class CLLogits1DMaxShiftExpSumKernel;
+class CLLogits1DNormKernel;
 class ICLTensor;
+class ITensorInfo;

 /** Basic function to compute a SoftmaxLayer.
  *
@@ -47,7 +49,10 @@ class ICLTensor;
  * @f[ out = (x - max(x) * beta) - log(\sum{e^{x - max(x) * beta}}) @f]
  *
  * This function runs the following kernels:
+ * -# If axis is not 0:
+ * -# @ref CLPermute
  * -# @ref CLLogits1DNormKernel
+ * -# @ref CLLogits1DMaxShiftExpSumKernel
  */
template <bool IS_LOG = false>
 class CLSoftmaxLayerGeneric : public IFunction
 {
@@ -55,75 +60,62 @@ class CLSoftmaxLayerGeneric : public IFunction
 public:
     /** Constructor */
     CLSoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied */
+    CLSoftmaxLayerGeneric(const CLSoftmaxLayerGeneric &) = delete;
+    /** Prevent instances of this class from being copied */
+    CLSoftmaxLayerGeneric &operator=(const CLSoftmaxLayerGeneric &) = delete;
+    /** Prevent instances of this class from being moved */
+    CLSoftmaxLayerGeneric(CLSoftmaxLayerGeneric &&) = delete;
+    /** Prevent instances of this class from being moved */
+    CLSoftmaxLayerGeneric &operator=(CLSoftmaxLayerGeneric &&) = delete;
+    /** Default destructor */
+    ~CLSoftmaxLayerGeneric();
     /** Set the input and output tensors.
      *
      * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax
      * @param[out] output Destination tensor. Data types supported: same as @p input
      * @param[in]  beta   (Optional) A scaling factor for the exponent. Defaults to 1.f
-     * @param[in]  axis   (Optional) The last axis of the first n dimensions (inclusive)to reduce. Only supports axis 0.
+     * @param[in]  axis   (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
+     *                    axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
      */
-    void configure(const ICLTensor *input, ICLTensor *output, float beta = 1.0f, size_t axis = 0);
+    void configure(const ICLTensor *input, ICLTensor *output, float beta = 1.0f, int32_t axis = 0);
     /** Set the input and output tensors.
      *
      * @param[in]  compile_context The compile context to be used.
      * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax
      * @param[out] output          Destination tensor. Data types supported: same as @p input
      * @param[in]  beta            (Optional) A scaling factor for the exponent. Defaults to 1.f
-     * @param[in]  axis            (Optional) The last axis of the first n dimensions (inclusive)to reduce. Only supports axis 0.
+     * @param[in]  axis            (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
+     *                             axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta = 1.0f, size_t axis = 0);
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta = 1.0f, int32_t axis = 0);
     /** Static function to check if given info will lead to a valid configuration of @ref CLSoftmaxLayer
      *
      * @param[in] input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax
      * @param[in] output Destination tensor. Data types supported: same as @p input
     * @param[in] beta   (Optional) A scaling factor for the exponent. Defaults to 1.f
-     * @param[in] axis   (Optional) The last axis of the first n dimensions (inclusive)to reduce. Only supports axis 0.
+     * @param[in] axis   (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
+     *                   axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, float beta = 1.0f, size_t axis = 0);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, float beta = 1.0f, int32_t axis = 0);

     // Inherited methods overridden:
     void run() override;

 private:
-    /** Utility method to configure the kernels needed to flatten the input
-     * tensor.
-     *
-     * @note This function changes the internal state of this class. In particular,
-     * it initializes the kernel @p _flatten_kernel and the tensors @p _input_flat and
-     * @p _output_flat
-     *
-     * @param[in] input  Original source tensor.
-     * @param[in] output Original destination tensor.
-     * @param[in] axis   (Optional) The last axis of the first n dimensions (inclusive)to reduce. Only supports axis 0.
-     */
-    void configure_reshape_input_kernel(const ICLTensor *input, const ICLTensor *output, size_t axis);
-    /** Utility method to configure the kernels needed to flatten the input
-     * tensor.
-     *
-     * @note This function changes the internal state of this class. In particular,
-     * it initializes the kernel @p _flatten_kernel and the tensors @p _input_flat and
-     * @p _output_flat
-     *
-     * @param[in] compile_context The compile context to be used.
-     * @param[in] input           Original source tensor.
-     * @param[in] output          Original destination tensor.
-     * @param[in] axis            (Optional) The last axis of the first n dimensions (inclusive)to reduce. Only supports axis 0.
-     */
-    void configure_reshape_input_kernel(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *output, size_t axis);
-
-    MemoryGroup                    _memory_group;
-    CLLogits1DMaxShiftExpSumKernel _max_shift_exp_sum_kernel;
-    CLLogits1DNormKernel           _norm_kernel;
-    std::unique_ptr<IFunction>     _flatten_ptr;
-    CLReshapeLayer                 _reshape;
-    CLTensor                       _max;
-    CLTensor                       _sum;
-    CLTensor                       _tmp;
-    CLTensor                       _input_flattened;
-    CLTensor                       _output_flattened;
-    bool                           _needs_flattening;
+    MemoryGroup                                     _memory_group;
+    CLPermute                                       _permute_input;
+    CLPermute                                       _permute_output;
+    std::unique_ptr<CLLogits1DMaxShiftExpSumKernel> _max_shift_exp_sum_kernel;
+    std::unique_ptr<CLLogits1DNormKernel>           _norm_kernel;
+    CLTensor                                        _max;
+    CLTensor                                        _sum;
+    CLTensor                                        _tmp;
+    CLTensor                                        _input_permuted;
+    CLTensor                                        _output_permuted;
+    bool                                            _needs_permute;
 };

using CLSoftmaxLayer = CLSoftmaxLayerGeneric<false>;
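The axis parameter changes type (size_t to int32_t) and meaning: instead of collapsing the first n dimensions, it now selects the dimension along which softmax is applied, with permutes inserted internally when axis != 0. A hypothetical call matching the 4x5x6 example in the doxygen above:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"

using namespace arm_compute;

// For a 4x5x6 input and axis=1, softmax runs over 4x6 = 24 vectors of size 5.
void softmax_axis_example()
{
    CLScheduler::get().default_init();

    CLTensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(4U, 5U, 6U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(4U, 5U, 6U), 1, DataType::F32));

    CLSoftmaxLayer softmax;
    softmax.configure(&input, &output, /*beta=*/1.0f, /*axis=*/1);

    input.allocator()->allocate();
    output.allocator()->allocate();
    softmax.run();
}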
diff --git a/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h b/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h
index c6f7f11079..5c5e5bed9a 100644
--- a/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h
@@ -24,16 +24,19 @@
 #ifndef ARM_COMPUTE_CLSPACETOBATCHLAYER_H
 #define ARM_COMPUTE_CLSPACETOBATCHLAYER_H

-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
-#include "arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>

 namespace arm_compute
 {
+class CLCompileContext;
+class CLMemsetKernel;
+class CLSpaceToBatchLayerKernel;
 class ICLTensor;
+class ITensorInfo;

 /** Basic function to spatial divide a tensor. This function calls the following OpenCL kernels/functions:
  *
  *
@@ -54,12 +57,12 @@ class CLSpaceToBatchLayer : public IFunction
     /** Allow instances of this class to be moved */
     CLSpaceToBatchLayer &operator=(CLSpaceToBatchLayer &&) = default;
     /** Default destructor */
-    virtual ~CLSpaceToBatchLayer() = default;
+    ~CLSpaceToBatchLayer();
     /** Set the input and output tensors.
      *
      * @param[in]  input       Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[in]  block_shape 1-D tensor with shape [M]. Data types supported: S32
-     * @param[in]  paddings    2-D tensor with shape [2, M]. Data types supported: S32
+     * @param[in]  block_shape 1-D tensor with shape [M]. Supported M: 2. Data types supported: S32
+     * @param[in]  paddings    2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32
      * @param[out] output      Tensor output. Data types supported: same as @p input
      */
     void configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output);
@@ -67,8 +70,8 @@ class CLSpaceToBatchLayer : public IFunction
      *
      * @param[in]  compile_context The compile context to be used.
      * @param[in]  input           Tensor input. Supported tensor rank: 4. Data types supported: All.
-     * @param[in]  block_shape     1-D tensor with shape [M]. Data types supported: S32
-     * @param[in]  paddings        2-D tensor with shape [2, M]. Data types supported: S32
+     * @param[in]  block_shape     1-D tensor with shape [M]. Supported M: 2. Data types supported: S32
+     * @param[in]  paddings        2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32
      * @param[out] output          Tensor output. Data types supported: same as @p input
      */
     void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output);
@@ -77,8 +80,8 @@ class CLSpaceToBatchLayer : public IFunction
      * @param[in]  input         Tensor input. Supported tensor rank: 4. Data types supported: All.
      * @param[in]  block_shape_x Block shape x value.
      * @param[in]  block_shape_y Block shape y value.
-     * @param[in]  padding_left  The left padding of the output tensor.
-     * @param[in]  padding_right The right padding of the output tensor.
+     * @param[in]  padding_left  The padding at the beginning of every dimension of the output tensor.
+     * @param[in]  padding_right The padding at the end of every dimension of the output tensor.
      * @param[out] output        Tensor output. Data types supported: same as @p input
      */
     void configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output);
@@ -88,8 +91,8 @@ class CLSpaceToBatchLayer : public IFunction
      * @param[in]  input           Tensor input. Supported tensor rank: 4. Data types supported: All.
      * @param[in]  block_shape_x   Block shape x value.
      * @param[in]  block_shape_y   Block shape y value.
-     * @param[in]  padding_left    The left padding of the output tensor.
-     * @param[in]  padding_right   The right padding of the output tensor.
+     * @param[in]  padding_left    The padding at the beginning of every dimension of the output tensor.
+     * @param[in]  padding_right   The padding at the end of every dimension of the output tensor.
      * @param[out] output          Tensor output. Data types supported: same as @p input
      */
     void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
@@ -97,8 +100,8 @@ class CLSpaceToBatchLayer : public IFunction
     /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayer
      *
      * @param[in]  input       Tensor input info. Supported tensor rank: 4. Data types supported: All.
-     * @param[in]  block_shape block shape tensor info with shape [M]. Data types supported: S32
-     * @param[in]  paddings    paddings tensor info with shape [2, M]. Data types supported: S32
+     * @param[in]  block_shape block shape tensor info with shape [M]. Supported M: 2. Data types supported: S32
+     * @param[in]  paddings    paddings tensor info with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32
      * @param[out] output      Tensor output info. Data types supported: same as @p input
      *
      * @return a status
@@ -109,8 +112,8 @@ class CLSpaceToBatchLayer : public IFunction
      * @param[in]  input         Tensor input info. Supported tensor rank: 4. Data types supported: All.
      * @param[in]  block_shape_x Block shape x value.
      * @param[in]  block_shape_y Block shape y value.
-     * @param[in]  padding_left  The left padding of the output tensor.
-     * @param[in]  padding_right The right padding of the output tensor.
+     * @param[in]  padding_left  The padding at the beginning of every dimension of the output tensor.
+     * @param[in]  padding_right The padding at the end of every dimension of the output tensor.
      * @param[out] output        Tensor output info. Data types supported: same as @p input
      *
      * @return a status
@@ -121,9 +124,9 @@ class CLSpaceToBatchLayer : public IFunction
     void run() override;

 private:
-    CLSpaceToBatchLayerKernel _space_to_batch_kernel; /**< SpaceToBatch kernel to run */
-    CLMemsetKernel            _memset_kernel;         /**< Memset kernel to run */
-    bool                      _has_padding;           /**< Flag to check if the output has padding */
+    std::unique_ptr<CLSpaceToBatchLayerKernel> _space_to_batch_kernel; /**< SpaceToBatch kernel to run */
+    std::unique_ptr<CLMemsetKernel>            _memset_kernel;         /**< Memset kernel to run */
+    bool                                       _has_padding;           /**< Flag to check if the output has padding */
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLSPACETOBATCHLAYER_H */
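For the block-shape overloads above, padding_left/padding_right now describe per-dimension begin/end padding rather than a left/right pair. A hypothetical configuration with a 2x2 block and no padding (shape invented for illustration):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h"

using namespace arm_compute;

// Rearranges 2x2 spatial blocks of an 8x8x3x1 tensor into the batch dimension.
void space_to_batch_example()
{
    CLScheduler::get().default_init();

    CLTensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U, 1U), 1, DataType::F32));

    CLSpaceToBatchLayer s2b;
    s2b.configure(&input, /*block_shape_x=*/2, /*block_shape_y=*/2,
                  /*padding_left=*/Size2D(0, 0), /*padding_right=*/Size2D(0, 0), &output);

    input.allocator()->allocate();
    output.allocator()->allocate();
    s2b.run();
}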
@@ -68,7 +81,7 @@ class CLSpaceToDepthLayer : public IFunction void run() override; private: - CLSpaceToDepthLayerKernel _space_to_depth_kernel; /**< CLSpaceToDepthLayerKernel to run */ + std::unique_ptr<CLSpaceToDepthLayerKernel> _space_to_depth_kernel; /**< CLSpaceToDepthLayerKernel to run */ }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLSPACETODEPTHLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLStackLayer.h b/arm_compute/runtime/CL/functions/CLStackLayer.h index 95875962c8..3861fd299a 100644 --- a/arm_compute/runtime/CL/functions/CLStackLayer.h +++ b/arm_compute/runtime/CL/functions/CLStackLayer.h @@ -27,14 +27,15 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/CL/kernels/CLStackLayerKernel.h" - #include <memory> #include <vector> namespace arm_compute { +class CLCompileContext; +class CLStackLayerKernel; class ICLTensor; +class ITensorInfo; /** Basic function to stack tensors along an axis. This function calls the following kernel: * @@ -46,6 +47,16 @@ class CLStackLayer : public IFunction public: /** Default constructor */ CLStackLayer(); + /** Prevent instances of this class from being copied */ + CLStackLayer(const CLStackLayer &) = delete; + /** Prevent instances of this class from being copied */ + CLStackLayer &operator=(const CLStackLayer &) = delete; + /** Prevent instances of this class from being moved */ + CLStackLayer(CLStackLayer &&) = delete; + /** Prevent instances of this class from being moved */ + CLStackLayer &operator=(CLStackLayer &&) = delete; + /** Default destructor */ + ~CLStackLayer(); /** Initialise the kernel's inputs vector and output. * * @note Supported input tensor rank: up to 4 @@ -84,9 +95,9 @@ class CLStackLayer : public IFunction void run() override; private: - std::vector<ICLTensor *> _input; - std::vector<CLStackLayerKernel> _stack_kernels; - unsigned int _num_inputs; + std::vector<ICLTensor *> _input; + std::vector<std::unique_ptr<CLStackLayerKernel>> _stack_kernels; + unsigned int _num_inputs; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLSTACKLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLTableLookup.h b/arm_compute/runtime/CL/functions/CLTableLookup.h index 32d4b7bdf9..ca59309548 100644 --- a/arm_compute/runtime/CL/functions/CLTableLookup.h +++ b/arm_compute/runtime/CL/functions/CLTableLookup.h @@ -28,6 +28,7 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; class ICLLut; diff --git a/arm_compute/runtime/CL/functions/CLThreshold.h b/arm_compute/runtime/CL/functions/CLThreshold.h index f3af122f0a..c536232e71 100644 --- a/arm_compute/runtime/CL/functions/CLThreshold.h +++ b/arm_compute/runtime/CL/functions/CLThreshold.h @@ -33,9 +33,14 @@ namespace arm_compute { // Forward declarations +class CLCompileContext; class ICLTensor; -/** Basic function to run @ref CLThresholdKernel */ +/** Basic function to run @ref CLThresholdKernel + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * +*/ class CLThreshold : public ICLSimpleFunction { public: diff --git a/arm_compute/runtime/CL/functions/CLTile.h b/arm_compute/runtime/CL/functions/CLTile.h index d2f1e9730c..69743693ff 100644 --- a/arm_compute/runtime/CL/functions/CLTile.h +++ b/arm_compute/runtime/CL/functions/CLTile.h @@ -24,13 +24,14 @@ #ifndef ARM_COMPUTE_CLTILE_H #define ARM_COMPUTE_CLTILE_H -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLTileKernel */
class CLTile : public ICLSimpleFunction diff --git a/arm_compute/runtime/CL/functions/CLTranspose.h b/arm_compute/runtime/CL/functions/CLTranspose.h index 9ba7cafce4..2b7a03f23f 100644 --- a/arm_compute/runtime/CL/functions/CLTranspose.h +++ b/arm_compute/runtime/CL/functions/CLTranspose.h @@ -24,11 +24,14 @@ #ifndef ARM_COMPUTE_CLTRANSPOSE_H #define ARM_COMPUTE_CLTRANSPOSE_H +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to transpose a matrix on OpenCL. This function calls the following OpenCL kernel: * diff --git a/arm_compute/runtime/CL/functions/CLUpsampleLayer.h b/arm_compute/runtime/CL/functions/CLUpsampleLayer.h index 07b4c8aecb..88b293069d 100644 --- a/arm_compute/runtime/CL/functions/CLUpsampleLayer.h +++ b/arm_compute/runtime/CL/functions/CLUpsampleLayer.h @@ -26,13 +26,17 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" +#include <memory> + namespace arm_compute { +class CLCompileContext; +class CLUpsampleLayerKernel; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLUpsampleLayerKernel */ class CLUpsampleLayer : public IFunction @@ -49,7 +53,7 @@ class CLUpsampleLayer : public IFunction /** Allow instances of this class to be moved */ CLUpsampleLayer &operator=(CLUpsampleLayer &&) = default; /** Default destructor */ - virtual ~CLUpsampleLayer() = default; + ~CLUpsampleLayer(); /** Initialize the function's source, destination, interpolation type and border_mode. * @@ -86,8 +90,8 @@ class CLUpsampleLayer : public IFunction void run() override; private: - CLUpsampleLayerKernel _upsample; - ICLTensor *_output; + std::unique_ptr<CLUpsampleLayerKernel> _upsample; + ICLTensor *_output; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLUPSAMPLELAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLWarpAffine.h b/arm_compute/runtime/CL/functions/CLWarpAffine.h index eb7c05be84..2f73097fcf 100644 --- a/arm_compute/runtime/CL/functions/CLWarpAffine.h +++ b/arm_compute/runtime/CL/functions/CLWarpAffine.h @@ -31,9 +31,14 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; -/** Basic function to run @ref CLWarpAffineKernel for AFFINE transformation */ +/** Basic function to run @ref CLWarpAffineKernel for AFFINE transformation + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * +*/ class CLWarpAffine : public ICLSimpleFunction { public: diff --git a/arm_compute/runtime/CL/functions/CLWarpPerspective.h b/arm_compute/runtime/CL/functions/CLWarpPerspective.h index 2a1f78093e..4e2c81e71c 100644 --- a/arm_compute/runtime/CL/functions/CLWarpPerspective.h +++ b/arm_compute/runtime/CL/functions/CLWarpPerspective.h @@ -31,9 +31,14 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; -/** Basic function to run @ref CLWarpPerspectiveKernel for PERSPECTIVE transformation */ +/** Basic function to run @ref CLWarpPerspectiveKernel for PERSPECTIVE transformation + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * +*/ class CLWarpPerspective : public ICLSimpleFunction { public: diff --git a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h index 602f644230..9ced69c1bb 100644 ---
a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h @@ -24,8 +24,6 @@ #ifndef ARM_COMPUTE_CLWINOGRADCONVOLUTIONLAYER_H #define ARM_COMPUTE_CLWINOGRADCONVOLUTIONLAYER_H -#include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h" -#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/functions/CLGEMM.h" #include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h" @@ -33,7 +31,11 @@ namespace arm_compute { +class CLCompileContext; +class CLWinogradFilterTransformKernel; +class CLWinogradOutputTransformKernel; class ICLTensor; +class ITensorInfo; /** Basic function to execute Winograd-based convolution on OpenCL. This function calls the following OpenCL functions/kernels: * @@ -56,6 +58,8 @@ class CLWinogradConvolutionLayer : public IFunction CLWinogradConvolutionLayer &operator=(const CLWinogradConvolutionLayer &) = delete; /** Default move assignment operator */ CLWinogradConvolutionLayer &operator=(CLWinogradConvolutionLayer &&) = default; + /** Default destructor */ + ~CLWinogradConvolutionLayer(); /** Set the input and output tensors. * * @note: This function only works with 3x3,3x1,1x3,5x5,5x1,1x5,7x1 and 1x7 kernels along with unit strides for both NCHW and NHWC data layout @@ -122,16 +126,16 @@ class CLWinogradConvolutionLayer : public IFunction void prepare() override; private: - MemoryGroup _memory_group; - CLGEMM _batched_mm; - CLWinogradInputTransform _input_transform; - CLWinogradFilterTransformKernel _filter_transform; - CLWinogradOutputTransformKernel _output_transform; - CLTensor _input0; - CLTensor _input1; - CLTensor _batched_mm_output; - const ICLTensor *_original_weights; - bool _is_prepared; + MemoryGroup _memory_group; + CLGEMM _batched_mm; + CLWinogradInputTransform _input_transform; + std::unique_ptr<CLWinogradFilterTransformKernel> _filter_transform; + std::unique_ptr<CLWinogradOutputTransformKernel> _output_transform; + CLTensor _input0; + CLTensor _input1; + CLTensor _batched_mm_output; + const ICLTensor *_original_weights; + bool _is_prepared; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLWINOGRADCONVOLUTIONLAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h b/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h index 351f88012f..8cd809cc1f 100644 --- a/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h +++ b/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h @@ -31,7 +31,9 @@ namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to execute a @ref CLWinogradInputTransformKernel.
*/ class CLWinogradInputTransform : public ICLSimpleFunction diff --git a/arm_compute/runtime/CL/functions/CLYOLOLayer.h b/arm_compute/runtime/CL/functions/CLYOLOLayer.h index 3e403f44bd..48ee4ea4f7 100644 --- a/arm_compute/runtime/CL/functions/CLYOLOLayer.h +++ b/arm_compute/runtime/CL/functions/CLYOLOLayer.h @@ -24,13 +24,14 @@ #ifndef ARM_COMPUTE_CLYOLOLAYER_H #define ARM_COMPUTE_CLYOLOLAYER_H -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to run @ref CLYOLOLayerKernel that performs a partial activation on the input * diff --git a/arm_compute/runtime/CL/tuners/CLLWSList.h b/arm_compute/runtime/CL/tuners/CLLWSList.h index 7ce10ac220..48f3f3f7c9 100644 --- a/arm_compute/runtime/CL/tuners/CLLWSList.h +++ b/arm_compute/runtime/CL/tuners/CLLWSList.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,7 +29,8 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/runtime/CL/CLTunerTypes.h" #include "support/ToolchainSupport.h" -#include <memory> + +#include "support/MemorySupport.h" namespace arm_compute { diff --git a/arm_compute/runtime/CL/tuners/Tuners.h b/arm_compute/runtime/CL/tuners/Tuners.h index 274f13d4c3..dd1c62a252 100644 --- a/arm_compute/runtime/CL/tuners/Tuners.h +++ b/arm_compute/runtime/CL/tuners/Tuners.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,6 +27,8 @@ #include "arm_compute/runtime/CL/tuners/BifrostTuner.h" #include "arm_compute/runtime/CL/tuners/MidgardTuner.h" +#include "support/MemorySupport.h" + #include <memory> namespace arm_compute diff --git a/arm_compute/runtime/CPP/CPPScheduler.h b/arm_compute/runtime/CPP/CPPScheduler.h index e8ad427eba..764af818d9 100644 --- a/arm_compute/runtime/CPP/CPPScheduler.h +++ b/arm_compute/runtime/CPP/CPPScheduler.h @@ -62,7 +62,6 @@ class CPPScheduler final : public IScheduler void run_workloads(std::vector<Workload> &workloads) override; private: - void schedule_common(ICPPKernel *kernel, const Hints &hints, ITensorPack &tensors); struct Impl; std::unique_ptr<Impl> _impl; }; diff --git a/arm_compute/runtime/CPP/functions/CPPSplit.h b/arm_compute/runtime/CPP/functions/CPPSplit.h index 7929f14046..b2b4d07c86 100644 --- a/arm_compute/runtime/CPP/functions/CPPSplit.h +++ b/arm_compute/runtime/CPP/functions/CPPSplit.h @@ -106,7 +106,10 @@ class CPPSplit : public IFunction // Output auto initialization if not yet initialized TensorInfo tmp_output_info = *output->clone(); - auto_init_if_empty(tmp_output_info, input->clone()->set_is_resizable(true).set_tensor_shape(output_shape)); + if(tmp_output_info.tensor_shape().total_size() == 0) + { + tmp_output_info = input->clone()->set_is_resizable(true).set_tensor_shape(output_shape); + } // Update coordinate on axis start_coords.set(axis, axis_offset); diff --git a/arm_compute/runtime/FunctionDescriptors.h b/arm_compute/runtime/FunctionDescriptors.h index 16d6c345e2..1f4216eb21 100644 --- a/arm_compute/runtime/FunctionDescriptors.h +++ b/arm_compute/runtime/FunctionDescriptors.h @@ -23,6 +23,9 @@ */ #ifndef ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H #define ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H + +#include "arm_compute/core/Types.h" + #include <utility> namespace arm_compute @@ -48,5 +51,26 @@ struct FFT2DInfo unsigned int axis1{ 1 }; /**<
Axis to run second pass on. If same, multiple transforms are performed on single axis*/ FFTDirection direction{ FFTDirection::Forward }; /**< Direction of the FFT. */ }; + +/** Descriptor used by the Convolution function */ +struct Conv2dInfo +{ + Conv2dInfo() = default; + + Conv2dInfo(const PadStrideInfo &conv_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) + : conv_info(conv_info), dilation(dilation), act_info(act_info), enable_fast_math(enable_fast_math), num_groups(num_groups) + { + } + + PadStrideInfo conv_info{}; + Size2D dilation{ 1U, 1U }; + ActivationLayerInfo act_info{}; + bool enable_fast_math{ false }; + unsigned int num_groups{ 1 }; +}; } // namespace arm_compute #endif /* ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H */ diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h index 481fb19201..bbba8acd7e 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -35,6 +35,9 @@ class ITensor; * * @note The tensor data types for the inputs must be U8. * @note The function calculates the absolute difference also when the 2 inputs have different tensor data types. + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class GCAbsoluteDifference : public IGCSimpleFunction { diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h index 79c7c0cc12..b09afbb81d 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -35,6 +35,9 @@ class IGCTensor; /** Basic function to run @ref GCActivationLayerKernel * * @note The function simulates an activation layer with the specified activation function. + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class GCActivationLayer : public IGCSimpleFunction { diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.h index 15d957e3ce..9a7527dcd5 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.h @@ -35,6 +35,9 @@ class IGCTensor; * * @note The tensor data type for the inputs must be F16. * @note The function performs an arithmetic addition between two tensors. + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class GCArithmeticAddition : public IGCSimpleFunction { diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h index d6bc6eec3d..75b4cdc628 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -38,6 +38,8 @@ class IGCTensor; * Batch normalization is calculated by: * @f[ out_i = \gamma * (\frac{in_i - \mu_{B}}{\sqrt{\sigma^2_{B} + \epsilon}}) + \beta \equiv BN_{\gamma,\beta}(in_i) @f] * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class GCBatchNormalizationLayer : public IFunction { diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.h index 9661b368a0..fe24c07f63 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -40,6 +40,9 @@ class IGCTensor; * * @note only axis z is supported * -# @ref GCDepthConcatenateLayerKernel + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class GCConcatenateLayer : public IFunction { diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h index f80ffa2948..3cff9688eb 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -44,6 +44,9 @@ class IGCTensor; /** Function to reshape and transpose the weights. This function calls the following kernels: * -# @ref GCWeightsReshapeKernel + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class GCConvolutionLayerReshapeWeights : public IFunction { @@ -72,6 +75,9 @@ class GCConvolutionLayerReshapeWeights : public IFunction * -# @ref GCIm2ColKernel * -# @ref GCGEMMInterleave4x4Kernel * -# @ref GCCol2ImKernel + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class GCConvolutionLayer : public IFunction { diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.h index 3cf4d548d1..82fe368d7d 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -40,6 +40,8 @@ class IGCTensor; * -# @ref GCDepthwiseConvolutionLayer3x3Kernel * -# @ref GCFillBorderKernel (if pad_x or pad_y > 0) * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class GCDepthwiseConvolutionLayer3x3 : public IFunction { diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h index c206ec4822..f834802db6 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -46,6 +46,9 @@ class IGCTensor; * * @note Supported kernel size: 1x1, 3x3, and 5x5 * @note This OpenGL ES implementation works with stride_x = 1 and 2 + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class GCDirectConvolutionLayer : public IFunction { diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h index 79af623486..4d551f2692 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -34,6 +34,9 @@ class IGCTensor; /** Basic function to do dropout op. This function calls the following kernels: * * -# @ref GCDropoutLayerKernel + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class GCDropoutLayer : public IFunction { diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h index 766e8114a0..1635db51e7 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -33,7 +33,11 @@ namespace arm_compute { class ITensor; -/** Basic function to run @ref GCFillBorderKernel */ +/** Basic function to run @ref GCFillBorderKernel + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * + */ class GCFillBorder : public IGCSimpleFunction { public: diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h index a13c74a683..f839a7db39 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -40,6 +40,9 @@ namespace arm_compute * -# @ref GCTransposeKernel * * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class GCFullyConnectedLayerReshapeWeights : public IGCSimpleFunction { @@ -60,6 +63,9 @@ class GCFullyConnectedLayerReshapeWeights : public IGCSimpleFunction * -# @ref GCGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr) * * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class GCFullyConnectedLayer : public IFunction { diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h index 9c1748bc63..653da4b981 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -44,6 +44,8 @@ class IGCTensor; * -# @ref GCGEMMMatrixMultiplyKernel * -# @ref GCGEMMMatrixAdditionKernel (if c != nullptr and beta != 0.0) * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class GCGEMM : public IFunction { diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h index 67fc86d067..964c368316 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -34,6 +34,8 @@ class ITensor; * * -# @ref GCGEMMInterleave4x4Kernel * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class GCGEMMInterleave4x4 : public IGCSimpleFunction { diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h index b3d6a28e0e..107590cf76 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -32,6 +32,8 @@ namespace arm_compute * * -# @ref GCGEMMTranspose1xWKernel * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class GCGEMMTranspose1xW : public IGCSimpleFunction { diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h index 2936402562..f8ee39c362 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -44,6 +44,9 @@ class IGCTensor; * -# @ref GCPixelWiseMultiplicationKernel * -# @ref GCFillBorderKernel * -# @ref GCNormalizationLayerKernel + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class GCNormalizationLayer : public IFunction { diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.h index fd69ef7725..e097cb9291 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -36,6 +36,9 @@ class IGCTensor; /** Basic function to run @ref GCNormalizePlanarYUVLayerKernel * * @note The function simulates a NormalizePlanarYUV layer. 
+ * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class GCNormalizePlanarYUVLayer : public IFunction { diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h index bbcc6630e6..e09ce374ff 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h @@ -31,7 +31,11 @@ namespace arm_compute { class IGCTensor; -/** Basic function to run @ref GCPixelWiseMultiplicationKernel. */ +/** Basic function to run @ref GCPixelWiseMultiplicationKernel. + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * + */ class GCPixelWiseMultiplication : public IGCSimpleFunction { public: diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h index d6a79b5bc1..dcb7e81b9b 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h @@ -41,6 +41,9 @@ class IGCTensor; * * -# @ref GCFillBorderKernel (executed if padding size is different from zero) * -# @ref GCPoolingLayerKernel + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class GCPoolingLayer : public IFunction { diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCScale.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCScale.h index 5a610f255f..17cfa565a9 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCScale.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCScale.h @@ -33,7 +33,11 @@ namespace arm_compute { class IGCTensor; -/** Basic function to run @ref GCScaleKernel */ +/** Basic function to run @ref GCScaleKernel + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * + */ class GCScale : public IGCSimpleFunction { public: diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h index 4ccfe2684e..083b07c57c 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h @@ -42,6 +42,9 @@ class IGCTensor; * -# @ref GCLogits1DMaxKernel * -# @ref GCLogits1DShiftExpSumKernel * -# @ref GCLogits1DNormKernel + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class GCSoftmaxLayer : public IFunction { @@ -50,17 +53,15 @@ class GCSoftmaxLayer : public IFunction GCSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: F16/F32 - * @param[out] output Destination tensor. Data types supported: same as @p input - * @param[in] beta (Optional) A scaling factor for the exponent. Only beta = 1 is supported - * @param[in] reduce_end_axis (Optional) The last axis of the first n dimensions (inclusive)to reduce. Defaults to 0. - * It has the purpose of squashing together the first n dimensions till (including) the @p reduce_end_axis. For instance, given a [2x3x4x5] image, - * when @p reduce_end_axis is 1, the reduction will be applied to axes 0 and 1, and the Softmax op will be applied on each of the [2x3] planes of the input image. - * Must be in range [0, input_num_dimensions). + * @param[in] input Source tensor.
Data types supported: F16/F32 + * @param[out] output Destination tensor. Data types supported: same as @p input + * @param[in] beta (Optional) A scaling factor for the exponent. Only beta = 1 is supported + * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and + * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0 * - * @note The value of @p reduce_end_axis must be always 0 for GLES + * @note The value of @p axis must be always 0 for GLES */ - void configure(const IGCTensor *input, IGCTensor *output, float beta = 1.0f, size_t reduce_end_axis = 0); + void configure(const IGCTensor *input, IGCTensor *output, float beta = 1.0f, int32_t axis = 0); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCTensorShift.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCTensorShift.h index 546f6d6e16..d25a322f4f 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCTensorShift.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCTensorShift.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -37,6 +37,9 @@ class IGCTensor; /** Basic function to execute shift function for tensor. This function applies to fix alignment issue on OpenGL ES: * * @note This alignment issue is introduced by limits of compute shader which requires 32/64/128bit alignment for data access on OpenGL ES + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class GCTensorShift : public IGCSimpleFunction { diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h index a37031bfee..84e303aa25 100644 --- a/arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -34,6 +34,8 @@ class IGCTensor; * * -# @ref GCTransposeKernel * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class GCTranspose : public IGCSimpleFunction { diff --git a/arm_compute/runtime/IOperator.h b/arm_compute/runtime/IOperator.h index e7952bb748..fd285160e9 100644 --- a/arm_compute/runtime/IOperator.h +++ b/arm_compute/runtime/IOperator.h @@ -24,12 +24,13 @@ #ifndef ARM_COMPUTE_IOPERATOR_H #define ARM_COMPUTE_IOPERATOR_H -#include "arm_compute/runtime/IOperator.h" +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/IRuntimeContext.h" #include "arm_compute/runtime/Types.h" namespace arm_compute { +class ITensorPack; namespace experimental { /** Base class for all async functions */ diff --git a/arm_compute/runtime/IScheduler.h b/arm_compute/runtime/IScheduler.h index 98627538e8..309aee3bb5 100644 --- a/arm_compute/runtime/IScheduler.h +++ b/arm_compute/runtime/IScheduler.h @@ -205,6 +205,8 @@ class IScheduler virtual void run_workloads(std::vector<Workload> &workloads) = 0; CPUInfo _cpu_info; + void schedule_common(ICPPKernel *kernel, const Hints &hints, ITensorPack &tensors); + private: unsigned int _num_threads_hint = {}; }; diff --git a/arm_compute/runtime/ITransformWeights.h b/arm_compute/runtime/ITransformWeights.h index 2e2e764c8e..9392be05e5 100644 --- a/arm_compute/runtime/ITransformWeights.h +++ b/arm_compute/runtime/ITransformWeights.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,6 +25,7 @@ #define ARM_COMPUTE_ITRANSFORMWEIGHTS_H #include <atomic> +#include <utility> namespace arm_compute { @@ -124,4 +125,4 @@ class ITransformWeights }; } // arm_compute -#endif /*ARM_COMPUTE_ITRANSFORMWEIGHTS_H */ \ No newline at end of file +#endif /*ARM_COMPUTE_ITRANSFORMWEIGHTS_H */ diff --git a/arm_compute/runtime/Macros.h b/arm_compute/runtime/Macros.h new file mode 100644 index 0000000000..aa019d104b --- /dev/null +++ b/arm_compute/runtime/Macros.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ +#ifndef ARM_COMPUTE_MACROS_H +#define ARM_COMPUTE_MACROS_H + +#define ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE_INC(TypeName) \ + TypeName(const TypeName &) = delete; \ + TypeName &operator=(const TypeName &) = delete; \ + TypeName(TypeName &&) = default; \ + TypeName &operator=(TypeName &&); + +#endif /* ARM_COMPUTE_MACROS_H */ diff --git a/arm_compute/runtime/NEON/INEOperator.h b/arm_compute/runtime/NEON/INEOperator.h index 415e767eec..a5ffc74940 100644 --- a/arm_compute/runtime/NEON/INEOperator.h +++ b/arm_compute/runtime/NEON/INEOperator.h @@ -25,7 +25,6 @@ #define ARM_COMPUTE_INEOPERATOR_H #include "../../core/ITensor.h" -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/runtime/IOperator.h" #include "arm_compute/runtime/IRuntimeContext.h" #include "arm_compute/runtime/Types.h" @@ -34,6 +33,8 @@ namespace arm_compute { +class ICPPKernel; +using INEKernel = ICPPKernel; namespace experimental { /** Basic interface for functions which have a single async NEON kernel */ @@ -53,6 +54,8 @@ class INEOperator : public IOperator INEOperator &operator=(const INEOperator &) = delete; /** Default move assignment operator */ INEOperator &operator=(INEOperator &&) = default; + /** Default destructor */ + ~INEOperator(); // Inherited methods overridden: void run(ITensorPack &tensors) override; diff --git a/arm_compute/runtime/NEON/INESimpleFunction.h b/arm_compute/runtime/NEON/INESimpleFunction.h index 7f2ed2e16f..979a0f7f07 100644 --- a/arm_compute/runtime/NEON/INESimpleFunction.h +++ b/arm_compute/runtime/NEON/INESimpleFunction.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,27 +24,38 @@ #ifndef ARM_COMPUTE_INESIMPLEFUNCTION_H #define ARM_COMPUTE_INESIMPLEFUNCTION_H -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" #include "arm_compute/runtime/IFunction.h" #include <memory> namespace arm_compute { +class ICPPKernel; +class NEFillBorderKernel; +using INEKernel = ICPPKernel; /** Basic interface for functions which have a single NEON kernel */ class INESimpleFunction : public IFunction { public: /** Constructor */ INESimpleFunction(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + INESimpleFunction(const INESimpleFunction &) = delete; + /** Default move constructor */ + INESimpleFunction(INESimpleFunction &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + INESimpleFunction &operator=(const INESimpleFunction &) = delete; + /** Default move assignment operator */ + INESimpleFunction &operator=(INESimpleFunction &&) = default; + /** Default destructor */ + ~INESimpleFunction(); // Inherited methods overridden: void run() override final; protected: - std::unique_ptr<INEKernel> _kernel; /**< Kernel to run */ - NEFillBorderKernel _border_handler; /**< Kernel to handle image borders */ + std::unique_ptr<INEKernel> _kernel; /**< Kernel to run */ + std::unique_ptr<NEFillBorderKernel> _border_handler; /**< Kernel to handle image borders */ }; } #endif /*ARM_COMPUTE_INESIMPLEFUNCTION_H */ diff --git a/arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h b/arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h index 7d352eb82b..9df0d78526 100644 --- a/arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h +++ b/arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -24,7 +24,6 @@ #ifndef ARM_COMPUTE_INESIMPLEFUNCTIONNOBORDER_H #define ARM_COMPUTE_INESIMPLEFUNCTIONNOBORDER_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IRuntimeContext.h" @@ -32,6 +31,8 @@ namespace arm_compute { +class ICPPKernel; +using INEKernel = ICPPKernel; /** Basic interface for functions which have a single NEON kernel and no border */ class INESimpleFunctionNoBorder : public IFunction { @@ -49,6 +50,8 @@ class INESimpleFunctionNoBorder : public IFunction INESimpleFunctionNoBorder &operator=(const INESimpleFunctionNoBorder &) = delete; /** Default move assignment operator */ INESimpleFunctionNoBorder &operator=(INESimpleFunctionNoBorder &&) = default; + /** Default destructor */ + ~INESimpleFunctionNoBorder(); // Inherited methods overridden: void run() override final; diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h index 763294e7da..3952d499de 100644 --- a/arm_compute/runtime/NEON/NEFunctions.h +++ b/arm_compute/runtime/NEON/NEFunctions.h @@ -78,9 +78,9 @@ #include "arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h" #include "arm_compute/runtime/NEON/functions/NEGEMM.h" #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h" #include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h" #include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" #include "arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h" @@ -104,6 +104,7 @@ #include "arm_compute/runtime/NEON/functions/NELaplacianPyramid.h" #include "arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h" #include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h" +#include "arm_compute/runtime/NEON/functions/NELogical.h" #include "arm_compute/runtime/NEON/functions/NEMagnitude.h" #include "arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h" #include "arm_compute/runtime/NEON/functions/NEMeanStdDev.h" @@ -136,7 +137,6 @@ #include "arm_compute/runtime/NEON/functions/NEScale.h" #include "arm_compute/runtime/NEON/functions/NEScharr3x3.h" #include "arm_compute/runtime/NEON/functions/NESelect.h" -#include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h" #include "arm_compute/runtime/NEON/functions/NESlice.h" #include "arm_compute/runtime/NEON/functions/NESobel3x3.h" #include "arm_compute/runtime/NEON/functions/NESobel5x5.h" diff --git a/arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h b/arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h index 7b35e6db9e..f00b144475 100644 --- a/arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h +++ b/arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEABSOLUTEDIFFERENCE_H #define ARM_COMPUTE_NEABSOLUTEDIFFERENCE_H -#include "arm_compute/runtime/NEON/INESimpleFunction.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" namespace arm_compute { @@ -34,10 +34,25 @@ class ITensor; * * @note The image data type for the inputs must be U8 or S16 * @note The function calculates the absolute difference also when the 2 inputs have different image data types + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ -class NEAbsoluteDifference : public INESimpleFunction +class NEAbsoluteDifference : public INESimpleFunctionNoBorder { public: + /** Default constructor */ + NEAbsoluteDifference() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEAbsoluteDifference(const NEAbsoluteDifference &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEAbsoluteDifference &operator=(const NEAbsoluteDifference &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEAbsoluteDifference(NEAbsoluteDifference &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEAbsoluteDifference &operator=(NEAbsoluteDifference &&) = delete; + /** Default destructor */ + ~NEAbsoluteDifference(); /** Set the inputs and output images * * @param[in] input1 Source tensor. Data types supported: U8/S16. diff --git a/arm_compute/runtime/NEON/functions/NEAccumulate.h b/arm_compute/runtime/NEON/functions/NEAccumulate.h index f403a7772b..1881411880 100644 --- a/arm_compute/runtime/NEON/functions/NEAccumulate.h +++ b/arm_compute/runtime/NEON/functions/NEAccumulate.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -32,10 +32,26 @@ namespace arm_compute { class ITensor; -/** Basic function to run @ref NEAccumulateKernel */ +/** Basic function to run @ref NEAccumulateKernel + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * +*/ class NEAccumulate : public INESimpleFunctionNoBorder { public: + /** Default constructor */ + NEAccumulate() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEAccumulate(const NEAccumulate &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEAccumulate &operator=(const NEAccumulate &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEAccumulate(NEAccumulate &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEAccumulate &operator=(NEAccumulate &&) = delete; + /** Default destructor */ + ~NEAccumulate(); /** Set the input and accumulation tensors * * @param[in] input Source tensor. Data type supported: U8. 
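Aside (illustration, not part of the patch): a usage sketch for the ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE_INC macro added in Macros.h above; MyFunction is a hypothetical name. Note that the macro only declares the move-assignment operator (a plausible reading of the _INC suffix: for classes whose move assignment must be defined in the .cpp, where forward-declared kernel members are complete types):

// MyFunction.h (hypothetical)
#include "arm_compute/runtime/Macros.h"

class MyFunction
{
public:
    MyFunction();
    ~MyFunction();
    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE_INC(MyFunction)
    // expands to:
    //   MyFunction(const MyFunction &) = delete;
    //   MyFunction &operator=(const MyFunction &) = delete;
    //   MyFunction(MyFunction &&) = default;
    //   MyFunction &operator=(MyFunction &&);   // declaration only
};

// MyFunction.cpp -- out-of-line defaulting is legal C++ and keeps the
// definition where all member types are complete:
// MyFunction &MyFunction::operator=(MyFunction &&) = default;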
@@ -44,10 +60,26 @@ class NEAccumulate : public INESimpleFunctionNoBorder void configure(const ITensor *input, ITensor *output); }; -/** Basic function to run @ref NEAccumulateWeightedKernel */ +/** Basic function to run @ref NEAccumulateWeightedKernel + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * +*/ class NEAccumulateWeighted : public INESimpleFunctionNoBorder { public: + /** Default constructor */ + NEAccumulateWeighted() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEAccumulateWeighted(const NEAccumulateWeighted &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEAccumulateWeighted &operator=(const NEAccumulateWeighted &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEAccumulateWeighted(NEAccumulateWeighted &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEAccumulateWeighted &operator=(NEAccumulateWeighted &&) = delete; + /** Default destructor */ + ~NEAccumulateWeighted(); /** Set the input and accumulation tensors, and the scale value * * @param[in] input Source tensor. Data type supported: U8. @@ -58,10 +90,26 @@ class NEAccumulateWeighted : public INESimpleFunctionNoBorder void configure(const ITensor *input, float alpha, ITensor *output, bool use_fp16 = false); }; -/** Basic function to run @ref NEAccumulateSquaredKernel */ +/** Basic function to run @ref NEAccumulateSquaredKernel + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * +*/ class NEAccumulateSquared : public INESimpleFunctionNoBorder { public: + /** Default constructor */ + NEAccumulateSquared() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEAccumulateSquared(const NEAccumulateSquared &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEAccumulateSquared &operator=(const NEAccumulateSquared &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEAccumulateSquared(NEAccumulateSquared &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEAccumulateSquared &operator=(NEAccumulateSquared &&) = delete; + /** Default destructor */ + ~NEAccumulateSquared(); /** Set the input and accumulation tensors and the shift value. * * @param[in] input Source tensor. Data type supported: U8. 
diff --git a/arm_compute/runtime/NEON/functions/NEActivationLayer.h b/arm_compute/runtime/NEON/functions/NEActivationLayer.h index cfece5c392..3f410fcd8c 100644 --- a/arm_compute/runtime/NEON/functions/NEActivationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEActivationLayer.h @@ -46,8 +46,6 @@ class NEActivationLayer : public IFunction * @param[in] ctx Runtime context to be used by the function */ NEActivationLayer(IRuntimeContext *ctx = nullptr); - /** Destructor */ - ~NEActivationLayer(); /** Prevent instances of this class from being copied (As this class contains pointers) */ NEActivationLayer(const NEActivationLayer &) = delete; /** Default move constructor */ @@ -56,6 +54,8 @@ class NEActivationLayer : public IFunction NEActivationLayer &operator=(const NEActivationLayer &) = delete; /** Default move assignment operator */ NEActivationLayer &operator=(NEActivationLayer &&); + /** Destructor */ + ~NEActivationLayer(); /** [NEActivationLayer snippet] **/ /** Set the input and output tensor. * @@ -93,6 +93,19 @@ namespace experimental class NEActivationLayer : public INEOperator { public: + /** Constructor */ + NEActivationLayer() = default; + /** Prevent instances of this class from being copied */ + NEActivationLayer(const NEActivationLayer &) = delete; + /** Default move constructor */ + NEActivationLayer(NEActivationLayer &&) = default; + /** Prevent instances of this class from being copied */ + NEActivationLayer &operator=(const NEActivationLayer &) = delete; + /** Default move assignment operator */ + NEActivationLayer &operator=(NEActivationLayer &&) = default; + /** Destructor */ + ~NEActivationLayer(); + /** Set the input and output tensor. * * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. diff --git a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h index 61762f37e1..4b13d1f44e 100644 --- a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h +++ b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h @@ -52,6 +52,16 @@ class NEArgMinMaxLayer : public IFunction public: /** Constructor */ NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEArgMinMaxLayer(const NEArgMinMaxLayer &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEArgMinMaxLayer &operator=(const NEArgMinMaxLayer &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEArgMinMaxLayer(NEArgMinMaxLayer &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEArgMinMaxLayer &operator=(NEArgMinMaxLayer &&) = delete; + /** Default destructor */ + ~NEArgMinMaxLayer(); /** Set the input and output tensors. * * @param[in] input Input source tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/S32/F16/F32.
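Aside (illustration, not part of the patch): the experimental::NEActivationLayer shown above is configured from tensor infos only and receives the actual tensors late, at run time, through an ITensorPack. A sketch of the intended call pattern, assuming the 20.11 operator API (pack entries keyed by TensorType::ACL_SRC/ACL_DST) behaves as its declarations suggest:

#include "arm_compute/core/ITensorPack.h" // assumed location of ITensorPack
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void activation_operator_sketch()
{
    const TensorInfo info(TensorShape(16U, 16U), 1, DataType::F32);
    TensorInfo       out_info(info);

    // Configure once from metadata only; no tensors are bound yet.
    experimental::NEActivationLayer act;
    act.configure(&info, &out_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

    Tensor src, dst;
    src.allocator()->init(info);
    dst.allocator()->init(out_info);
    src.allocator()->allocate();
    dst.allocator()->allocate();

    // Bind tensors per call through a pack; one operator can serve many tensors.
    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC, &src);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    act.run(pack);
}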
diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h index e10771ef4b..6aaa5ff4f7 100644 --- a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h +++ b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h @@ -38,6 +38,18 @@ namespace experimental class NEArithmeticAddition : public INEOperator { public: + /** Constructor */ + NEArithmeticAddition() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEArithmeticAddition(const NEArithmeticAddition &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEArithmeticAddition &operator=(const NEArithmeticAddition &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEArithmeticAddition(NEArithmeticAddition &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEArithmeticAddition &operator=(NEArithmeticAddition &&) = delete; + /** Default destructor */ + ~NEArithmeticAddition(); /** Initialise the kernel's inputs, output and conversion policy. * * Valid configurations (Input1,Input2) -> Output : diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h index a38335c59b..5d2475b3a4 100644 --- a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h +++ b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h @@ -36,7 +36,7 @@ namespace experimental { /** Basic function to run @ref NEArithmeticSubtractionKernel * - * @note The tensor data type for the inputs must be U8/QASYMM8/S16/F16/F32. + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. * @note The function performs an arithmetic subtraction between two tensors. * * This function calls the following kernels: @@ -56,12 +56,13 @@ class NEArithmeticSubtraction : public INEOperator * - (S16,U8) -> S16 * - (U8,S16) -> S16 * - (S16,S16) -> S16 + * - (S32,S32) -> S32 * - (F16,F16) -> F16 * - (F32,F32) -> F32 * - * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32 - * @param[in] input2 Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32 - * @param[out] output Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32 + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[in] input2 Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[out] output Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ @@ -77,12 +78,13 @@ class NEArithmeticSubtraction : public INEOperator * - (S16,U8) -> S16 * - (U8,S16) -> S16 * - (S16,S16) -> S16 + * - (S32,S32) -> S32 * - (F16,F16) -> F16 * - (F32,F32) -> F32 * - * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32 - * @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32 - * @param[in] output Output tensor. 
Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32 + * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32 + * @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32 + * @param[in] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32 * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. * @@ -94,7 +96,7 @@ class NEArithmeticSubtraction : public INEOperator /** Basic function to run @ref NEArithmeticSubtractionKernel * - * @note The tensor data type for the inputs must be U8/QASYMM8/S16/F16/F32. + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. * @note The function performs an arithmetic subtraction between two tensors. * * This function calls the following kernels: @@ -117,18 +119,18 @@ class NEArithmeticSubtraction : public IFunction NEArithmeticSubtraction &operator=(NEArithmeticSubtraction &&); /** Initialise the kernel's inputs, output and conversion policy. * - * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32 - * @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32 - * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32 + * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtraction * - * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32 - * @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32 - * @param[in] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32 + * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32 + * @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32 + * @param[in] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32 * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. 
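Aside (illustration, not part of the patch): with the S32 support documented above, (S32,S32) -> S32 joins the accepted configurations. A sketch of how a caller might exercise it through the function interface (standard runtime Tensor setup, not code taken from this patch):

#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void s32_subtraction_sketch()
{
    const TensorInfo info(TensorShape(8U, 4U), 1, DataType::S32);

    Tensor a, b, out;
    a.allocator()->init(info);
    b.allocator()->init(info);
    out.allocator()->init(info);

    // WRAP is fine here; the convert-policy restriction only concerns quantized types.
    NEArithmeticSubtraction sub;
    sub.configure(&a, &b, &out, ConvertPolicy::WRAP);

    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();

    sub.run(); // fill a and b before running in real code
}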
* diff --git a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h index 1f77164a43..6d56a267a7 100644 --- a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,14 +24,16 @@ #ifndef ARM_COMPUTE_NEBATCHNORMALIZATIONLAYER_H #define ARM_COMPUTE_NEBATCHNORMALIZATIONLAYER_H -#include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +#include <memory> + namespace arm_compute { class ITensor; +class NEBatchNormalizationLayerKernel; /** Basic function to run @ref NENormalizationLayerKernel and simulate a batch normalization layer. * * @@ -42,8 +44,18 @@ class ITensor; class NEBatchNormalizationLayer : public IFunction { public: - /** Default constructor */ + /** Constructor */ NEBatchNormalizationLayer(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEBatchNormalizationLayer(const NEBatchNormalizationLayer &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEBatchNormalizationLayer &operator=(const NEBatchNormalizationLayer &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEBatchNormalizationLayer(NEBatchNormalizationLayer &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEBatchNormalizationLayer &operator=(NEBatchNormalizationLayer &&) = delete; + /** Default destructor */ + ~NEBatchNormalizationLayer(); /** Set the input and output tensors. * * @note If the output tensor is a nullptr or is equal to the input, the batch normalization function will be performed in-place @@ -85,7 +97,7 @@ class NEBatchNormalizationLayer : public IFunction void run() override; private: - NEBatchNormalizationLayerKernel _norm_kernel; /**< Batch normalization layer kernel */ + std::unique_ptr<NEBatchNormalizationLayerKernel> _norm_kernel; /**< Batch normalization layer kernel */ }; } #endif /* ARM_COMPUTE_NEBATCHNORMALIZATIONLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h b/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h index 1a6ffa9506..c2fd26d34c 100644 --- a/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h +++ b/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,18 +26,30 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/NEON/kernels/NEBatchToSpaceLayerKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" namespace arm_compute { class ITensor; +class ITensorInfo; /** Basic function to run @ref NEBatchToSpaceLayerKernel.
*/ class NEBatchToSpaceLayer : public INESimpleFunctionNoBorder { public: + /** Constructor */ + NEBatchToSpaceLayer() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEBatchToSpaceLayer(const NEBatchToSpaceLayer &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEBatchToSpaceLayer &operator=(const NEBatchToSpaceLayer &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEBatchToSpaceLayer(NEBatchToSpaceLayer &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEBatchToSpaceLayer &operator=(NEBatchToSpaceLayer &&) = delete; + /** Default destructor */ + ~NEBatchToSpaceLayer() = default; /** Set the input and output tensors. * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h b/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h index c612a146ac..3203d2b9a7 100644 --- a/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h +++ b/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -34,6 +34,18 @@ class ITensor; class NEBitwiseAnd : public INESimpleFunctionNoBorder { public: + /** Constructor */ + NEBitwiseAnd() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEBitwiseAnd(const NEBitwiseAnd &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEBitwiseAnd &operator=(const NEBitwiseAnd &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEBitwiseAnd(NEBitwiseAnd &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEBitwiseAnd &operator=(NEBitwiseAnd &&) = delete; + /** Default destructor */ + ~NEBitwiseAnd() = default; /** Initialise the kernel's inputs and output * * @param[in] input1 First tensor input. Data type supported: U8. diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseNot.h b/arm_compute/runtime/NEON/functions/NEBitwiseNot.h index f6ef975dc7..9fa0d38caf 100644 --- a/arm_compute/runtime/NEON/functions/NEBitwiseNot.h +++ b/arm_compute/runtime/NEON/functions/NEBitwiseNot.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseOr.h b/arm_compute/runtime/NEON/functions/NEBitwiseOr.h index 8fc4b0d362..fba6b784de 100644 --- a/arm_compute/runtime/NEON/functions/NEBitwiseOr.h +++ b/arm_compute/runtime/NEON/functions/NEBitwiseOr.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseXor.h b/arm_compute/runtime/NEON/functions/NEBitwiseXor.h index 20e23af234..c6cb584284 100644 --- a/arm_compute/runtime/NEON/functions/NEBitwiseXor.h +++ b/arm_compute/runtime/NEON/functions/NEBitwiseXor.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * diff --git a/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h b/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h index 14d5de4ca4..de8dfef4ed 100644 --- a/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h +++ b/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h @@ -24,19 +24,20 @@ #ifndef ARM_COMPUTE_NEBOUNDINGBOXTRANSOFORM_H #define ARM_COMPUTE_NEBOUNDINGBOXTRANSOFORM_H -#include "arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h" -#include "arm_compute/runtime/NEON/INESimpleFunction.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" namespace arm_compute { class ITensor; +class ITensorInfo; /** Basic function to run @ref NEBoundingBoxTransformKernel. * * This function calls the following Neon kernels: * -# @ref NEBoundingBoxTransformKernel */ -class NEBoundingBoxTransform : public INESimpleFunction +class NEBoundingBoxTransform : public INESimpleFunctionNoBorder { public: /** Set the input and output tensors. diff --git a/arm_compute/runtime/NEON/functions/NEBox3x3.h b/arm_compute/runtime/NEON/functions/NEBox3x3.h index 80cd5084ab..d65c2be885 100644 --- a/arm_compute/runtime/NEON/functions/NEBox3x3.h +++ b/arm_compute/runtime/NEON/functions/NEBox3x3.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -38,6 +38,8 @@ class ITensor; * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE) * -# @ref NEBox3x3Kernel * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class NEBox3x3 : public INESimpleFunction { diff --git a/arm_compute/runtime/NEON/functions/NECannyEdge.h b/arm_compute/runtime/NEON/functions/NECannyEdge.h index f171c3bed0..7cdb8ee38e 100644 --- a/arm_compute/runtime/NEON/functions/NECannyEdge.h +++ b/arm_compute/runtime/NEON/functions/NECannyEdge.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,8 +24,6 @@ #ifndef ARM_COMPUTE_NECANNYEDGE_H #define ARM_COMPUTE_NECANNYEDGE_H -#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h" -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" @@ -38,6 +36,10 @@ namespace arm_compute { class ITensor; +class NEGradientKernel; +class NEFillBorderKernel; +class NEEdgeNonMaxSuppressionKernel; +class NEEdgeTraceKernel; /** Basic function to execute canny edge on NEON. This function calls the following NEON kernels and functions: * @@ -49,6 +51,9 @@ class ITensor; * -# @ref NEEdgeNonMaxSuppressionKernel * -# @ref NEEdgeTraceKernel * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * + * */ class NECannyEdge : public IFunction { @@ -64,6 +69,8 @@ class NECannyEdge : public IFunction NECannyEdge(const NECannyEdge &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ NECannyEdge &operator=(const NECannyEdge &) = delete; + /** Default destructor */ + ~NECannyEdge(); /** Initialise the function's source, destination, thresholds, gradient size, normalization type and border mode. * * @param[in, out] input Source tensor. Data type supported: U8. 
(Written to only for @p border_mode != UNDEFINED) @@ -81,19 +88,19 @@ class NECannyEdge : public IFunction void run() override; private: - MemoryGroup _memory_group; /**< Function's memory group */ - std::unique_ptr<IFunction> _sobel; /**< Pointer to Sobel kernel */ - std::unique_ptr<INEKernel> _gradient; /**< Gradient kernel */ - NEEdgeNonMaxSuppressionKernel _non_max_suppr; /**< Non-Maxima suppression kernel */ - NEEdgeTraceKernel _edge_trace; /**< Edge tracing kernel */ - NEFillBorderKernel _border_mag_gradient; /**< Fill border on magnitude tensor kernel */ - NEFillBorderKernel _border_edge_trace; /**< Fill border before edge trace */ - Tensor _gx; /**< Source tensor - Gx component */ - Tensor _gy; /**< Source tensor - Gy component */ - Tensor _magnitude; /**< Source tensor - Magnitude */ - Tensor _phase; /**< Source tensor - Phase */ - Tensor _nonmax; /**< Source tensor - Non-Maxima suppressed */ - ITensor *_output; /**< Output tensor provided by the user. */ + MemoryGroup _memory_group; /**< Function's memory group */ + std::unique_ptr<IFunction> _sobel; /**< Pointer to Sobel kernel */ + std::unique_ptr<NEGradientKernel> _gradient; /**< Gradient kernel */ + std::unique_ptr<NEEdgeNonMaxSuppressionKernel> _non_max_suppr; /**< Non-Maxima suppression kernel */ + std::unique_ptr<NEEdgeTraceKernel> _edge_trace; /**< Edge tracing kernel */ + std::unique_ptr<NEFillBorderKernel> _border_mag_gradient; /**< Fill border on magnitude tensor kernel */ + std::unique_ptr<NEFillBorderKernel> _border_edge_trace; /**< Fill border before edge trace */ + Tensor _gx; /**< Source tensor - Gx component */ + Tensor _gy; /**< Source tensor - Gy component */ + Tensor _magnitude; /**< Source tensor - Magnitude */ + Tensor _phase; /**< Source tensor - Phase */ + Tensor _nonmax; /**< Source tensor - Non-Maxima suppressed */ + ITensor *_output; /**< Output tensor provided by the user. */ }; } #endif /* ARM_COMPUTE_NECANNYEDGE_H */ diff --git a/arm_compute/runtime/NEON/functions/NECast.h b/arm_compute/runtime/NEON/functions/NECast.h index ca818bea27..e536317660 100644 --- a/arm_compute/runtime/NEON/functions/NECast.h +++ b/arm_compute/runtime/NEON/functions/NECast.h @@ -25,16 +25,17 @@ #define ARM_COMPUTE_NECAST_H #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/INESimpleFunction.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" namespace arm_compute { class ITensor; +class ITensorInfo; /** Basic function to run @ref NEDepthConvertLayerKernel. * This function ignores the scale and zeroPoint of quantized tensors, so QASYMM8 input is treated as uint8 values. */ -class NECast : public INESimpleFunction +class NECast : public INESimpleFunctionNoBorder { public: /** Initialize the function's source, destination diff --git a/arm_compute/runtime/NEON/functions/NEChannelCombine.h b/arm_compute/runtime/NEON/functions/NEChannelCombine.h index c4ced62e72..c4ead73343 100644 --- a/arm_compute/runtime/NEON/functions/NEChannelCombine.h +++ b/arm_compute/runtime/NEON/functions/NEChannelCombine.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -32,7 +32,11 @@ class IMultiImage; class ITensor; using IImage = ITensor; -/**Basic function to run @ref NEChannelCombineKernel to perform channel combination. */ +/**Basic function to run @ref NEChannelCombineKernel to perform channel combination.
+ * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * +*/ class NEChannelCombine : public INESimpleFunctionNoBorder { public: diff --git a/arm_compute/runtime/NEON/functions/NEChannelExtract.h b/arm_compute/runtime/NEON/functions/NEChannelExtract.h index 54059e91e1..99522d2d74 100644 --- a/arm_compute/runtime/NEON/functions/NEChannelExtract.h +++ b/arm_compute/runtime/NEON/functions/NEChannelExtract.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -33,7 +33,11 @@ class IMultiImage; class ITensor; using IImage = ITensor; -/**Basic function to run @ref NEChannelExtractKernel to perform channel extraction. */ +/**Basic function to run @ref NEChannelExtractKernel to perform channel extraction. + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * +*/ class NEChannelExtract : public INESimpleFunctionNoBorder { public: diff --git a/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h b/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h index f31518e85b..aa11396c20 100644 --- a/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h +++ b/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,12 +24,14 @@ #ifndef ARM_COMPUTE_NECHANNELSHUFFLELAYER_H #define ARM_COMPUTE_NECHANNELSHUFFLELAYER_H +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" namespace arm_compute { // Forward declarations class ITensor; +class ITensorInfo; /** Basic function to run @ref NEChannelShuffleLayerKernel * diff --git a/arm_compute/runtime/NEON/functions/NECol2Im.h b/arm_compute/runtime/NEON/functions/NECol2Im.h index e03ec42c4f..69459a83c1 100644 --- a/arm_compute/runtime/NEON/functions/NECol2Im.h +++ b/arm_compute/runtime/NEON/functions/NECol2Im.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,12 +26,13 @@ #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#include "arm_compute/core/Error.h" #include "arm_compute/core/Size2D.h" -#include "arm_compute/core/Types.h" namespace arm_compute { class ITensor; +class ITensorInfo; /** Basic function to run @ref NECol2Im */ class NECol2Im : public INESimpleFunctionNoBorder diff --git a/arm_compute/runtime/NEON/functions/NEColorConvert.h b/arm_compute/runtime/NEON/functions/NEColorConvert.h index b4c4158804..8974aa63a1 100644 --- a/arm_compute/runtime/NEON/functions/NEColorConvert.h +++ b/arm_compute/runtime/NEON/functions/NEColorConvert.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -32,7 +32,11 @@ class ITensor; class IMultiImage; using IImage = ITensor; -/**Basic function to run @ref NEColorConvertKernel to perform color conversion */ +/**Basic function to run @ref NEColorConvertKernel to perform color conversion + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * + */ class NEColorConvert : public INESimpleFunctionNoBorder { public: diff --git a/arm_compute/runtime/NEON/functions/NEComputeAllAnchors.h b/arm_compute/runtime/NEON/functions/NEComputeAllAnchors.h index 44f3f860cf..b63243fec6 100644 --- a/arm_compute/runtime/NEON/functions/NEComputeAllAnchors.h +++ b/arm_compute/runtime/NEON/functions/NEComputeAllAnchors.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,19 +24,20 @@ #ifndef ARM_COMPUTE_NECOMPUTEALLANCHORS_H #define ARM_COMPUTE_NECOMPUTEALLANCHORS_H -#include "arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h" -#include "arm_compute/runtime/NEON/INESimpleFunction.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" namespace arm_compute { class ITensor; +class ITensorInfo; /** Basic function to run @ref NEComputeAllAnchorsKernel. * * This function calls the following NEON kernels: * -# @ref NEComputeAllAnchorsKernel */ -class NEComputeAllAnchors : public INESimpleFunction +class NEComputeAllAnchors : public INESimpleFunctionNoBorder { public: /** Set the input and output tensors. diff --git a/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h b/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h index 1d703ae729..fd35d0bc46 100644 --- a/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h +++ b/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h @@ -26,10 +26,9 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Requires.h" #include "arm_compute/runtime/NEON/INEOperator.h" +#include "support/Requires.h" #include <memory> #include <vector> @@ -106,8 +105,18 @@ namespace experimental class NEConcatenation : public INEOperator { public: - /** Default constructor */ + /** Constructor */ NEConcatenation(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEConcatenation(const NEConcatenation &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEConcatenation &operator=(const NEConcatenation &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEConcatenation(NEConcatenation &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEConcatenation &operator=(NEConcatenation &&) = delete; + /** Default destructor */ + ~NEConcatenation() = default; /** Initialise the kernel's inputs vector and output. * * @note Input and output tensor dimensions preconditions differ depending on the concatenation axis.
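The constructor/copy/move blocks added to NEConcatenation above repeat across nearly every function in this patch. A generic sketch of the idiom (placeholder names, not library classes) showing why both copy and move are deleted once kernels are owned through pointers:

```cpp
// Generic sketch of the idiom (placeholder names, not library classes):
#include <memory>

class Kernel; // forward declaration keeps the kernel header out of this file

class Function
{
public:
    Function();                          // defined out-of-line, where Kernel is complete
    Function(const Function &) = delete; // copying would duplicate or share kernel state
    Function &operator=(const Function &) = delete;
    Function(Function &&) = delete;      // a moved-from object would hold a null kernel
    Function &operator=(Function &&) = delete;
    ~Function();                         // must be out-of-line too: destroying a
                                         // std::unique_ptr requires the complete type
private:
    std::unique_ptr<Kernel> _kernel;
};
```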
@@ -135,9 +144,9 @@ class NEConcatenation : public INEOperator void run(ITensorPack &tensors) override; private: - std::vector<std::unique_ptr<INEKernel>> _concat_kernels; - unsigned int _num_inputs; - unsigned int _axis; + std::vector<std::unique_ptr<ICPPKernel>> _concat_kernels; + unsigned int _num_inputs; + unsigned int _axis; }; } // namespace experimental } // namespace arm_compute diff --git a/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h b/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h index 42a62dc0ab..984e8d68c0 100644 --- a/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h +++ b/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,16 +24,17 @@ #ifndef ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H #define ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H -#include "arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/ITransformWeights.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/Tensor.h" +#include <memory> namespace arm_compute { // Forward declarations class ITensor; +class NEConvertFullyConnectedWeightsKernel; /** Basic function to run @ref NEConvertFullyConnectedWeightsKernel. */ class NEConvertFullyConnectedWeights : public IFunction @@ -41,6 +42,16 @@ class NEConvertFullyConnectedWeights : public IFunction public: /** Default constructor */ NEConvertFullyConnectedWeights(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEConvertFullyConnectedWeights(const NEConvertFullyConnectedWeights &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEConvertFullyConnectedWeights &operator=(const NEConvertFullyConnectedWeights &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEConvertFullyConnectedWeights(NEConvertFullyConnectedWeights &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEConvertFullyConnectedWeights &operator=(NEConvertFullyConnectedWeights &&) = delete; + /** Default destructor */ + ~NEConvertFullyConnectedWeights(); /** Initialize the function. * * @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: All. @@ -64,7 +75,7 @@ class NEConvertFullyConnectedWeights : public IFunction void run() override; private: - NEConvertFullyConnectedWeightsKernel _kernel; + std::unique_ptr<NEConvertFullyConnectedWeightsKernel> _kernel; }; namespace weights_transformations diff --git a/arm_compute/runtime/NEON/functions/NEConvolution.h b/arm_compute/runtime/NEON/functions/NEConvolution.h index eb16a4582e..afd654a595 100644 --- a/arm_compute/runtime/NEON/functions/NEConvolution.h +++ b/arm_compute/runtime/NEON/functions/NEConvolution.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -24,8 +24,6 @@ #ifndef ARM_COMPUTE_NECONVOLUTION_H #define ARM_COMPUTE_NECONVOLUTION_H -#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h" -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" @@ -39,16 +37,37 @@ namespace arm_compute { class ITensor; +class NEFillBorderKernel; +template <unsigned int matrix_size> +class NEConvolutionKernel; +template <unsigned int matrix_size> +class NESeparableConvolutionHorKernel; +template <unsigned int matrix_size> +class NESeparableConvolutionVertKernel; /** Basic function to execute convolution of size 3x3. This function calls the following NEON kernels: * * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE) * -# @ref NEConvolution3x3Kernel * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class NEConvolution3x3 : public INESimpleFunction { public: + /** Constructor */ + NEConvolution3x3() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEConvolution3x3(const NEConvolution3x3 &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEConvolution3x3 &operator=(const NEConvolution3x3 &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEConvolution3x3(NEConvolution3x3 &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEConvolution3x3 &operator=(NEConvolution3x3 &&) = delete; + /** Default destructor */ + ~NEConvolution3x3(); /** Initialize the function's source, destination, conv and border_mode. * * @param[in,out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED) @@ -67,6 +86,8 @@ class NEConvolution3x3 : public INESimpleFunction * -# @ref NEConvolutionKernel or
* @ref NESeparableConvolutionHorKernel and @ref NESeparableConvolutionVertKernel (if convolution matrix is separable) * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ template <unsigned int matrix_size> class NEConvolutionSquare : public IFunction @@ -74,6 +95,16 @@ class NEConvolutionSquare : public IFunction public: /** Default constructor */ NEConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEConvolutionSquare(const NEConvolutionSquare &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEConvolutionSquare &operator=(const NEConvolutionSquare &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEConvolutionSquare(NEConvolutionSquare &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEConvolutionSquare &operator=(NEConvolutionSquare &&) = delete; + /** Default destructor */ + ~NEConvolutionSquare(); /** Initialize the function's source, destination, conv and border_mode. * * @param[in,out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED) @@ -89,13 +120,13 @@ class NEConvolutionSquare : public IFunction void run() override; private: - MemoryGroup _memory_group; /**< Function memory group */ - Tensor _tmp; /**< temporary buffer for output of horizontal pass */ - bool _is_separable; /**< true if the convolution can be separated */ - NESeparableConvolutionHorKernel<matrix_size> _kernel_hor; /**< kernel for horizontal pass of separated convolution */ - NESeparableConvolutionVertKernel<matrix_size> _kernel_vert; /**< kernel for vertical pass of separated convolution */ - NEConvolutionKernel<matrix_size> _kernel; /**< kernel for non-separated convolution **/ - NEFillBorderKernel _border_handler; /**< kernel for border handling */ + MemoryGroup _memory_group; /**< Function memory group */ + Tensor _tmp; /**< temporary buffer for output of horizontal pass */ + bool _is_separable; /**< true if the convolution can be separated */ + std::unique_ptr<NESeparableConvolutionHorKernel<matrix_size>> _kernel_hor; /**< kernel for horizontal pass of separated convolution */ + std::unique_ptr<NESeparableConvolutionVertKernel<matrix_size>> _kernel_vert; /**< kernel for vertical pass of separated convolution */ + std::unique_ptr<NEConvolutionKernel<matrix_size>> _kernel; /**< kernel for non-separated convolution **/ + std::unique_ptr<NEFillBorderKernel> _border_handler; /**< kernel for border handling */ }; /** Basic function to run 5x5 convolution. */ @@ -111,10 +142,25 @@ using NEConvolution9x9 = NEConvolutionSquare<9>; * -# @ref NEConvolutionRectangleKernel or
* * @note Convolution rectangle should have dimensions of 3, 5, 7, 9 + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class NEConvolutionRectangle : public INESimpleFunction { public: + /** Constructor */ + NEConvolutionRectangle() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEConvolutionRectangle(const NEConvolutionRectangle &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEConvolutionRectangle &operator=(const NEConvolutionRectangle &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEConvolutionRectangle(NEConvolutionRectangle &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEConvolutionRectangle &operator=(NEConvolutionRectangle &&) = delete; + /** Default destructor */ + ~NEConvolutionRectangle(); /** Initialize the function's source, destination, conv and border_mode. * * @param[in,out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED) diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h index e8b425b459..a061dc7b04 100644 --- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,16 +26,15 @@ #include "arm_compute/runtime/IFunction.h" +#include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" -#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h" -#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h" + #include <memory> namespace arm_compute { +// Forward declarations class ITensor; /** Basic function to simulate a convolution layer. This function calls one of the following NEON functions: @@ -75,7 +74,16 @@ class NEConvolutionLayer : public IFunction public: /** Constructor */ NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); - + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEConvolutionLayer(const NEConvolutionLayer &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEConvolutionLayer &operator=(const NEConvolutionLayer &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEConvolutionLayer(NEConvolutionLayer &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEConvolutionLayer &operator=(NEConvolutionLayer &&) = delete; + /** Default destructor */ + ~NEConvolutionLayer() = default; /** Set the input and output tensors. * * @param[in] input Source tensor.
3 lower dimensions represent a single input [width, height, IFM], @@ -149,5 +157,5 @@ class NEConvolutionLayer : public IFunction std::shared_ptr<IMemoryManager> _memory_manager; std::unique_ptr<IFunction> _function; /**< Function to run */ }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_NECONVOLUTIONLAYER_H */ \ No newline at end of file diff --git a/arm_compute/runtime/NEON/functions/NECopy.h b/arm_compute/runtime/NEON/functions/NECopy.h index df1a49863a..a58ac9e620 100644 --- a/arm_compute/runtime/NEON/functions/NECopy.h +++ b/arm_compute/runtime/NEON/functions/NECopy.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,11 +30,24 @@ namespace arm_compute { class ITensor; +class ITensorInfo; /** Basic function to run @ref NECopyKernel */ class NECopy : public INESimpleFunctionNoBorder { public: + /** Constructor */ + NECopy() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NECopy(const NECopy &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NECopy &operator=(const NECopy &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NECopy(NECopy &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NECopy &operator=(NECopy &&) = delete; + /** Default destructor */ + ~NECopy(); /** Initialise the function's source and destination. * * @param[in] input Source tensor. Data types supported: All diff --git a/arm_compute/runtime/NEON/functions/NECropResize.h b/arm_compute/runtime/NEON/functions/NECropResize.h index 361c236293..5c3733f8ee 100644 --- a/arm_compute/runtime/NEON/functions/NECropResize.h +++ b/arm_compute/runtime/NEON/functions/NECropResize.h @@ -24,7 +24,6 @@ #ifndef ARM_COMPUTE_NEON_CROP_RESIZE_H #define ARM_COMPUTE_NEON_CROP_RESIZE_H -#include "arm_compute/core/NEON/kernels/NECropKernel.h" #include "arm_compute/runtime/NEON/functions/NEScale.h" #include <memory> @@ -33,6 +32,7 @@ namespace arm_compute { // Forward Declarations class ITensor; +class NECropKernel; /** Function to perform cropping and resizing */ class NECropResize : public IFunction @@ -49,7 +49,7 @@ class NECropResize : public IFunction /** Allow instances of this class to be moved */ NECropResize &operator=(NECropResize &&) = default; /** Default destructor */ - virtual ~NECropResize() = default; + ~NECropResize(); /** Configure kernel * diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h index 378fce70b3..97b1a47f64 100644 --- a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h @@ -82,10 +82,10 @@ class NEDeconvolutionLayer : public IFunction NEDeconvolutionLayer(const NEDeconvolutionLayer &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ NEDeconvolutionLayer &operator=(const NEDeconvolutionLayer &) = delete; - /** Allow instances of this class to be moved */ - NEDeconvolutionLayer(NEDeconvolutionLayer &&) = default; - /** Allow instances of this class to be moved */ - NEDeconvolutionLayer &operator=(NEDeconvolutionLayer &&) = default; + /** Prevent instances of this class from being moved (As this class contains pointers) */ + NEDeconvolutionLayer(NEDeconvolutionLayer &&) = delete; + /** Prevent instances of this
class from being moved (As this class contains pointers) */ + NEDeconvolutionLayer &operator=(NEDeconvolutionLayer &&) = delete; /** Default destructor */ virtual ~NEDeconvolutionLayer() = default; diff --git a/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h b/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h index 89f3958417..c9817a63c1 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h @@ -32,6 +32,7 @@ namespace arm_compute { class ITensor; +class ITensorInfo; /**Basic function to run @ref NEDepthConvertLayerKernel */ class NEDepthConvertLayer : public INESimpleFunctionNoBorder @@ -43,6 +44,8 @@ class NEDepthConvertLayer : public INESimpleFunctionNoBorder NEDepthConvertLayer(const NEDepthConvertLayer &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers)*/ const NEDepthConvertLayer &operator=(const NEDepthConvertLayer &) = delete; + /** Default destructor */ + ~NEDepthConvertLayer() = default; /** Initialize the function's source, destination * * Valid conversions Input -> Output : diff --git a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h index 22bbd6e716..51f7ff7770 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,7 +26,6 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" @@ -34,11 +33,24 @@ namespace arm_compute { // Forward declarations class ITensor; +class ITensorInfo; /** Basic function to run @ref NEDepthToSpaceLayerKernel. */ class NEDepthToSpaceLayer : public INESimpleFunctionNoBorder { public: + /** Constructor */ + NEDepthToSpaceLayer() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDepthToSpaceLayer(const NEDepthToSpaceLayer &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDepthToSpaceLayer &operator=(const NEDepthToSpaceLayer &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEDepthToSpaceLayer(NEDepthToSpaceLayer &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEDepthToSpaceLayer &operator=(NEDepthToSpaceLayer &&) = delete; + /** Default destructor */ + ~NEDepthToSpaceLayer() = default; /** Set the input and output tensors. * * @param[in] input Tensor input. Supported tensor rank: 4. 
Data types supported: All diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h index 116ac16ce7..dc70aec7ff 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h @@ -24,18 +24,16 @@ #ifndef ARM_COMPUTE_NEDEPTHWISECONVOLUTION_H #define ARM_COMPUTE_NEDEPTHWISECONVOLUTION_H -#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h" -#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h" -#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h" -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEPermute.h" #include "arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h" +#include <memory> namespace arm_compute { // Forward declarations class ITensor; +class NEDepthwiseConvolutionLayerNativeKernel; /** Function to execute a depthwise convolution. */ @@ -52,6 +50,8 @@ class NEDepthwiseConvolutionLayer : public IFunction NEDepthwiseConvolutionLayer &operator=(const NEDepthwiseConvolutionLayer &) = delete; /** Default move assignment operator */ NEDepthwiseConvolutionLayer &operator=(NEDepthwiseConvolutionLayer &&) = default; + /** Default destructor */ + ~NEDepthwiseConvolutionLayer(); /** Initialize the function's source, destination, weights and convolution information. * * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 @@ -134,6 +134,8 @@ class NEDepthwiseConvolutionLayer : public IFunction NEDepthwiseConvolutionLayerOptimizedInternal &operator=(const NEDepthwiseConvolutionLayerOptimizedInternal &) = delete; /** Default move assignment operator */ NEDepthwiseConvolutionLayerOptimizedInternal &operator=(NEDepthwiseConvolutionLayerOptimizedInternal &&) = default; + /** Default destructor */ + ~NEDepthwiseConvolutionLayerOptimizedInternal() = default; /** Initialize the function's source, destination, kernels and border_size. * * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). @@ -171,60 +173,23 @@ class NEDepthwiseConvolutionLayer : public IFunction void prepare() override; private: - /** Configure the kernels/functions for the generic pipeline. - * - * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[out] output Destination tensor. Data type supported: same as @p input. - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] act_info Activation layer information in case of a fused activation. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * - */ - void configure_generic(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation = Size2D(1U, 1U)); - /** Configure the kernels/functions for the optimized pipeline. - * - * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[out] output Destination tensor. Data type supported: same as @p input. - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] act_info Activation layer information in case of a fused activation. - */ - void configure_optimized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation = Size2D(1U, 1U)); - /** Run generic kernel */ - void run_generic(); - /** Run optimized function */ - void run_optimized(); - - MemoryGroup _memory_group; - NEDepthwiseConvolutionLayer3x3Kernel _dwc_kernel; - NEDepthwiseConvolutionAssemblyDispatch _dwc_optimized_func; - NEDirectConvolutionLayerOutputStageKernel _output_stage_kernel; - NEFillBorderKernel _border_handler; - NEPermute _permute_input; - NEPermute _permute_weights; - NEPermute _permute_output; - NEActivationLayer _activationlayer_function; - Tensor _accumulator; - Tensor _permuted_input; - Tensor _permuted_weights; - Tensor _permuted_output; - const ITensor *_original_weights; - bool _has_bias; - bool _is_quantized; - bool _is_optimized; - bool _is_nchw; - bool _permute; - bool _is_activationlayer_enabled; - bool _is_prepared; + MemoryGroup _memory_group; + NEDepthwiseConvolutionAssemblyDispatch _dwc_optimized_func; + NEPermute _permute_input; + NEPermute _permute_weights; + NEPermute _permute_output; + NEActivationLayer _activationlayer_function; + Tensor _accumulator; + Tensor _permuted_input; + Tensor _permuted_weights; + Tensor _permuted_output; + const ITensor *_original_weights; + bool _has_bias; + bool _is_quantized; + bool _is_nchw; + bool _permute; + bool _is_activationlayer_enabled; + bool _is_prepared; }; /** Basic function to execute a generic depthwise convolution. This function calls the following NEON kernel: @@ -245,6 +210,8 @@ class NEDepthwiseConvolutionLayer : public IFunction NEDepthwiseConvolutionLayerGeneric &operator=(const NEDepthwiseConvolutionLayerGeneric &) = delete; /** Default move assignment operator */ NEDepthwiseConvolutionLayerGeneric &operator=(NEDepthwiseConvolutionLayerGeneric &&) = default; + /** Default destructor */ + ~NEDepthwiseConvolutionLayerGeneric() = default; /** Initialize the function's source, destination, weights and convolution information. * * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). 
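The member simplification above does not change the public depthwise API described earlier in this hunk (configure with input/weights/biases/output plus PadStrideInfo, depth multiplier, activation and dilation). A hypothetical call sequence, with invented shapes in the [width, height, IFM] layout the documentation uses:

```cpp
// Hypothetical call sequence (shapes invented, [width, height, IFM] as in the
// documentation above): a 3x3 depthwise convolution, stride 1, no padding.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

void run_depthwise_example()
{
    using namespace arm_compute;
    Tensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 8U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(14U, 14U, 8U), 1, DataType::F32));

    NEDepthwiseConvolutionLayer dwc;
    dwc.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 0, 0));

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src, weights and biases ...
    dwc.run();
}
```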
@@ -284,19 +251,18 @@ class NEDepthwiseConvolutionLayer : public IFunction void prepare() override; private: - NEDepthwiseConvolutionLayerNativeKernel _depthwise_conv_kernel; - NEFillBorderKernel _fill_border; - NEPermute _permute_input; - NEPermute _permute_weights; - NEPermute _permute_output; - NEActivationLayer _activationlayer_function; - Tensor _permuted_input; - Tensor _permuted_weights; - Tensor _permuted_output; - bool _is_prepared; - bool _is_nchw; - bool _is_activationlayer_enabled; - const ITensor *_original_weights; + std::unique_ptr<NEDepthwiseConvolutionLayerNativeKernel> _depthwise_conv_kernel; + NEPermute _permute_input; + NEPermute _permute_weights; + NEPermute _permute_output; + NEActivationLayer _activationlayer_function; + Tensor _permuted_input; + Tensor _permuted_weights; + Tensor _permuted_output; + bool _is_prepared; + bool _is_nchw; + bool _is_activationlayer_enabled; + const ITensor *_original_weights; }; DepthwiseConvolutionFunction _depth_conv_func; diff --git a/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h b/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h index 77295bc089..f52d709c74 100644 --- a/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h @@ -32,6 +32,7 @@ namespace arm_compute { // Forward declarations class ITensor; +class ITensorInfo; /** Basic function to run @ref NEDequantizationLayerKernel that dequantizes an input tensor */ class NEDequantizationLayer : public INESimpleFunctionNoBorder diff --git a/arm_compute/runtime/NEON/functions/NEDerivative.h b/arm_compute/runtime/NEON/functions/NEDerivative.h index 8eb21425ac..b14e38a23a 100644 --- a/arm_compute/runtime/NEON/functions/NEDerivative.h +++ b/arm_compute/runtime/NEON/functions/NEDerivative.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,28 +24,40 @@ #ifndef ARM_COMPUTE_NEDERIVATIVE_H #define ARM_COMPUTE_NEDERIVATIVE_H -#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h" -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" -#include <cstdint> +#include <memory> namespace arm_compute { class ITensor; +class NEDerivativeKernel; +class NEFillBorderKernel; /** Basic function to execute first order derivative operator. This function calls the following NEON kernels: * * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE) * -# @ref NEDerivativeKernel * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class NEDerivative : public IFunction { public: /** Default constructor */ NEDerivative(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDerivative(const NEDerivative &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDerivative &operator=(const NEDerivative &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEDerivative(NEDerivative &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEDerivative &operator=(NEDerivative &&) = delete; + /** Default destructor */ + ~NEDerivative(); /** Initialise the function's source, destinations and border mode. * * @note At least one of output_x or output_y must be not NULL.
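NEDerivative above now declares ~NEDerivative() and holds its kernels through std::unique_ptr to forward-declared types, so the destructor must be defined where those types are complete. A sketch of what the matching source file presumably contains (the real NEDerivative.cpp is not part of this diff; the kernel header paths are assumed from the src/core/NEON/kernels layout used in the build file at the top of this patch):

```cpp
// Sketch of the matching source-file side (assumed, not shown in this diff):
#include "arm_compute/runtime/NEON/functions/NEDerivative.h"

#include "src/core/NEON/kernels/NEDerivativeKernel.h" // complete kernel types live
#include "src/core/NEON/kernels/NEFillBorderKernel.h" // under src/ after this patch

namespace arm_compute
{
NEDerivative::NEDerivative()
    : _kernel(), _border_handler()
{
}

// Defaulted here rather than in the header: at this point NEDerivativeKernel and
// NEFillBorderKernel are complete, so std::unique_ptr's deleter can compile.
NEDerivative::~NEDerivative() = default;
} // namespace arm_compute
```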
@@ -63,8 +75,8 @@ class NEDerivative : public IFunction void run() override; private: - NEDerivativeKernel _kernel; /**< Derivative kernel */ - NEFillBorderKernel _border_handler; /**< Kernel to handle tensor borders */ + std::unique_ptr<NEDerivativeKernel> _kernel; /**< Derivative kernel */ + std::unique_ptr<NEFillBorderKernel> _border_handler; /**< Kernel to handle tensor borders */ }; } #endif /* ARM_COMPUTE_NEDERIVATIVE_H */ diff --git a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h index e0431b2b31..d5c1f0ab6f 100644 --- a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h @@ -53,6 +53,8 @@ class NEDetectionPostProcessLayer : public IFunction NEDetectionPostProcessLayer(const NEDetectionPostProcessLayer &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ NEDetectionPostProcessLayer &operator=(const NEDetectionPostProcessLayer &) = delete; + /** Default destructor */ + ~NEDetectionPostProcessLayer() = default; /** Configure the detection output layer NE function * * @param[in] input_box_encoding The bounding box input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F32. diff --git a/arm_compute/runtime/NEON/functions/NEDilate.h b/arm_compute/runtime/NEON/functions/NEDilate.h index 6dae2c7029..1f2bcb50ea 100644 --- a/arm_compute/runtime/NEON/functions/NEDilate.h +++ b/arm_compute/runtime/NEON/functions/NEDilate.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -38,6 +38,8 @@ class ITensor; * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE) * -# @ref NEDilateKernel * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class NEDilate : public INESimpleFunction { diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h index 9b18f645bd..5b6ed55be2 100644 --- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,9 +24,6 @@ #ifndef ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H #define ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H -#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h" -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" @@ -38,6 +35,10 @@ namespace arm_compute { +class NEDirectConvolutionLayerOutputStageKernel; +class NEDirectConvolutionLayerKernel; +class NEFillBorderKernel; + /** Function to run the direct convolution.
* * This function calls the following NEON kernels: @@ -51,6 +52,16 @@ class NEDirectConvolutionLayer : public IFunction public: /** Constructor */ NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDirectConvolutionLayer(const NEDirectConvolutionLayer &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDirectConvolutionLayer &operator=(const NEDirectConvolutionLayer &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEDirectConvolutionLayer(NEDirectConvolutionLayer &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEDirectConvolutionLayer &operator=(NEDirectConvolutionLayer &&) = delete; + /** Default destructor */ + ~NEDirectConvolutionLayer(); /** Set the input, weights, biases and output tensors. * * @note: DirectConvolution only works in the following configurations: @@ -97,15 +108,16 @@ class NEDirectConvolutionLayer : public IFunction void run() override; private: - MemoryGroup _memory_group; - NEDirectConvolutionLayerOutputStageKernel _output_stage_kernel; - NEDirectConvolutionLayerKernel _conv_kernel; - NEFillBorderKernel _input_border_handler; - NEActivationLayer _activationlayer_function; - Tensor _accumulator; - bool _has_bias; - bool _is_activationlayer_enabled; - unsigned int _dim_split; + MemoryGroup _memory_group; + std::unique_ptr<NEDirectConvolutionLayerOutputStageKernel> _output_stage_kernel; + std::unique_ptr<NEDirectConvolutionLayerKernel> _conv_kernel; + std::unique_ptr<NEFillBorderKernel> _input_border_handler; + NEActivationLayer _activationlayer_function; + Tensor _accumulator; + bool _has_bias; + bool _is_activationlayer_enabled; + unsigned int _dim_split; + bool _is_padding_required; }; } #endif /* ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h b/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h index 7d9dac761f..5c755e96ac 100644 --- a/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h +++ b/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h @@ -270,7 +270,7 @@ class NEElementwisePower : public IFunction /** Basic function to run @ref NEComparisonOperationKernel. * - * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @note The function performs a comparison operation between two tensors. */ class NEElementwiseComparison : public IFunction @@ -290,7 +290,7 @@ class NEElementwiseComparison : public IFunction NEElementwiseComparison &operator=(NEElementwiseComparison &&); /** Initialise the kernel's inputs, output and conversion policy. * - * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. * @param[out] output Output tensor. Data types supported: U8. * @param[in] op Comparison Operation to be performed.
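The U8 additions in the comparison documentation above can be exercised the same way as the arithmetic changes, through the static validate() described in this hunk; a sketch (not part of the patch, helper name invented):

```cpp
// Sketch (not part of the patch): checking the newly documented U8 comparison
// support through the static validate() described above. Helper name invented.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"

bool u8_greater_is_supported()
{
    using namespace arm_compute;
    const TensorInfo lhs(TensorShape(32U), 1, DataType::U8);
    const TensorInfo rhs(TensorShape(32U), 1, DataType::U8);
    const TensorInfo out(TensorShape(32U), 1, DataType::U8); // comparison results are U8
    return static_cast<bool>(NEElementwiseComparison::validate(&lhs, &rhs, &out, ComparisonOperation::Greater));
}
```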
@@ -298,7 +298,7 @@ class NEElementwiseComparison : public IFunction void configure(ITensor *input1, ITensor *input2, ITensor *output, ComparisonOperation op); /** Static function to check if given info will lead to a valid configuration of @ref NEComparisonOperationKernel * - * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. * @param[in] output Output tensor info. Data types supported: U8. * @param[in] op Comparison Operation to be performed. @@ -317,7 +317,7 @@ class NEElementwiseComparison : public IFunction /** Basic function to run @ref NEComparisonOperationKernel * - * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @note The function performs a comparison operation between two tensors. */ template <ComparisonOperation op> class NEElementwiseComparisonStatic : public IFunction @@ -338,14 +338,14 @@ class NEElementwiseComparisonStatic : public IFunction NEElementwiseComparisonStatic &operator=(NEElementwiseComparisonStatic &&); /** Initialise the kernel's inputs, output and conversion policy. * - * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. * @param[out] output Output tensor. Data types supported: U16/U32. */ void configure(ITensor *input1, ITensor *input2, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NEComparisonOperationKernel * - * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. * @param[in] output Output tensor info. Data types supported: U16/U32. * @@ -456,22 +456,22 @@ class NEElementwiseSquaredDiff : public INEOperator /** Basic function to run @ref NEArithmeticOperationKernel for division * - * @note The tensor data type for the inputs must be F16/F32. - * @note The function performs a squared different operation between two tensors (i.e., out[i] = in1[i] / in2[i]) + * @note The tensor data type for the inputs must be S32/F16/F32. + * @note The function performs a division operation between two tensors (i.e., out[i] = in1[i] / in2[i]) */ class NEElementwiseDivision : public INEOperator { public: /** Initialise the kernel's inputs, output and conversion policy. * - * @param[in, out] input1 First tensor input info. Data types supported: F16/F32. + * @param[in, out] input1 First tensor input info. Data types supported: S32/F16/F32. * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1. * @param[out] output Output tensor info. Data types supported: Same as @p input1. */ void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel for division * - * @param[in] input1 First tensor input info. Data types supported: F16/F32.
+     * @param[in] input1 First tensor input info. Data types supported: S32/F16/F32.
      * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
      * @param[in] output Output tensor info. Data types supported: Same as @p input1.
      *
diff --git a/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h b/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h
index 8b3301889a..46a7316705 100644
--- a/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h
@@ -24,11 +24,13 @@
 #ifndef ARM_COMPUTE_NEELEMENTWISEUNARYLAYER_H
 #define ARM_COMPUTE_NEELEMENTWISEUNARYLAYER_H

+#include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"

 namespace arm_compute
 {
 class ITensor;
+class ITensorInfo;

 /** Basic function to perform inverse square root on an input tensor. */
 class NERsqrtLayer : public INESimpleFunctionNoBorder
diff --git a/arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h b/arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h
index 5c0c323591..e81b4ce33a 100644
--- a/arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h
+++ b/arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,9 +24,6 @@
 #ifndef ARM_COMPUTE_NEEQUALIZEHISTOGRAM_H
 #define ARM_COMPUTE_NEEQUALIZEHISTOGRAM_H

-#include "arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEHistogramKernel.h"
-#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h"
 #include "arm_compute/runtime/Distribution1D.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/Lut.h"
@@ -36,6 +33,9 @@
 namespace arm_compute
 {
 class ITensor;
+class NEHistogramKernel;
+class NECumulativeDistributionKernel;
+class NETableLookupKernel;
 using IImage = ITensor;

 /** Basic function to execute histogram equalization. This function calls the following NEON kernels:
@@ -44,12 +44,24 @@ using IImage = ITensor;
 * -# @ref NECumulativeDistributionKernel
 * -# @ref NETableLookupKernel
 *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
 */
 class NEEqualizeHistogram : public IFunction
 {
 public:
     /** Default Constructor. */
     NEEqualizeHistogram();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEEqualizeHistogram(const NEEqualizeHistogram &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEEqualizeHistogram &operator=(const NEEqualizeHistogram &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEEqualizeHistogram(NEEqualizeHistogram &&) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEEqualizeHistogram &operator=(NEEqualizeHistogram &&) = delete;
+    /** Default destructor */
+    ~NEEqualizeHistogram();
     /** Initialise the kernel's inputs.
      *
      * @note Currently the width of the input image must be a multiple of 16.
@@ -63,15 +75,15 @@ class NEEqualizeHistogram : public IFunction
     void run() override;

 private:
-    NEHistogramKernel              _histogram_kernel;     /**< Kernel that calculates the histogram of input. */
-    NECumulativeDistributionKernel _cd_histogram_kernel;  /**< Kernel that calculates the cumulative distribution
+    std::unique_ptr<NEHistogramKernel>              _histogram_kernel;    /**< Kernel that calculates the histogram of input. */
+    std::unique_ptr<NECumulativeDistributionKernel> _cd_histogram_kernel; /**< Kernel that calculates the cumulative distribution
                                                                                and creates the relevant LookupTable. */
-    NETableLookupKernel       _map_histogram_kernel;     /**< Kernel that maps the input to output using the lut. */
-    Distribution1D            _hist;                     /**< Distribution that holds the histogram of the input image. */
-    Distribution1D            _cum_dist;                 /**< Distribution that holds the cummulative distribution of the input histogram. */
-    Lut                       _cd_lut;                   /**< Holds the equalization lookuptable. */
-    static constexpr uint32_t nr_bins{ 256 };            /**< Histogram bins of the internal histograms. */
-    static constexpr uint32_t max_range{ nr_bins - 1 };  /**< Histogram range of the internal histograms. */
+    std::unique_ptr<NETableLookupKernel>            _map_histogram_kernel; /**< Kernel that maps the input to output using the lut. */
+    Distribution1D                                  _hist;                 /**< Distribution that holds the histogram of the input image. */
+    Distribution1D                                  _cum_dist;             /**< Distribution that holds the cummulative distribution of the input histogram. */
+    Lut                                             _cd_lut;               /**< Holds the equalization lookuptable. */
+    static constexpr uint32_t                       nr_bins{ 256 };        /**< Histogram bins of the internal histograms. */
+    static constexpr uint32_t                       max_range{ nr_bins - 1 }; /**< Histogram range of the internal histograms. */
 };
 }
 #endif /*ARM_COMPUTE_NEEQUALIZEHISTOGRAM_H */
diff --git a/arm_compute/runtime/NEON/functions/NEErode.h b/arm_compute/runtime/NEON/functions/NEErode.h
index 3e84c2b758..b81da4e5b6 100644
--- a/arm_compute/runtime/NEON/functions/NEErode.h
+++ b/arm_compute/runtime/NEON/functions/NEErode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -38,6 +38,8 @@ class ITensor;
 * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
 * -# @ref NEErodeKernel
 *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
 */
 class NEErode : public INESimpleFunction
 {
diff --git a/arm_compute/runtime/NEON/functions/NEFFT1D.h b/arm_compute/runtime/NEON/functions/NEFFT1D.h
index 312b46b10f..4b6cc3fd18 100644
--- a/arm_compute/runtime/NEON/functions/NEFFT1D.h
+++ b/arm_compute/runtime/NEON/functions/NEFFT1D.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,19 +24,21 @@
 #ifndef ARM_COMPUTE_NEFFT1D_H
 #define ARM_COMPUTE_NEFFT1D_H

-#include "arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFFTScaleKernel.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/FunctionDescriptors.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/Tensor.h"

+#include <memory>
+
 namespace arm_compute
 {
 // Forward declaration
 class ITensor;
+class NEFFTDigitReverseKernel;
+class NEFFTRadixStageKernel;
+class NEFFTScaleKernel;

 /** Basic function to execute one dimensional FFT. This function calls the following NEON kernels:
 *
@@ -49,6 +51,16 @@ class NEFFT1D : public IFunction
 public:
     /** Default Constructor */
     NEFFT1D(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFFT1D(const NEFFT1D &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFFT1D &operator=(const NEFFT1D &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEFFT1D(NEFFT1D &&) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEFFT1D &operator=(NEFFT1D &&) = delete;
+    /** Default destructor */
+    ~NEFFT1D();
     /** Initialise the function's source and destinations.
      *
      * @param[in]  input  Source tensor. Data types supported: F32. Number of channels supported: 1 (real tensor) or 2 (complex tensor).
@@ -71,15 +83,15 @@ class NEFFT1D : public IFunction
     void run() override;

 protected:
-    MemoryGroup                        _memory_group;
-    NEFFTDigitReverseKernel            _digit_reverse_kernel;
-    std::vector<NEFFTRadixStageKernel> _fft_kernels;
-    NEFFTScaleKernel                   _scale_kernel;
-    Tensor                             _digit_reversed_input;
-    Tensor                             _digit_reverse_indices;
-    unsigned int                       _num_ffts;
-    unsigned int                       _axis;
-    bool                               _run_scale;
+    MemoryGroup                                         _memory_group;
+    std::unique_ptr<NEFFTDigitReverseKernel>            _digit_reverse_kernel;
+    std::vector<std::unique_ptr<NEFFTRadixStageKernel>> _fft_kernels;
+    std::unique_ptr<NEFFTScaleKernel>                   _scale_kernel;
+    Tensor                                              _digit_reversed_input;
+    Tensor                                              _digit_reverse_indices;
+    unsigned int                                        _num_ffts;
+    unsigned int                                        _axis;
+    bool                                                _run_scale;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NEFFT1D_H */
diff --git a/arm_compute/runtime/NEON/functions/NEFFT2D.h b/arm_compute/runtime/NEON/functions/NEFFT2D.h
index efcce2e9a4..18e72c1a2f 100644
--- a/arm_compute/runtime/NEON/functions/NEFFT2D.h
+++ b/arm_compute/runtime/NEON/functions/NEFFT2D.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -46,6 +46,16 @@ class NEFFT2D : public IFunction
 public:
     /** Default Constructor */
     NEFFT2D(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFFT2D(const NEFFT2D &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFFT2D &operator=(const NEFFT2D &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEFFT2D(NEFFT2D &&) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEFFT2D &operator=(NEFFT2D &&) = delete;
+    /** Default destructor */
+    ~NEFFT2D();
     /** Initialise the function's source and destinations
      *
      * @param[in]  input  Source tensor. Data types supported: F32.
diff --git a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
index dd57900f2a..37750e243b 100644
--- a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -63,12 +63,14 @@ class NEFFTConvolutionLayer : public IFunction
     NEFFTConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEFFTConvolutionLayer(const NEFFTConvolutionLayer &) = delete;
-    /** Default move constructor */
-    NEFFTConvolutionLayer(NEFFTConvolutionLayer &&) = default;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEFFTConvolutionLayer(NEFFTConvolutionLayer &&) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEFFTConvolutionLayer &operator=(const NEFFTConvolutionLayer &) = delete;
-    /** Default move assignment operator */
-    NEFFTConvolutionLayer &operator=(NEFFTConvolutionLayer &&) = default;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEFFTConvolutionLayer &operator=(NEFFTConvolutionLayer &&) = delete;
+    /** Default destructor */
+    ~NEFFTConvolutionLayer();
     /** Set the input and output tensors.
      *
      * @note: This function only works with any square kernel size and unit strides for both NCHW and NHWC data layout
diff --git a/arm_compute/runtime/NEON/functions/NEFastCorners.h b/arm_compute/runtime/NEON/functions/NEFastCorners.h
index cc69e77ebb..e86a87eb7e 100644
--- a/arm_compute/runtime/NEON/functions/NEFastCorners.h
+++ b/arm_compute/runtime/NEON/functions/NEFastCorners.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,10 +24,6 @@
 #ifndef ARM_COMPUTE_NEFASTCORNERS_H
 #define ARM_COMPUTE_NEFASTCORNERS_H

-#include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/Array.h"
 #include "arm_compute/runtime/IFunction.h"
@@ -41,6 +37,10 @@
 namespace arm_compute
 {
 class ITensor;
+class NENonMaximaSuppression3x3Kernel;
+class NEFastCornersKernel;
+class NEFillBorderKernel;
+class NEFillArrayKernel;
 using IImage = ITensor;

 /** Basic function to execute fast corners. This function call the following NEON kernels:
@@ -49,12 +49,24 @@ using IImage = ITensor;
 * -# @ref NENonMaximaSuppression3x3Kernel (executed if nonmax_suppression == true)
 * -# @ref NEFillArrayKernel
 *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
 */
 class NEFastCorners : public IFunction
{
 public:
     /** Constructor */
     NEFastCorners(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFastCorners(const NEFastCorners &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFastCorners &operator=(const NEFastCorners &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEFastCorners(NEFastCorners &&) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEFastCorners &operator=(NEFastCorners &&) = delete;
+    /** Default destructor */
+    ~NEFastCorners();
     /** Initialize the function's source, destination, conv and border_mode.
      *
      * @param[in, out] input Source image. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
@@ -71,14 +83,14 @@ class NEFastCorners : public IFunction
     void run() override;

 private:
-    MemoryGroup                     _memory_group;
-    NEFastCornersKernel             _fast_corners_kernel;
-    NEFillBorderKernel              _border_handler;
-    NENonMaximaSuppression3x3Kernel _nonmax_kernel;
-    NEFillArrayKernel               _fill_kernel;
-    Image                           _output;
-    Image                           _suppressed;
-    bool                            _non_max;
+    MemoryGroup                                      _memory_group;
+    std::unique_ptr<NEFastCornersKernel>             _fast_corners_kernel;
+    std::unique_ptr<NEFillBorderKernel>              _border_handler;
+    std::unique_ptr<NENonMaximaSuppression3x3Kernel> _nonmax_kernel;
+    std::unique_ptr<NEFillArrayKernel>               _fill_kernel;
+    Image                                            _output;
+    Image                                            _suppressed;
+    bool                                             _non_max;
 };
 }
 #endif /*ARM_COMPUTE_NEFASTCORNERS_H */
diff --git a/arm_compute/runtime/NEON/functions/NEFill.h b/arm_compute/runtime/NEON/functions/NEFill.h
index 1c3c546c68..14d690f419 100644
--- a/arm_compute/runtime/NEON/functions/NEFill.h
+++ b/arm_compute/runtime/NEON/functions/NEFill.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,7 +24,6 @@
 #ifndef ARM_COMPUTE_NEFILL_H
 #define ARM_COMPUTE_NEFILL_H

-#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
diff --git a/arm_compute/runtime/NEON/functions/NEFillBorder.h b/arm_compute/runtime/NEON/functions/NEFillBorder.h
index 3ac23be731..e9a08ef7ec 100644
--- a/arm_compute/runtime/NEON/functions/NEFillBorder.h
+++ b/arm_compute/runtime/NEON/functions/NEFillBorder.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,15 +24,16 @@
 #ifndef ARM_COMPUTE_NEFILLBORDER_H
 #define ARM_COMPUTE_NEFILLBORDER_H

-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"
+#include <memory>

 namespace arm_compute
 {
 // Forward declaration
 class ITensor;
+class NEFillBorderKernel;

 /** Basic function to run @ref NEFillBorderKernel */
 class NEFillBorder : public IFunction
@@ -53,7 +54,7 @@ class NEFillBorder : public IFunction
     void run() override;

 private:
-    NEFillBorderKernel _border_handler; /**< Kernel to handle image borders */
+    std::unique_ptr<NEFillBorderKernel> _border_handler; /**< Kernel to handle image borders */
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NEFILLBORDER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEFlattenLayer.h b/arm_compute/runtime/NEON/functions/NEFlattenLayer.h
index 73da254ef5..9f0d5226de 100644
--- a/arm_compute/runtime/NEON/functions/NEFlattenLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFlattenLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -30,6 +30,7 @@
 namespace arm_compute
 {
 class ITensor;
+class ITensorInfo;

 /** Basic function to execute flatten layer kernel. */
 class NEFlattenLayer : public INESimpleFunctionNoBorder
diff --git a/arm_compute/runtime/NEON/functions/NEFloor.h b/arm_compute/runtime/NEON/functions/NEFloor.h
index 12f0ee20ba..7f4248eadb 100644
--- a/arm_compute/runtime/NEON/functions/NEFloor.h
+++ b/arm_compute/runtime/NEON/functions/NEFloor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -31,6 +31,7 @@
 namespace arm_compute
 {
 class ITensor;
+class ITensorInfo;

 /** Basic function to run @ref NEFloorKernel */
 class NEFloor : public INESimpleFunctionNoBorder
diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
index 21df3c4aef..0a7748a94b 100644
--- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
@@ -26,25 +26,36 @@

 #include "arm_compute/runtime/IFunction.h"

-#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
+#include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
 #include "arm_compute/runtime/Tensor.h"

 namespace arm_compute
 {
+class NEFlattenLayerKernel;
+
 /** Basic function to reshape the weights of Fully Connected layer with NEON. This function calls the following kernels:
- *
- * -# @ref NETransposeKernel
 *
 * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
 */
 class NEFullyConnectedLayerReshapeWeights : public INESimpleFunctionNoBorder
 {
 public:
+    /** Constructor */
+    NEFullyConnectedLayerReshapeWeights() = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFullyConnectedLayerReshapeWeights(const NEFullyConnectedLayerReshapeWeights &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFullyConnectedLayerReshapeWeights &operator=(const NEFullyConnectedLayerReshapeWeights &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEFullyConnectedLayerReshapeWeights(NEFullyConnectedLayerReshapeWeights &&) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEFullyConnectedLayerReshapeWeights &operator=(NEFullyConnectedLayerReshapeWeights &&) = delete;
+    /** Default destructor */
+    ~NEFullyConnectedLayerReshapeWeights() = default;
     /** Set the input and output tensors.
      *
      * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
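Throughout this patch, kernel headers are replaced by forward declarations and the kernel members become std::unique_ptr, which is why every touched function also gains deleted copy/move operations and an out-of-line destructor. A minimal, self-contained sketch of the idiom (names are illustrative, not from the library):

```cpp
#include <memory>

class Kernel; // forward declaration: the kernel header is no longer pulled into the public header

class Function
{
public:
    Function();
    ~Function(); // must be *declared* here and *defined* where Kernel is complete,
                 // otherwise std::unique_ptr<Kernel> cannot generate its deleter
    Function(const Function &) = delete;            // owning pointer: copying is unsafe
    Function &operator=(const Function &) = delete;
    Function(Function &&) = delete;
    Function &operator=(Function &&) = delete;

private:
    std::unique_ptr<Kernel> _kernel;
};

// ---- equivalent of the .cpp file: the kernel type is complete here ----
class Kernel
{
};

Function::Function() : _kernel(std::make_unique<Kernel>())
{
}
Function::~Function() = default; // safe: Kernel is a complete type at this point
```

This keeps the kernel types out of the installed headers (faster builds, smaller ABI surface) at the cost of one heap allocation per kernel.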
@@ -116,12 +127,14 @@ class NEFullyConnectedLayer : public IFunction
     NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEFullyConnectedLayer(const NEFullyConnectedLayer &) = delete;
-    /** Default move constructor */
-    NEFullyConnectedLayer(NEFullyConnectedLayer &&) = default;
+    /** Prevent instances of this class from being moved (As this class contains pointers) */
+    NEFullyConnectedLayer(NEFullyConnectedLayer &&) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEFullyConnectedLayer &operator=(const NEFullyConnectedLayer &) = delete;
-    /** Default move assignment operator */
-    NEFullyConnectedLayer &operator=(NEFullyConnectedLayer &&) = default;
+    /** Prevent instances of this class from being moved (As this class contains pointers) */
+    NEFullyConnectedLayer &operator=(NEFullyConnectedLayer &&) = delete;
+    /** Default destructor */
+    ~NEFullyConnectedLayer();
     /** Set the input and output tensors.
      *
      * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -168,7 +181,7 @@ class NEFullyConnectedLayer : public IFunction
     MemoryGroup      _memory_group;
     IWeightsManager *_weights_manager;

-    NEFlattenLayerKernel _flatten_kernel;
+    std::unique_ptr<NEFlattenLayerKernel> _flatten_kernel;
     NEConvertFullyConnectedWeights _convert_weights;
     weights_transformations::NEConvertFullyConnectedWeightsManaged _convert_weights_managed;
     NEFullyConnectedLayerReshapeWeights _reshape_weights_function;
diff --git a/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h b/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h
index 6b561352a6..5dc804e240 100644
--- a/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h
+++ b/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -25,7 +25,6 @@
 #define ARM_COMPUTE_NEFUSEBATCHNORMALIZATION_H

 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"

@@ -33,6 +32,7 @@
 namespace arm_compute
 {
 // Forward declarations
 class ITensor;
+class NEFuseBatchNormalizationKernel;

 /** Basic function to fuse the batch normalization node to a preceding convolution node */
 class NEFuseBatchNormalization : public IFunction
@@ -49,7 +49,7 @@ class NEFuseBatchNormalization : public IFunction
     /** Allow instances of this class to be moved */
     NEFuseBatchNormalization &operator=(NEFuseBatchNormalization &&) = default;
     /** Default destructor */
-    ~NEFuseBatchNormalization() = default;
+    ~NEFuseBatchNormalization();
     /** Set the input and output tensors.
      *
      * @param[in] input_weights Input weights tensor for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
@@ -94,7 +94,7 @@ class NEFuseBatchNormalization : public IFunction
     void run() override;

 private:
-    NEFuseBatchNormalizationKernel _fuse_bn_kernel;
+    std::unique_ptr<NEFuseBatchNormalizationKernel> _fuse_bn_kernel;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NEFUSEBATCHNORMALIZATION_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index 8d65fb5303..645ab56417 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -24,11 +24,6 @@
 #ifndef ARM_COMPUTE_NEGEMM_H
 #define ARM_COMPUTE_NEGEMM_H

-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/IWeightsManager.h"
@@ -38,8 +33,14 @@
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
 #include "arm_compute/runtime/Tensor.h"

+#include <memory>
+
 namespace arm_compute
 {
+class NEGEMMInterleave4x4Kernel;
+class NEGEMMMatrixAdditionKernel;
+class NEGEMMMatrixMultiplyKernel;
+class NEGEMMTranspose1xWKernel;
 /** Basic function to execute GEMM on NEON. This function calls the following NEON kernels:
 *
 * If optimized assembly is available:
@@ -69,6 +70,8 @@ class NEGEMM : public IFunction
     NEGEMM &operator=(const NEGEMM &) = delete;
     /** Default move assignment operator */
     NEGEMM &operator=(NEGEMM &&) = default;
+    /** Default destructor */
+    ~NEGEMM();
     /** Initialise the kernel's inputs, output
      *
      * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
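The note above is the contract NEGEMM implements: D = alpha * A * B + beta * C. For orientation, a hedged usage sketch (shapes, alpha and beta are illustrative; Compute Library shapes are given as [width, height], i.e. columns first):

```cpp
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void gemm_example()
{
    Tensor a, b, c, d;
    // D = alpha * A * B + beta * C with A: M x K, B: K x N, C/D: M x N (here M=4, K=8, N=16)
    a.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));  // [K, M]
    b.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32)); // [N, K]
    c.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32)); // [N, M]
    d.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32)); // [N, M]

    NEGEMM gemm;
    gemm.configure(&a, &b, &c, &d, 1.f /* alpha */, 1.f /* beta */);

    a.allocator()->allocate();
    b.allocator()->allocate();
    c.allocator()->allocate();
    d.allocator()->allocate();
    gemm.run();
}
```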
@@ -104,16 +107,16 @@ class NEGEMM : public IFunction
     void prepare() override;

 private:
-    MemoryGroup                _memory_group;
-    IWeightsManager           *_weights_manager;
-    NEGEMMInterleave4x4Kernel  _interleave_kernel;
-    NEGEMMTranspose1xWKernel   _transpose_kernel;
-    NEGEMMMatrixMultiplyKernel _mm_kernel;
-    NEGEMMAssemblyDispatch     _asm_glue;
-    NEGEMMMatrixAdditionKernel _ma_kernel;
-    NEActivationLayer          _alpha_scale_func;
-    NEArithmeticAddition       _add_bias;
-    NEActivationLayer          _activation_func;
+    MemoryGroup                                 _memory_group;
+    IWeightsManager                            *_weights_manager;
+    std::unique_ptr<NEGEMMInterleave4x4Kernel>  _interleave_kernel;
+    std::unique_ptr<NEGEMMTranspose1xWKernel>   _transpose_kernel;
+    std::unique_ptr<NEGEMMMatrixMultiplyKernel> _mm_kernel;
+    NEGEMMAssemblyDispatch                      _asm_glue;
+    std::unique_ptr<NEGEMMMatrixAdditionKernel> _ma_kernel;
+    NEActivationLayer                           _alpha_scale_func;
+    NEArithmeticAddition                        _add_bias;
+    NEActivationLayer                           _activation_func;

     Tensor _tmp_a;
     Tensor _tmp_b;
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h b/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h
index a82d44fde8..8f9498d0f5 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h
@@ -32,6 +32,28 @@
 namespace arm_compute
 {
+/* Convolution method supported by the assembly gemm interface */
+enum class AsmConvMethod
+{
+    Im2Col,
+    Indirect,
+    Conv
+};
+
+struct AsmGemmInfo
+{
+    AsmConvMethod           method{ AsmConvMethod::Im2Col };
+    PadStrideInfo           ps_info{};
+    ActivationLayerInfo     activation_info{};
+    GEMMLowpOutputStageInfo output_stage{};
+    bool                    negated_offsets{ true };
+    bool                    reinterpret_input_as_3d{ false };
+    bool                    depth_output_gemm3d{ false };
+    int64_t                 padding_top{ 0 };
+    int64_t                 padding_left{ 0 };
+    float                   padding_value{ 0.f };
+};
+
 /** Assembly kernel glue */
 class NEGEMMAssemblyDispatch : public IFunction
 {
@@ -55,33 +77,28 @@ class NEGEMMAssemblyDispatch : public IFunction
         virtual ~IFallback() = default;
     };

-private:
-    /** Interface for the arm_gemm fallback */
-    std::unique_ptr<IFallback> _arm_gemm;
-    MemoryGroup                _memory_group;    /**< Function memory group */
-    IWeightsManager           *_weights_manager; /**< Pointer to the weights manager */
 public:
-    /** If supported create an ACL function else fallback to the arm_gemm function.
+    /** If supported create a Compute Library function else fallback to the arm_gemm function.
      *
-     * @param[in]  a         Input tensor (Matrix A)
-     * @param[in]  b         Input tensor (Matrix B)
-     * @param[in]  c         Input tensor (Matrix C) used to pass the bias for quantized calculations
-     * @param[out] d         Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
-     * @param[in]  gemm_info GEMM meta-data
+     * @param[in]  a    Input tensor (Matrix A)
+     * @param[in]  b    Input tensor (Matrix B)
+     * @param[in]  c    Input tensor (Matrix C) used to pass the bias for quantized calculations
+     * @param[out] d    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
+     * @param[in]  info GEMM meta-data
      */
-    void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const GEMMInfo &gemm_info);
+    void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const AsmGemmInfo &info);
     /** Indicates whether or not this function can be used to process the given parameters.
      *
-     * @param[in] a         Input tensor info (Matrix A)
-     * @param[in] b         Input tensor info (Matrix B)
-     * @param[in] c         Input tensor info (Matrix C) used to pass the bias for quantized calculations
-     * @param[in] d         Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
-     * @param[in] gemm_info GEMM meta-data
+     * @param[in] a    Input tensor info (Matrix A)
+     * @param[in] b    Input tensor info (Matrix B)
+     * @param[in] c    Input tensor info (Matrix C) used to pass the bias for quantized calculations
+     * @param[in] d    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
+     * @param[in] info GEMM meta-data
      *
      * @return a status.
      */
-    static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const GEMMInfo &gemm_info);
+    static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info);
     /** Checks if activation is supported by the gemm assembly dispatcher
      *
      * @param[in] activation Activation to check
@@ -94,10 +111,15 @@ class NEGEMMAssemblyDispatch : public IFunction
     * @return True if the function is configured and ready to run
     */
     bool is_configured() const;
+
     // Inherited methods overridden:
-    /** Runs a preparation step, usually for pre-transposing matrix b */
     void prepare() override;
     void run() override;
+
+private:
+    std::unique_ptr<IFallback> _arm_gemm;         /** Interface for the arm_gemm fallback */
+    MemoryGroup                _memory_group;     /**< Function memory group */
+    IWeightsManager           *_weights_manager;  /**< Pointer to the weights manager */
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEGEMMASSEMBLYDISPATCH_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
new file mode 100644
index 0000000000..7cae39397f
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEGEMMCONV2D_H
+#define ARM_COMPUTE_NEGEMMCONV2D_H
+
+#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
+#include "arm_compute/runtime/NEON/functions/NEPermute.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <memory>
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+/** Basic function to compute the convolution layer. This function calls the following NEON kernels/functions:
+ *
+ * Supports only NHWC data layout
+ *
+ * -# @ref NEGEMMAssemblyDispatch
+ * -# @ref NEActivationLayer, in case activation cannot be fused in the assembly dispatch
+ *
+ * Weights are transformed from OHWI to HWIO format using the following kernels:
+ * -# @ref NEPermute
+ */
+class NEGEMMConv2d : public IFunction
+{
+public:
+    /** Constructor */
+    NEGEMMConv2d(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMConv2d(const NEGEMMConv2d &) = delete;
+    /** Default move constructor */
+    NEGEMMConv2d(NEGEMMConv2d &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGEMMConv2d &operator=(const NEGEMMConv2d &) = delete;
+    /** Default move assignment operator */
+    NEGEMMConv2d &operator=(NEGEMMConv2d &&) = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input   Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+     *                     while every optional dimension from 4 and above represent a batch of inputs.
+     *                     Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+     * @param[in]  weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                     Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+     * @param[in]  biases  Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     *                     Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+     * @param[out] output  Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                     Data types supported: Same as @p input.
+     * @param[in]  info    Convolution layer descriptor
+     */
+    void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConv2d
+     *
+     * @param[in] input   Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+     *                    while every optional dimension from 4 and above represent a batch of inputs.
+     *                    Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+     * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                    Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+     * @param[in] biases  Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     *                    Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+     * @param[in] output  Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                    Data types supported: Same as @p input.
+     * @param[in] info    Contains padding and stride information described in @ref PadStrideInfo.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info);
+
+    // Inherited methods overridden:
+    void run() override;
+    void prepare() override;
+
+private:
+    NEGEMMAssemblyDispatch _gemm_asm_func;
+    NEActivationLayer      _activation_func;
+    NEPermute              _weights_permute_func;
+    const ITensor         *_original_weights;
+    Tensor                 _permuted_weights;
+    bool                   _is_prepared;
+    bool                   _run_activation;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEGEMMCONV2D_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index b3f5c51010..59d83ed68d 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -26,10 +26,6 @@

 #include "arm_compute/runtime/IFunction.h"

-#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
-#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IWeightsManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
@@ -44,6 +40,9 @@
 namespace arm_compute
 {
 class ITensor;
+class NECol2ImKernel;
+class NEIm2ColKernel;
+class NEWeightsReshapeKernel;

 /** Function to reshape the weights. This function calls the following kernel:
 * -# @ref NEWeightsReshapeKernel
@@ -55,12 +54,14 @@ class NEConvolutionLayerReshapeWeights : public IFunction
     NEConvolutionLayerReshapeWeights();
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEConvolutionLayerReshapeWeights(const NEConvolutionLayerReshapeWeights &) = delete;
-    /** Default move constructor */
-    NEConvolutionLayerReshapeWeights(NEConvolutionLayerReshapeWeights &&) = default;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEConvolutionLayerReshapeWeights(NEConvolutionLayerReshapeWeights &&) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEConvolutionLayerReshapeWeights &operator=(const NEConvolutionLayerReshapeWeights &) = delete;
-    /** Default move assignment operator */
-    NEConvolutionLayerReshapeWeights &operator=(NEConvolutionLayerReshapeWeights &&) = default;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEConvolutionLayerReshapeWeights &operator=(NEConvolutionLayerReshapeWeights &&) = delete;
+    /** Default destructor */
+    ~NEConvolutionLayerReshapeWeights();
     /** Set the input and output tensors.
      *
      * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
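The new NEGEMMConv2d above routes a whole convolution through the assembly GEMM dispatch. A hedged usage sketch follows; the Conv2dInfo descriptor with a conv_info field is assumed from FunctionDescriptors.h, and all shapes (a 32x32x16 NHWC input, 8 filters of 3x3) are illustrative:

```cpp
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void gemm_conv2d_example()
{
    // NHWC only: Compute Library orders the shape as [C, W, H, N]
    TensorInfo src_info(TensorShape(16U, 32U, 32U, 1U), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NHWC);
    TensorInfo wei_info(TensorShape(16U, 3U, 3U, 8U), 1, DataType::F32); // [IFM, kernel_x, kernel_y, OFM]
    wei_info.set_data_layout(DataLayout::NHWC);
    TensorInfo bia_info(TensorShape(8U), 1, DataType::F32);              // one bias per OFM
    TensorInfo dst_info(TensorShape(8U, 30U, 30U, 1U), 1, DataType::F32); // 3x3, stride 1, no padding
    dst_info.set_data_layout(DataLayout::NHWC);

    Conv2dInfo info{};
    info.conv_info = PadStrideInfo(1, 1, 0, 0); // stride_x, stride_y, pad_x, pad_y

    // validate() reports whether this configuration can be handled before committing memory
    const Status st = NEGEMMConv2d::validate(&src_info, &wei_info, &bia_info, &dst_info, info);
    if(st.error_code() == ErrorCode::OK)
    {
        Tensor src, weights, biases, dst;
        src.allocator()->init(src_info);
        weights.allocator()->init(wei_info);
        biases.allocator()->init(bia_info);
        dst.allocator()->init(dst_info);

        NEGEMMConv2d conv;
        conv.configure(&src, &weights, &biases, &dst, info);
        // allocate the four tensors, fill src/weights/biases, then conv.run();
    }
}
```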
@@ -88,7 +89,7 @@ class NEConvolutionLayerReshapeWeights : public IFunction
     void run() override;

 private:
-    NEWeightsReshapeKernel _weights_reshape_kernel;
+    std::unique_ptr<NEWeightsReshapeKernel> _weights_reshape_kernel;
 };

 namespace weights_transformations
@@ -97,6 +98,18 @@ namespace weights_transformations
 class NEConvolutionLayerReshapeWeightsTransform : public ITransformWeights
 {
 public:
+    /** Constructor */
+    NEConvolutionLayerReshapeWeightsTransform() = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEConvolutionLayerReshapeWeightsTransform(const NEConvolutionLayerReshapeWeightsTransform &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEConvolutionLayerReshapeWeightsTransform &operator=(const NEConvolutionLayerReshapeWeightsTransform &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEConvolutionLayerReshapeWeightsTransform(NEConvolutionLayerReshapeWeightsTransform &&) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEConvolutionLayerReshapeWeightsTransform &operator=(NEConvolutionLayerReshapeWeightsTransform &&) = delete;
+    /** Default destructor */
+    ~NEConvolutionLayerReshapeWeightsTransform() = default;
     void configure(const ITensor *input, const ITensor *biases)
     {
         _bias_bit = (biases != nullptr) ? 1 : 0;
@@ -154,12 +167,14 @@ class NEGEMMConvolutionLayer : public IFunction
     NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEGEMMConvolutionLayer(const NEGEMMConvolutionLayer &) = delete;
-    /** Default move constructor */
-    NEGEMMConvolutionLayer(NEGEMMConvolutionLayer &&) = default;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEGEMMConvolutionLayer(NEGEMMConvolutionLayer &&) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEGEMMConvolutionLayer &operator=(const NEGEMMConvolutionLayer &) = delete;
-    /** Default move assignment operator */
-    NEGEMMConvolutionLayer &operator=(NEGEMMConvolutionLayer &&) = default;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEGEMMConvolutionLayer &operator=(NEGEMMConvolutionLayer &&) = delete;
+    /** Default destructor */
+    ~NEGEMMConvolutionLayer();
     /** Set the input and output tensors.
      *
     * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -253,10 +268,10 @@ class NEGEMMConvolutionLayer : public IFunction
     IWeightsManager *_weights_manager;
     NEConvolutionLayerReshapeWeights _reshape_weights;
     weights_transformations::NEConvolutionLayerReshapeWeightsTransform _reshape_weights_managed;
-    NEIm2ColKernel _im2col_kernel;
+    std::unique_ptr<NEIm2ColKernel> _im2col_kernel;
     NEGEMM _mm_gemm;
     NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
-    NECol2ImKernel _col2im_kernel;
+    std::unique_ptr<NECol2ImKernel> _col2im_kernel;
     NEReshapeLayer _reshape_layer;

     const ITensor *_original_weights;
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h b/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h
index 58cb383c67..7195c71063 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h
deleted file mode 100644
index 9813b34661..0000000000
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPASSEMBLYMATRIXMULTIPLYCORE_H
-#define ARM_COMPUTE_NEGEMMLOWPASSEMBLYMATRIXMULTIPLYCORE_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Basic function to execute matrix multiply assembly kernels. */
-class NEGEMMLowpAssemblyMatrixMultiplyCore : public IFunction
-{
-public:
-    /** Constructor */
-    NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-    /** Initialise the kernel's inputs, output
-     *
-     * @param[in]  a      First input tensor (Matrix A). Data type supported: U8, S8.
-     * @param[in]  b      Second input tensor (Matrix B). Data type supported: same as @p a
-     * @param[in]  c      Third input tensor (Matrix C). Data type supported: same as @p a
-     * @param[out] output Output tensor. Data type supported: Data type supported: U32, S32
-     */
-    void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output);
-
-    // Inherited methods overridden:
-    void run() override;
-
-private:
-    MemoryGroup                _memory_group;
-    NEGEMMAssemblyDispatch     _asm_glue;
-    std::unique_ptr<INEKernel> _mm_kernel;
-    std::unique_ptr<INEKernel> _mtx_a_reshape_kernel;
-    std::unique_ptr<INEKernel> _mtx_b_reshape_kernel;
-    Tensor                     _tmp_a;
-    Tensor                     _tmp_b;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMLOWPASSEMBLYMATRIXMULTIPLYCORE_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index 01720f05fa..cb1d6bd782 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -25,15 +25,6 @@
 #define ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H

 #include "NEActivationLayer.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
@@ -45,6 +36,15 @@
 namespace arm_compute
 {
 class ITensor;
+class NEConvertQuantizedSignednessKernel;
+class NEConvertQuantizedSignednessKernel;
+class NEGEMMInterleave4x4Kernel;
+class NEGEMMLowpMatrixMultiplyKernel;
+class NEGEMMLowpOffsetContributionKernel;
+class NEGEMMLowpOffsetContributionOutputStageKernel;
+class NEGEMMLowpMatrixAReductionKernel;
+class NEGEMMLowpMatrixBReductionKernel;
+class NEGEMMTranspose1xWKernel;

 /** Basic function to execute GEMMLowpMatrixMultiplyCore on NEON. This function calls the following NEON kernels if the DOT product instruction is not available:
 *
@@ -72,6 +72,8 @@ class NEGEMMLowpMatrixMultiplyCore : public IFunction
     NEGEMMLowpMatrixMultiplyCore &operator=(const NEGEMMLowpMatrixMultiplyCore &) = delete;
     /** Default move assignment operator */
     NEGEMMLowpMatrixMultiplyCore &operator=(NEGEMMLowpMatrixMultiplyCore &&) = default;
+    /** Default destructor */
+    ~NEGEMMLowpMatrixMultiplyCore();
     /** Initialise the kernel's inputs, output
      *
      * @note GEMM_LOWP: low precision GEMM kernel
@@ -111,19 +113,19 @@ class NEGEMMLowpMatrixMultiplyCore : public IFunction
     void prepare() override;

 private:
-    MemoryGroup                                   _memory_group;
-    IWeightsManager                              *_weights_manager;
-    NEGEMMAssemblyDispatch                        _asm_glue;
-    NEGEMMLowpMatrixMultiplyKernel                _mm_kernel;
-    NEGEMMInterleave4x4Kernel                     _mtx_a_reshape_kernel;
-    NEGEMMTranspose1xWKernel                      _mtx_b_reshape_kernel;
-    NEGEMMLowpMatrixAReductionKernel              _mtx_a_reduction_kernel;
-    NEGEMMLowpMatrixBReductionKernel              _mtx_b_reduction_kernel;
-    NEGEMMLowpOffsetContributionKernel            _offset_contribution_kernel;
-    NEGEMMLowpOffsetContributionOutputStageKernel _offset_contribution_output_stage_kernel;
-    NEActivationLayer                             _activation_func;
-    NEConvertQuantizedSignednessKernel            _convert_to_signed_asymm;
-    NEConvertQuantizedSignednessKernel            _convert_from_signed_asymm;
+    MemoryGroup                                                    _memory_group;
+    IWeightsManager                                               *_weights_manager;
+    NEGEMMAssemblyDispatch                                         _asm_glue;
+    std::unique_ptr<NEGEMMLowpMatrixMultiplyKernel>                _mm_kernel;
+    std::unique_ptr<NEGEMMInterleave4x4Kernel>                     _mtx_a_reshape_kernel;
+    std::unique_ptr<NEGEMMTranspose1xWKernel>                      _mtx_b_reshape_kernel;
+    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel>              _mtx_a_reduction_kernel;
+    std::unique_ptr<NEGEMMLowpMatrixBReductionKernel>              _mtx_b_reduction_kernel;
+    std::unique_ptr<NEGEMMLowpOffsetContributionKernel>            _offset_contribution_kernel;
+    std::unique_ptr<NEGEMMLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
+    NEActivationLayer                                              _activation_func;
+    std::unique_ptr<NEConvertQuantizedSignednessKernel>            _convert_to_signed_asymm;
+    std::unique_ptr<NEConvertQuantizedSignednessKernel>            _convert_from_signed_asymm;

     Tensor _vector_sum_col;
     Tensor _vector_sum_row;
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
index f29d5d464b..6977d27cb6 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H
 #define ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H

+#include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"

 /** This file contains all available output stages for GEMMLowp on NEON.
@@ -37,6 +38,7 @@
 namespace arm_compute
 {
 class ITensor;
+class ITensorInfo;

 /** Basic function to execute NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint on NEON.
* @@ -69,6 +71,18 @@ class ITensor; class NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint : public INESimpleFunctionNoBorder { public: + /** Constructor */ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &operator=(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &operator=(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &&) = delete; + /** Default destructor */ + ~NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint(); /** Initialise the kernel's inputs, output * * @param[in] input Input tensor. Data type supported: S32 @@ -129,6 +143,18 @@ class NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint : public INESimpleFunc class NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint : public INESimpleFunctionNoBorder { public: + /** Constructor */ + NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint(const NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &operator=(const NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint(NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &operator=(NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &&) = delete; + /** Default destructor */ + ~NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint(); /** Initialise the kernel's inputs, output * * @param[in] input Input tensor. 
Data type supported: S32 @@ -189,6 +215,18 @@ class NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint : public INESimpleFunct class NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint : public INESimpleFunctionNoBorder { public: + /** Constructor */ + NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint(const NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &operator=(const NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &operator=(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &&) = delete; + /** Default destructor */ + ~NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint(); /** Initialise the kernel's inputs, output * * @param[in] input Input tensor. Data type supported: S32 @@ -230,6 +268,18 @@ class NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint : public INESimpleFunc class NEGEMMLowpOutputStage : public INESimpleFunctionNoBorder { public: + /** Constructor */ + NEGEMMLowpOutputStage() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMLowpOutputStage(const NEGEMMLowpOutputStage &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMLowpOutputStage &operator=(const NEGEMMLowpOutputStage &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEGEMMLowpOutputStage(NEGEMMLowpOutputStage &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEGEMMLowpOutputStage &operator=(NEGEMMLowpOutputStage &&) = delete; + /** Default destructor */ + ~NEGEMMLowpOutputStage(); /** Initialise the kernel's inputs, output * * @param[in] input Input tensor. Data type supported: S32 diff --git a/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h b/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h index 983c95d732..723a638d76 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,12 +24,14 @@ #ifndef ARM_COMPUTE_NEGEMMTRANSPOSE1XW_H #define ARM_COMPUTE_NEGEMMTRANSPOSE1XW_H +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" namespace arm_compute { // Forward declarations class ITensor; +class ITensorInfo; /** Basic function to execute NEGEMMTranspose1xWKernel. 
This function calls the following NEON kernels: * @@ -39,6 +41,18 @@ class ITensor; class NEGEMMTranspose1xW : public INESimpleFunctionNoBorder { public: + /** Constructor */ + NEGEMMTranspose1xW() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMTranspose1xW(const NEGEMMTranspose1xW &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMTranspose1xW &operator=(const NEGEMMTranspose1xW &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEGEMMTranspose1xW(NEGEMMTranspose1xW &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEGEMMTranspose1xW &operator=(NEGEMMTranspose1xW &&) = delete; + /** Default destructor */ + ~NEGEMMTranspose1xW() = default; /** Initialise the kernel's inputs, output * * @param[in] input First input tensor. Data type supported: All diff --git a/arm_compute/runtime/NEON/functions/NEGather.h b/arm_compute/runtime/NEON/functions/NEGather.h index b872c44443..a5e0461227 100644 --- a/arm_compute/runtime/NEON/functions/NEGather.h +++ b/arm_compute/runtime/NEON/functions/NEGather.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -32,6 +32,7 @@ namespace arm_compute { // Forward declarations class ITensor; +class ITensorInfo; /** Basic function to run @ref NEGatherKernel */ class NEGather : public INESimpleFunctionNoBorder diff --git a/arm_compute/runtime/NEON/functions/NEGaussian3x3.h b/arm_compute/runtime/NEON/functions/NEGaussian3x3.h index 54fe91b975..9341c76d85 100644 --- a/arm_compute/runtime/NEON/functions/NEGaussian3x3.h +++ b/arm_compute/runtime/NEON/functions/NEGaussian3x3.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -38,6 +38,8 @@ class ITensor; * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE) * -# @ref NEGaussian3x3Kernel * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class NEGaussian3x3 : public INESimpleFunction { diff --git a/arm_compute/runtime/NEON/functions/NEGaussian5x5.h b/arm_compute/runtime/NEON/functions/NEGaussian5x5.h index 2e042e2307..51ebee3e8e 100644 --- a/arm_compute/runtime/NEON/functions/NEGaussian5x5.h +++ b/arm_compute/runtime/NEON/functions/NEGaussian5x5.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,8 +24,6 @@ #ifndef ARM_COMPUTE_NEGAUSSIAN5x5_H #define ARM_COMPUTE_NEGAUSSIAN5x5_H -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" -#include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" @@ -38,6 +36,9 @@ namespace arm_compute { class ITensor; +class NEGaussian5x5HorKernel; +class NEGaussian5x5VertKernel; +class NEFillBorderKernel; /** Basic function to execute gaussian filter 5x5. 
This function calls the following NEON kernels: * @@ -45,6 +46,8 @@ class ITensor; * -# @ref NEGaussian5x5HorKernel * -# @ref NEGaussian5x5VertKernel * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class NEGaussian5x5 : public IFunction { @@ -52,6 +55,16 @@ class NEGaussian5x5 : public IFunction /** Default constructor */ NEGaussian5x5(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGaussian5x5(const NEGaussian5x5 &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGaussian5x5 &operator=(const NEGaussian5x5 &) = delete; + /** Allow instances of this class to be moved */ + NEGaussian5x5(NEGaussian5x5 &&) = default; + /** Allow instances of this class to be moved */ + NEGaussian5x5 &operator=(NEGaussian5x5 &&) = default; + /** Default destructor */ + ~NEGaussian5x5(); /** Initialise the function's input, output and border mode. * * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED) @@ -65,11 +78,11 @@ class NEGaussian5x5 : public IFunction void run() override; protected: - MemoryGroup _memory_group; /**< Function memory group */ - NEGaussian5x5HorKernel _kernel_hor; /**< kernel for horizontal pass */ - NEGaussian5x5VertKernel _kernel_vert; /**< kernel for vertical pass */ - Tensor _tmp; /**< temporary buffer for output of horizontal pass */ - NEFillBorderKernel _border_handler; /**< kernel to handle tensor borders */ + MemoryGroup _memory_group; /**< Function memory group */ + std::unique_ptr<NEGaussian5x5HorKernel> _kernel_hor; /**< kernel for horizontal pass */ + std::unique_ptr<NEGaussian5x5VertKernel> _kernel_vert; /**< kernel for vertical pass */ + Tensor _tmp; /**< temporary buffer for output of horizontal pass */ + std::unique_ptr<NEFillBorderKernel> _border_handler; /**< kernel to handle tensor borders */ }; } #endif /*ARM_COMPUTE_NEGAUSSIAN5x5_H */ diff --git a/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h b/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h index d82f763f95..f5a1272b53 100644 --- a/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h +++ b/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited.
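Note: the hunk above is the pattern this whole patch repeats: concrete kernel members become std::unique_ptr to forward-declared kernel types, taking the kernel headers out of the public include surface. The cost is that the compiler can no longer generate the destructor in the header, which is why ~NEGaussian5x5() is now declared and must be defined out of line. A minimal sketch of the matching source-file change, assuming the kernel headers now live under src/core/NEON/kernels/ as the build changes in this patch suggest:

    // NEGaussian5x5.cpp (sketch, not part of this patch): the kernel types are
    // complete in this translation unit, so the defaulted destructor of the
    // std::unique_ptr members can be instantiated here.
    #include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
    #include "src/core/NEON/kernels/NEFillBorderKernel.h"
    #include "src/core/NEON/kernels/NEGaussian5x5Kernel.h"

    namespace arm_compute
    {
    NEGaussian5x5::~NEGaussian5x5() = default;
    } // namespace arm_compute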
* * SPDX-License-Identifier: MIT * @@ -25,7 +25,6 @@ #define ARM_COMPUTE_NEGAUSSIANPYRAMID_H #include "arm_compute/core/IPyramid.h" -#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h" @@ -39,8 +38,15 @@ namespace arm_compute { class ITensor; +class NEGaussianPyramidHorKernel; +class NEGaussianPyramidVertKernel; +class NEFillBorderKernel; -/** Common interface for all Gaussian pyramid functions */ +/** Common interface for all Gaussian pyramid functions + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * + */ class NEGaussianPyramid : public IFunction { public: @@ -79,22 +85,35 @@ class NEGaussianPyramid : public IFunction * -# @ref NEGaussianPyramidHorKernel * -# @ref NEGaussianPyramidVertKernel * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * + * */ class NEGaussianPyramidHalf : public NEGaussianPyramid { public: /** Constructor */ NEGaussianPyramidHalf(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGaussianPyramidHalf(const NEGaussianPyramidHalf &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGaussianPyramidHalf &operator=(const NEGaussianPyramidHalf &) = delete; + /** Allow instances of this class to be moved */ + NEGaussianPyramidHalf(NEGaussianPyramidHalf &&) = default; + /** Allow instances of this class to be moved */ + NEGaussianPyramidHalf &operator=(NEGaussianPyramidHalf &&) = default; + /** Default destructor */ + ~NEGaussianPyramidHalf(); // Inherited methods overridden: void configure(const ITensor *input, IPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override; void run() override; private: - std::vector<NEFillBorderKernel> _horizontal_border_handler; - std::vector<NEFillBorderKernel> _vertical_border_handler; - std::vector<NEGaussianPyramidHorKernel> _horizontal_reduction; - std::vector<NEGaussianPyramidVertKernel> _vertical_reduction; + std::vector<std::unique_ptr<NEFillBorderKernel>> _horizontal_border_handler; + std::vector<std::unique_ptr<NEFillBorderKernel>> _vertical_border_handler; + std::vector<std::unique_ptr<NEGaussianPyramidHorKernel>> _horizontal_reduction; + std::vector<std::unique_ptr<NEGaussianPyramidVertKernel>> _vertical_reduction; }; /** Basic function to execute gaussian pyramid with ORB scale factor.
This function calls the following NEON kernels and functions: @@ -103,12 +122,25 @@ class NEGaussianPyramidHalf : public NEGaussianPyramid * -# @ref NEGaussian5x5 * -# @ref NEScaleKernel * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * + * */ class NEGaussianPyramidOrb : public NEGaussianPyramid { public: /** Constructor */ NEGaussianPyramidOrb(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGaussianPyramidOrb(const NEGaussianPyramidOrb &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGaussianPyramidOrb &operator=(const NEGaussianPyramidOrb &) = delete; + /** Allow instances of this class to be moved */ + NEGaussianPyramidOrb(NEGaussianPyramidOrb &&) = default; + /** Allow instances of this class to be moved */ + NEGaussianPyramidOrb &operator=(NEGaussianPyramidOrb &&) = default; + /** Default destructor */ + ~NEGaussianPyramidOrb(); // Inherited methods overridden: void configure(const ITensor *input, IPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override; diff --git a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h index f937832c0e..613f0d1c47 100644 --- a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h +++ b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h @@ -24,17 +24,17 @@ #ifndef ARM_COMPUTE_NEGENERATEPROPOSALSLAYER_H #define ARM_COMPUTE_NEGENERATEPROPOSALSLAYER_H -#include "arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h" -#include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEPadLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEPermuteKernel.h" -#include "arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CPP/CPPScheduler.h" #include "arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h" +#include "arm_compute/runtime/NEON/functions/NEComputeAllAnchors.h" +#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEPadLayer.h" +#include "arm_compute/runtime/NEON/functions/NEPermute.h" +#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h" #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" #include "arm_compute/runtime/Tensor.h" @@ -67,6 +67,8 @@ class NEGenerateProposalsLayer : public IFunction NEGenerateProposalsLayer(const NEGenerateProposalsLayer &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ NEGenerateProposalsLayer &operator=(const NEGenerateProposalsLayer &) = delete; + /** Default destructor */ + ~NEGenerateProposalsLayer(); /** Set the input and output tensors. 
* @@ -112,16 +114,16 @@ class NEGenerateProposalsLayer : public IFunction MemoryGroup _memory_group; // Neon kernels - NEPermuteKernel _permute_deltas_kernel; - NEReshapeLayer _flatten_deltas; - NEPermuteKernel _permute_scores_kernel; - NEReshapeLayer _flatten_scores; - NEComputeAllAnchorsKernel _compute_anchors_kernel; - NEBoundingBoxTransformKernel _bounding_box_kernel; - NEPadLayerKernel _pad_kernel; - NEDequantizationLayerKernel _dequantize_anchors; - NEDequantizationLayerKernel _dequantize_deltas; - NEQuantizationLayerKernel _quantize_all_proposals; + NEPermute _permute_deltas; + NEReshapeLayer _flatten_deltas; + NEPermute _permute_scores; + NEReshapeLayer _flatten_scores; + NEComputeAllAnchors _compute_anchors; + NEBoundingBoxTransform _bounding_box; + NEPadLayer _pad; + NEDequantizationLayer _dequantize_anchors; + NEDequantizationLayer _dequantize_deltas; + NEQuantizationLayer _quantize_all_proposals; // CPP functions CPPBoxWithNonMaximaSuppressionLimit _cpp_nms; diff --git a/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h b/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h index 9b6fc4737b..a2d42fedf8 100644 --- a/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h +++ b/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,6 @@ #ifndef ARM_COMPUTE_NEHOGDESCRIPTOR_H #define ARM_COMPUTE_NEHOGDESCRIPTOR_H -#include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" @@ -36,18 +35,33 @@ namespace arm_compute { class IHOG; +class NEHOGOrientationBinningKernel; +class NEHOGBlockNormalizationKernel; + /** Basic function to calculate HOG descriptor. This function calls the following NEON kernels: * * -# @ref NEHOGGradient * -# @ref NEHOGOrientationBinningKernel * -# @ref NEHOGBlockNormalizationKernel * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class NEHOGDescriptor : public IFunction { public: /** Default constructor */ NEHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEHOGDescriptor(const NEHOGDescriptor &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEHOGDescriptor &operator=(const NEHOGDescriptor &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEHOGDescriptor(NEHOGDescriptor &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEHOGDescriptor &operator=(NEHOGDescriptor &&) = delete; + /** Default destructor */ + ~NEHOGDescriptor(); /** Initialise the function's source, destination, HOG data-object and border mode * * @param[in, out] input Input tensor.
Data type supported: U8 @@ -63,13 +77,13 @@ class NEHOGDescriptor : public IFunction void run() override; private: - MemoryGroup _memory_group; - NEHOGGradient _gradient; - NEHOGOrientationBinningKernel _orient_bin; - NEHOGBlockNormalizationKernel _block_norm; - Tensor _mag; - Tensor _phase; - Tensor _hog_space; + MemoryGroup _memory_group; + NEHOGGradient _gradient; + std::unique_ptr<NEHOGOrientationBinningKernel> _orient_bin; + std::unique_ptr<NEHOGBlockNormalizationKernel> _block_norm; + Tensor _mag; + Tensor _phase; + Tensor _hog_space; }; } diff --git a/arm_compute/runtime/NEON/functions/NEHOGDetector.h b/arm_compute/runtime/NEON/functions/NEHOGDetector.h index 6400d3c367..644851ee92 100644 --- a/arm_compute/runtime/NEON/functions/NEHOGDetector.h +++ b/arm_compute/runtime/NEON/functions/NEHOGDetector.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,20 +24,36 @@ #ifndef ARM_COMPUTE_NEHOGDETECTOR_H #define ARM_COMPUTE_NEHOGDETECTOR_H +#include "arm_compute/core/IArray.h" #include "arm_compute/core/IHOG.h" -#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h" #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" namespace arm_compute { +class ITensor; +class ITensorInfo; /** Basic function to execute HOG detector based on linear SVM. This function calls the following NEON kernel: * * -# @ref NEHOGDetectorKernel * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class NEHOGDetector : public INESimpleFunctionNoBorder { public: + /** Constructor */ + NEHOGDetector() = default; + /** Prevent instances of this class from being copied */ + NEHOGDetector(const NEHOGDetector &) = delete; + /** Default move constructor */ + NEHOGDetector(NEHOGDetector &&) = default; + /** Prevent instances of this class from being copied */ + NEHOGDetector &operator=(const NEHOGDetector &) = delete; + /** Default move assignment operator */ + NEHOGDetector &operator=(NEHOGDetector &&) = default; + /** Destructor */ + ~NEHOGDetector(); /** Initialise the kernel's input, output, HOG data object, detection window stride, threshold and index class * * @attention The function does not reset the number of values in @ref IDetectionWindowArray so it is caller's responsibility to clear it. diff --git a/arm_compute/runtime/NEON/functions/NEHOGGradient.h b/arm_compute/runtime/NEON/functions/NEHOGGradient.h index 2d3f934f54..426bc4b23c 100644 --- a/arm_compute/runtime/NEON/functions/NEHOGGradient.h +++ b/arm_compute/runtime/NEON/functions/NEHOGGradient.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,6 @@ #ifndef ARM_COMPUTE_NEHOGGRADIENT_H #define ARM_COMPUTE_NEHOGGRADIENT_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" @@ -38,17 +37,31 @@ namespace arm_compute { class ITensor; +class ICPPKernel; + /** Basic function to calculate the gradient for HOG.
This function calls the following NEON kernels: * * -# @ref NEDerivative * -# NEMagnitudePhaseKernel * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class NEHOGGradient : public IFunction { public: /** Default constructor */ NEHOGGradient(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEHOGGradient(const NEHOGGradient &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEHOGGradient &operator=(const NEHOGGradient &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEHOGGradient(NEHOGGradient &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEHOGGradient &operator=(NEHOGGradient &&) = delete; + /** Default destructor */ + ~NEHOGGradient(); /** Initialise the function's source, destinations, phase type and border mode * * @param[in, out] input Input tensor. Data type supported: U8. @@ -65,11 +78,11 @@ class NEHOGGradient : public IFunction void run() override; private: - MemoryGroup _memory_group; - NEDerivative _derivative; - std::unique_ptr<INEKernel> _mag_phase; - Tensor _gx; - Tensor _gy; + MemoryGroup _memory_group; + NEDerivative _derivative; + std::unique_ptr<ICPPKernel> _mag_phase; + Tensor _gx; + Tensor _gy; }; } #endif /*ARM_COMPUTE_NEHOGGRADIENT_H */ diff --git a/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h b/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h index ff64afb119..f370dd29ab 100644 --- a/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h +++ b/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,7 +27,6 @@ #include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h" #include "arm_compute/core/IArray.h" #include "arm_compute/core/IMultiHOG.h" -#include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" @@ -39,6 +38,9 @@ namespace arm_compute { +class NEHOGOrientationBinningKernel; +class NEHOGBlockNormalizationKernel; + /** Basic function to detect multiple objects (or the same object at different scales) on the same input image using HOG.
This function calls the following NEON kernels: * * -# @ref NEHOGGradient @@ -52,6 +54,8 @@ namespace arm_compute -# Normalization type -# L2 hysteresis threshold if the normalization type is L2HYS_NORM * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class NEHOGMultiDetection : public IFunction { @@ -60,8 +64,14 @@ class NEHOGMultiDetection : public IFunction NEHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager = nullptr); /** Prevent instances of this class from being copied (As this class contains pointers) */ NEHOGMultiDetection(const NEHOGMultiDetection &) = delete; + /** Prevent instances of this class from being moved (As this class contains pointers) */ + NEHOGMultiDetection(NEHOGMultiDetection &&) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ NEHOGMultiDetection &operator=(const NEHOGMultiDetection &) = delete; + /** Prevent instances of this class from being moved (As this class contains pointers) */ + NEHOGMultiDetection &operator=(NEHOGMultiDetection &&) = delete; + /** Default destructor */ + ~NEHOGMultiDetection(); /** Initialise the function's source, destination, detection window strides, border mode, threshold and non-maxima suppression * * @param[in, out] input Input tensor. Data type supported: U8 diff --git a/arm_compute/runtime/NEON/functions/NEHarrisCorners.h b/arm_compute/runtime/NEON/functions/NEHarrisCorners.h index c086e3a7ce..477b843aee 100644 --- a/arm_compute/runtime/NEON/functions/NEHarrisCorners.h +++ b/arm_compute/runtime/NEON/functions/NEHarrisCorners.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,8 +26,6 @@ #include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h" #include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h" -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" -#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/Array.h" #include "arm_compute/runtime/IFunction.h" @@ -42,6 +40,8 @@ namespace arm_compute { class ITensor; +class NEFillBorderKernel; +class INEHarrisScoreKernel; using IImage = ITensor; /** Basic function to execute harris corners detection. This function calls the following NEON kernels and functions: @@ -57,6 +57,8 @@ using IImage = ITensor; * -# @ref CPPCornerCandidatesKernel * -# @ref CPPSortEuclideanDistanceKernel * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class NEHarrisCorners : public IFunction { @@ -68,6 +70,16 @@ class NEHarrisCorners : public IFunction * @param[in] memory_manager (Optional) Memory manager.
*/ NEHarrisCorners(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEHarrisCorners(const NEHarrisCorners &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEHarrisCorners &operator=(const NEHarrisCorners &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEHarrisCorners(NEHarrisCorners &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEHarrisCorners &operator=(NEHarrisCorners &&) = delete; + /** Default destructor */ + ~NEHarrisCorners(); /** Initialize the function's source, destination, conv and border_mode. * * @param[in, out] input Source image. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED) @@ -94,8 +106,8 @@ class NEHarrisCorners : public IFunction NENonMaximaSuppression3x3 _non_max_suppr; /**< Non-maxima suppression function */ CPPCornerCandidatesKernel _candidates; /**< Sort kernel */ CPPSortEuclideanDistanceKernel _sort_euclidean; /**< Euclidean distance kernel */ - NEFillBorderKernel _border_gx; /**< Border handler before running harris score */ - NEFillBorderKernel _border_gy; /**< Border handler before running harris score */ + std::unique_ptr<NEFillBorderKernel> _border_gx; /**< Border handler before running harris score */ + std::unique_ptr<NEFillBorderKernel> _border_gy; /**< Border handler before running harris score */ Image _gx; /**< Source image - Gx component */ Image _gy; /**< Source image - Gy component */ Image _score; /**< Source image - Harris score */ diff --git a/arm_compute/runtime/NEON/functions/NEHistogram.h b/arm_compute/runtime/NEON/functions/NEHistogram.h index 716f2e71f9..d922ef1214 100644 --- a/arm_compute/runtime/NEON/functions/NEHistogram.h +++ b/arm_compute/runtime/NEON/functions/NEHistogram.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,23 +24,40 @@ #ifndef ARM_COMPUTE_NEHISTOGRAM_H #define ARM_COMPUTE_NEHISTOGRAM_H -#include "arm_compute/core/NEON/kernels/NEHistogramKernel.h" #include "arm_compute/runtime/IFunction.h" #include <cstddef> #include <cstdint> #include <memory> +#include <vector> namespace arm_compute { +class ITensor; class IDistribution1D; +class NEHistogramKernel; +using IImage = ITensor; -/** Basic function to run @ref NEHistogramKernel. */ +/** Basic function to run @ref NEHistogramKernel. + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * + */ class NEHistogram : public IFunction { public: /** Default Constructor. */ NEHistogram(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEHistogram(const NEHistogram &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEHistogram &operator=(const NEHistogram &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEHistogram(NEHistogram &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEHistogram &operator=(NEHistogram &&) = delete; + /** Default destructor */ + ~NEHistogram(); /** Initialise the kernel's inputs. * * @param[in] input Input image. Data type supported: U8.
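With NEHistogramKernel reduced to a forward declaration, callers go exclusively through the function interface; a minimal usage sketch (tensor shape and names are illustrative, not taken from this patch):

    // Compute a 256-bin histogram of a U8 image (sketch; names are assumptions).
    Tensor image;
    image.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));
    Distribution1D histogram(256 /* num_bins */, 0 /* offset */, 256 /* range */);

    NEHistogram hist;
    hist.configure(&image, &histogram);

    image.allocator()->allocate();
    // ... fill image with pixel data ...
    hist.run();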
@@ -52,10 +69,10 @@ class NEHistogram : public IFunction void run() override; private: - NEHistogramKernel _histogram_kernel; - std::vector<uint32_t> _local_hist; - std::vector<uint8_t> _window_lut; - size_t _local_hist_size; + std::unique_ptr<NEHistogramKernel> _histogram_kernel; + std::vector<uint32_t> _local_hist; + std::vector<uint8_t> _window_lut; + size_t _local_hist_size; /** 256 possible pixel values as we handle only U8 images */ static constexpr unsigned int window_lut_default_size = 256; }; diff --git a/arm_compute/runtime/NEON/functions/NEIm2Col.h b/arm_compute/runtime/NEON/functions/NEIm2Col.h index 3ea9c1cfaf..2f023f44fe 100644 --- a/arm_compute/runtime/NEON/functions/NEIm2Col.h +++ b/arm_compute/runtime/NEON/functions/NEIm2Col.h @@ -26,14 +26,16 @@ #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" -#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h" #include "arm_compute/core/Size2D.h" #include "arm_compute/core/Types.h" +#include <memory> namespace arm_compute { // Forward declarations class ITensor; +class ITensorInfo; +class NEIm2ColKernel; /** Basic function to run @ref NEIm2ColKernel */ class NEIm2Col : public IFunction @@ -41,6 +43,16 @@ class NEIm2Col : public IFunction public: /** Default constructor */ NEIm2Col(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEIm2Col(const NEIm2Col &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEIm2Col &operator=(const NEIm2Col &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEIm2Col(NEIm2Col &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEIm2Col &operator=(NEIm2Col &&) = delete; + /** Default destructor */ + ~NEIm2Col(); /** Configure the im2col NEON kernel * * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], @@ -78,8 +90,8 @@ class NEIm2Col : public IFunction void run() override; private: - NEIm2ColKernel _kernel; - unsigned int _y_dim; + std::unique_ptr<NEIm2ColKernel> _kernel; + unsigned int _y_dim; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEIM2COL_H */ diff --git a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h index 85a307c2d4..57165c94b4 100644 --- a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,6 @@ #ifndef ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYER_H #define ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYER_H -#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" @@ -37,6 +36,7 @@ namespace arm_compute { class ITensor; +class NEInstanceNormalizationLayerKernel; /** Basic function to perform a Instance normalization.
* @@ -48,6 +48,16 @@ class NEInstanceNormalizationLayer : public IFunction public: /** Constructor */ NEInstanceNormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEInstanceNormalizationLayer(const NEInstanceNormalizationLayer &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEInstanceNormalizationLayer &operator=(const NEInstanceNormalizationLayer &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEInstanceNormalizationLayer(NEInstanceNormalizationLayer &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEInstanceNormalizationLayer &operator=(NEInstanceNormalizationLayer &&) = delete; + /** Default destructor */ + ~NEInstanceNormalizationLayer(); /** Set the input and output tensors. * * @param[in, out] input Source tensor. In case of @p output tensor = nullptr this tensor will store the result of the normalization. @@ -75,13 +85,13 @@ class NEInstanceNormalizationLayer : public IFunction void run() override; private: - MemoryGroup _memory_group; - NEInstanceNormalizationLayerKernel _normalization_kernel; - bool _is_nchw; - NEPermute _permute_input; - NEPermute _permute_output; - Tensor _permuted_input; - Tensor _permuted_output; + MemoryGroup _memory_group; + std::unique_ptr<NEInstanceNormalizationLayerKernel> _normalization_kernel; + bool _is_nchw; + NEPermute _permute_input; + NEPermute _permute_output; + Tensor _permuted_input; + Tensor _permuted_output; }; } #endif /* ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEIntegralImage.h b/arm_compute/runtime/NEON/functions/NEIntegralImage.h index 6302a7adac..31c0ec9ebe 100644 --- a/arm_compute/runtime/NEON/functions/NEIntegralImage.h +++ b/arm_compute/runtime/NEON/functions/NEIntegralImage.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,10 +30,26 @@ namespace arm_compute { class ITensor; -/** Basic function to run a @ref NEIntegralImageKernel */ +/** Basic function to run a @ref NEIntegralImageKernel + * +* @deprecated This function is deprecated and is intended to be removed in 21.05 release +* +*/ class NEIntegralImage : public INESimpleFunction { public: + /** Constructor */ + NEIntegralImage() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEIntegralImage(const NEIntegralImage &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEIntegralImage &operator=(const NEIntegralImage &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEIntegralImage(NEIntegralImage &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEIntegralImage &operator=(NEIntegralImage &&) = delete; + /** Default destructor */ + ~NEIntegralImage(); /** Initialise the function's source, destinations and border mode. * * @param[in] input Source tensor. Data type supported: U8.
diff --git a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h index 31e0c61409..173b9d2141 100644 --- a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h +++ b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,6 @@ #ifndef ARM_COMPUTE_NEL2NORMALIZELAYER_H #define ARM_COMPUTE_NEL2NORMALIZELAYER_H -#include "arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" @@ -36,6 +35,7 @@ namespace arm_compute { class ITensor; +class NEL2NormalizeLayerKernel; /** Basic function to perform a L2 normalization on a given axis. * @@ -48,6 +48,16 @@ class NEL2NormalizeLayer : public IFunction public: /** Constructor */ NEL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEL2NormalizeLayer(const NEL2NormalizeLayer &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEL2NormalizeLayer &operator=(const NEL2NormalizeLayer &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEL2NormalizeLayer(NEL2NormalizeLayer &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEL2NormalizeLayer &operator=(NEL2NormalizeLayer &&) = delete; + /** Default destructor */ + ~NEL2NormalizeLayer(); /** Set the input and output tensors. * * @param[in, out] input Source tensor. Data types supported: F16/F32. (Written to only for border_size != 0) @@ -55,7 +65,7 @@ class NEL2NormalizeLayer : public IFunction * @param[in] axis Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2 * @param[in] epsilon (Optional) Lower bound value for the normalization. */ - void configure(ITensor *input, ITensor *output, int axis, float epsilon = 1e-12f); + void configure(ITensor *input, ITensor *output, int axis, float epsilon = 1e-6f); /** Static function to check if given info will lead to a valid configuration of @ref NEL2NormalizeLayer.
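Note the behavioural change buried in this hunk: the default epsilon lower bound moves from 1e-12f to 1e-6f, and the validate() overload below changes to match. Callers that relied on the previous default should now pass it explicitly; a minimal sketch with illustrative tensor names:

    // Keep the pre-change lower bound (sketch; 'input'/'output' are assumptions).
    NEL2NormalizeLayer l2_norm;
    l2_norm.configure(&input, &output, 0 /* axis */, 1e-12f /* previous default */);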
* @@ -66,16 +76,16 @@ class NEL2NormalizeLayer : public IFunction * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, int axis, float epsilon = 1e-12f); + static Status validate(const ITensorInfo *input, const ITensorInfo *output, int axis, float epsilon = 1e-6f); // Inherited methods overridden: void run() override; private: - MemoryGroup _memory_group; - NEReductionOperation _reduce_func; - NEL2NormalizeLayerKernel _normalize_kernel; - Tensor _sumsq; + MemoryGroup _memory_group; + NEReductionOperation _reduce_func; + std::unique_ptr<NEL2NormalizeLayerKernel> _normalize_kernel; + Tensor _sumsq; }; } #endif /* ARM_COMPUTE_NEL2NORMALIZELAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayer.h b/arm_compute/runtime/NEON/functions/NELSTMLayer.h index 4a47dfb2cf..ef8defb827 100644 --- a/arm_compute/runtime/NEON/functions/NELSTMLayer.h +++ b/arm_compute/runtime/NEON/functions/NELSTMLayer.h @@ -24,18 +24,17 @@ #ifndef ARM_COMPUTE_NELSTMLAYER_H #define ARM_COMPUTE_NELSTMLAYER_H -#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NECopyKernel.h" - #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h" +#include "arm_compute/runtime/NEON/functions/NECopy.h" #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" #include "arm_compute/runtime/NEON/functions/NEGEMM.h" #include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h" #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" +#include "arm_compute/runtime/NEON/functions/NETranspose.h" #include "arm_compute/runtime/common/LSTMParams.h" namespace arm_compute @@ -49,6 +48,16 @@ class NELSTMLayer : public IFunction public: /** Default constructor */ NELSTMLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NELSTMLayer(const NELSTMLayer &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NELSTMLayer &operator=(const NELSTMLayer &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NELSTMLayer(NELSTMLayer &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NELSTMLayer &operator=(NELSTMLayer &&) = delete; + /** Default destructor */ + ~NELSTMLayer(); /** Initialize function's tensors. * * @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: F16/F32.
@@ -158,7 +167,7 @@ class NELSTMLayer : public IFunction NEActivationLayer _activation_forget_gate; NEFullyConnectedLayer _fully_connected_cell_state; NEGEMM _gemm_cell_state1; - NETransposeKernel _transpose_cell_state; + NETranspose _transpose_cell_state; NEArithmeticAddition _accum_cell_state1; NEArithmeticAddition _accum_cell_state2; NEPixelWiseMultiplication _pixelwise_mul_cell_state1; @@ -173,8 +182,8 @@ class NELSTMLayer : public IFunction NEPixelWiseMultiplication _pixelwise_mul_output_state2; NEFullyConnectedLayer _fully_connected_output_state; NEActivationLayer _projection_clip; - NECopyKernel _copy_cell_state; - NECopyKernel _copy_output; + NECopy _copy_cell_state; + NECopy _copy_output; NEConcatenateLayer _concat_scratch_buffer; NEConcatenateLayer _concat_inputs_forget_gate; NEConcatenateLayer _concat_weights_forget_gate; diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h index 377e173e7d..a354a4df7b 100644 --- a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h +++ b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -67,12 +67,14 @@ class NELSTMLayerQuantized : public IFunction NELSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager = nullptr); /** Prevent instances of this class from being copied (As this class contains pointers) */ NELSTMLayerQuantized(const NELSTMLayerQuantized &) = delete; - /** Default move constructor */ - NELSTMLayerQuantized(NELSTMLayerQuantized &&) = default; + /** Prevent instances of this class from being moved (As this class contains pointers) */ + NELSTMLayerQuantized(NELSTMLayerQuantized &&) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ NELSTMLayerQuantized &operator=(const NELSTMLayerQuantized &) = delete; - /** Default move assignment operator */ - NELSTMLayerQuantized &operator=(NELSTMLayerQuantized &&) = default; + /** Prevent instances of this class from being moved (As this class contains pointers) */ + NELSTMLayerQuantized &operator=(NELSTMLayerQuantized &&) = delete; + /** Default destructor */ + ~NELSTMLayerQuantized(); /** Initialize function's tensors. * * @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8. diff --git a/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h b/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h index 1f317f6dd8..9ca30141a6 100644 --- a/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h +++ b/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -50,12 +50,25 @@ class ITensor; * difference between the two tensors is the corresponding level L(i) of the Laplacian pyramid. * L(i) = I(i) - Gaussian5x5(I(i)) * Level 0 has always the same first two dimensions as the input tensor.
+ * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class NELaplacianPyramid : public IFunction { public: /** Constructor */ NELaplacianPyramid(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NELaplacianPyramid(const NELaplacianPyramid &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NELaplacianPyramid &operator=(const NELaplacianPyramid &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NELaplacianPyramid(NELaplacianPyramid &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NELaplacianPyramid &operator=(NELaplacianPyramid &&) = delete; + /** Default destructor */ + ~NELaplacianPyramid(); /** Initialise the function's source, destinations and border mode. * * @param[in] input Source tensor. Data type supported: U8. diff --git a/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h b/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h index cc4aa0876b..8e0a3efff0 100644 --- a/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h +++ b/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -56,12 +56,25 @@ using IImage = ITensor; * I(i-1) = upsample(I(i) + L(i)) * * output = I(0) + L(0) + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class NELaplacianReconstruct : public IFunction { public: /** Constructor */ NELaplacianReconstruct(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NELaplacianReconstruct(const NELaplacianReconstruct &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NELaplacianReconstruct &operator=(const NELaplacianReconstruct &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NELaplacianReconstruct(NELaplacianReconstruct &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NELaplacianReconstruct &operator=(NELaplacianReconstruct &&) = delete; + /** Default destructor */ + ~NELaplacianReconstruct(); /** Initialise the function's source, destinations and border mode. * * The Output image must have the same size as the first level of the pyramid. diff --git a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h index e76f6b3515..86e6300130 100644 --- a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h +++ b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,13 +26,11 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h" -#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h" -#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h" -#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NECol2Im.h" +#include "arm_compute/runtime/NEON/functions/NEIm2Col.h" #include "arm_compute/runtime/Tensor.h" #include <memory> @@ -40,6 +38,8 @@ namespace arm_compute { class INETensor; +class NEWeightsReshapeKernel; +class NELocallyConnectedMatrixMultiplyKernel; /** Basic function to compute the locally connected layer. This function calls the following NEON kernels: * @@ -55,12 +55,14 @@ class NELocallyConnectedLayer : public IFunction NELocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); /** Prevent instances of this class from being copied (As this class contains pointers) */ NELocallyConnectedLayer(const NELocallyConnectedLayer &) = delete; - /** Default move constructor */ - NELocallyConnectedLayer(NELocallyConnectedLayer &&) = default; + /** Prevent instances of this class from being moved (As this class contains pointers) */ + NELocallyConnectedLayer(NELocallyConnectedLayer &&) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ NELocallyConnectedLayer &operator=(const NELocallyConnectedLayer &) = delete; - /** Default move assignment operator */ - NELocallyConnectedLayer &operator=(NELocallyConnectedLayer &&) = default; + /** Prevent instances of this class from being moved (As this class contains pointers) */ + NELocallyConnectedLayer &operator=(NELocallyConnectedLayer &&) = delete; + /** Default destructor */ + ~NELocallyConnectedLayer(); /** Set the input and output tensors. * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], @@ -72,6 +74,7 @@ class NELocallyConnectedLayer : public IFunction * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
*/ + ARM_COMPUTE_DEPRECATED_REL(20.11) void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info); /** Static function to check if given info will lead to a valid configuration of @ref NELocallyConnectedLayer * @@ -93,16 +96,16 @@ class NELocallyConnectedLayer : public IFunction void prepare() override; private: - MemoryGroup _memory_group; - NEIm2ColKernel _input_im2col_kernel; - NEWeightsReshapeKernel _weights_reshape_kernel; - NELocallyConnectedMatrixMultiplyKernel _mm_kernel; - NECol2ImKernel _output_col2im_kernel; - Tensor _input_im2col_reshaped; - Tensor _weights_reshaped; - Tensor _gemm_output; - bool _is_prepared; - const ITensor *_original_weights; + MemoryGroup _memory_group; + NEIm2Col _input_im2col; + std::unique_ptr<NEWeightsReshapeKernel> _weights_reshape_kernel; + std::unique_ptr<NELocallyConnectedMatrixMultiplyKernel> _mm_kernel; + NECol2Im _output_col2im; + Tensor _input_im2col_reshaped; + Tensor _weights_reshaped; + Tensor _gemm_output; + bool _is_prepared; + const ITensor *_original_weights; }; } #endif /* ARM_COMPUTE_NELOCALLYCONNECTEDLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NELogical.h b/arm_compute/runtime/NEON/functions/NELogical.h new file mode 100644 index 0000000000..04ffce6221 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NELogical.h @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_NELOGICAL_H +#define ARM_COMPUTE_NELOGICAL_H + +#include "arm_compute/core/Error.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/Macros.h" + +#include <memory> + +namespace arm_compute +{ +// Forward declarations +class ITensor; +class ITensorInfo; + +/** Basic function to perform logical AND */ +class NELogicalAnd : public IFunction +{ +public: + /** Constructor */ + NELogicalAnd(); + /** Destructor */ + ~NELogicalAnd(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE_INC(NELogicalAnd) + + /** Initialise the kernel's inputs and output + * + * @param[in] input1 First tensor input. Data type supported: U8. + * @param[in] input2 Second tensor input. Data type supported: U8. + * @param[out] output Output tensor. Data type supported: U8. + */ + void configure(const ITensor *input1, const ITensor *input2, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref NELogicalAnd + * + * @param[in] input1 First input tensor info. Data types supported: U8.
+ * @param[in] input2 Second input tensor info. Data types supported: U8. + * @param[in] output Output tensor info. Data type supported: U8 + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output); + + // Inherited methods overridden + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; +}; + +/** Basic function to perform logical OR */ +class NELogicalOr : public IFunction +{ +public: + /** Constructor */ + NELogicalOr(); + /** Destructor */ + ~NELogicalOr(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE_INC(NELogicalOr) + + /** Initialise the kernel's inputs and output + * + * @param[in] input1 First tensor input. Data type supported: U8. + * @param[in] input2 Second tensor input. Data type supported: U8. + * @param[out] output Output tensor. Data type supported: U8. + */ + void configure(const ITensor *input1, const ITensor *input2, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref NELogicalOr + * + * @param[in] input1 First input tensor info. Data types supported: U8. + * @param[in] input2 Second input tensor info. Data types supported: U8. + * @param[in] output Output tensor info. Data type supported: U8 + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output); + + // Inherited methods overridden + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; +}; + +/** Basic function to perform logical NOT */ +class NELogicalNot : public IFunction +{ +public: + /** Constructor */ + NELogicalNot(); + /** Destructor */ + ~NELogicalNot(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE_INC(NELogicalNot) + + /** Initialise the kernel's inputs and output + * + * @param[in] input Input tensor. Data type supported: U8. + * @param[out] output Output tensor. Data type supported: U8. + */ + void configure(const ITensor *input, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref NELogicalNot + * + * @param[in] input Input tensor info. Data types supported: U8. + * @param[in] output Output tensor info. Data type supported: U8 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); + + // Inherited methods overridden + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; +}; +} // namespace arm_compute +#endif /* ARM_COMPUTE_NELOGICAL_H */ diff --git a/arm_compute/runtime/NEON/functions/NEMagnitude.h b/arm_compute/runtime/NEON/functions/NEMagnitude.h index 56c88c2125..e100de2e08 100644 --- a/arm_compute/runtime/NEON/functions/NEMagnitude.h +++ b/arm_compute/runtime/NEON/functions/NEMagnitude.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited.
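The three new logical functions above share the same pimpl layout (a private struct Impl held through std::unique_ptr<Impl>), so the headers stay free of kernel types. A usage sketch for NELogicalAnd, assuming the usual non-zero-is-true U8 convention (tensor names and shapes are illustrative, not from this patch):

    // out = input1 AND input2 on U8 tensors (sketch).
    Tensor a, b, out;
    const TensorInfo info(TensorShape(32U, 32U), 1, DataType::U8);
    a.allocator()->init(info);
    b.allocator()->init(info);
    out.allocator()->init(info);

    NELogicalAnd logical_and;
    ARM_COMPUTE_ERROR_THROW_ON(NELogicalAnd::validate(a.info(), b.info(), out.info()));
    logical_and.configure(&a, &b, &out);

    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();
    logical_and.run();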
* * SPDX-License-Identifier: MIT * @@ -24,16 +24,33 @@ #ifndef ARM_COMPUTE_NEMAGNITUDE_H #define ARM_COMPUTE_NEMAGNITUDE_H +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" namespace arm_compute { class ITensor; -/** Basic function to run @ref NEMagnitudePhaseKernel */ +/** Basic function to run @ref NEMagnitudePhaseKernel + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * + */ class NEMagnitude : public INESimpleFunctionNoBorder { public: + /** Constructor */ + NEMagnitude() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEMagnitude(const NEMagnitude &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEMagnitude &operator=(const NEMagnitude &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEMagnitude(NEMagnitude &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEMagnitude &operator=(NEMagnitude &&) = delete; + /** Default destructor */ + ~NEMagnitude(); /** Initialise the kernel's inputs. * * @param[in] input1 First tensor input. Data type supported: S16. diff --git a/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h b/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h index f13b4bd9e2..5b5bb5cb78 100644 --- a/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h +++ b/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h @@ -24,14 +24,16 @@ #ifndef ARM_COMPUTE_NEMAXUNPOOLINGLAYER_H #define ARM_COMPUTE_NEMAXUNPOOLINGLAYER_H +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" - -#include "arm_compute/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h" +#include <memory> namespace arm_compute { class ITensor; +class ITensorInfo; +class NEMemsetKernel; +class NEMaxUnpoolingLayerKernel; /** Function to perform MaxUnpooling. This function calls the following NEON kernels: * @@ -43,6 +45,16 @@ class NEMaxUnpoolingLayer : public IFunction public: /** Constructor */ NEMaxUnpoolingLayer(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEMaxUnpoolingLayer(const NEMaxUnpoolingLayer &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEMaxUnpoolingLayer &operator=(const NEMaxUnpoolingLayer &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEMaxUnpoolingLayer(NEMaxUnpoolingLayer &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEMaxUnpoolingLayer &operator=(NEMaxUnpoolingLayer &&) = delete; + /** Default destructor */ + ~NEMaxUnpoolingLayer(); /** Set the input and output tensors.
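For reference, configure() pairs the pooled tensor with the index tensor recorded by the corresponding max-pooling function; a hedged sketch (tensor names, the NHWC layout and the PoolingLayerInfo arguments are assumptions, not from this patch):

    // Reverse a 2x2 max pool using recorded indices (sketch).
    NEMaxUnpoolingLayer unpool;
    unpool.configure(&pooled, &indices, &unpooled,
                     PoolingLayerInfo(PoolingType::MAX, 2, DataLayout::NHWC));
    unpool.run();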
* * @note Only supported pool size 2 @@ -70,8 +82,8 @@ class NEMaxUnpoolingLayer : public IFunction void run() override; private: - NEMemsetKernel _memset_kernel; - NEMaxUnpoolingLayerKernel _unpooling_layer_kernel; + std::unique_ptr<NEMemsetKernel> _memset_kernel; + std::unique_ptr<NEMaxUnpoolingLayerKernel> _unpooling_layer_kernel; }; } #endif /* ARM_COMPUTE_NEMAXUNPOOLINGLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEMeanStdDev.h b/arm_compute/runtime/NEON/functions/NEMeanStdDev.h index 120f703140..875c3630c1 100644 --- a/arm_compute/runtime/NEON/functions/NEMeanStdDev.h +++ b/arm_compute/runtime/NEON/functions/NEMeanStdDev.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,15 +24,18 @@ #ifndef ARM_COMPUTE_NEMEANSTDDEV_H #define ARM_COMPUTE_NEMEANSTDDEV_H -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" -#include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h" +#include "arm_compute/core/IMultiImage.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" +#include <memory> #include <cstdint> namespace arm_compute { +class NEMeanStdDevKernel; +class NEFillBorderKernel; + /** Basic function to execute mean and std deviation. This function calls the following NEON kernels: * * @ref NEMeanStdDevKernel @@ -43,6 +46,16 @@ class NEMeanStdDev : public IFunction public: /** Default Constructor. */ NEMeanStdDev(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEMeanStdDev(const NEMeanStdDev &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEMeanStdDev &operator=(const NEMeanStdDev &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEMeanStdDev(NEMeanStdDev &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEMeanStdDev &operator=(NEMeanStdDev &&) = delete; + /** Default destructor */ + ~NEMeanStdDev(); /** Initialise the kernel's inputs and outputs. * * @param[in, out] input Input image. Data types supported: U8. (Written to only for border filling) @@ -55,10 +68,10 @@ class NEMeanStdDev : public IFunction void run() override; private: - NEMeanStdDevKernel _mean_stddev_kernel; /**< Kernel that standard deviation calculation. */ - NEFillBorderKernel _fill_border_kernel; /**< Kernel that fills tensor's borders with zeroes. */ - uint64_t _global_sum; /**< Variable that holds the global sum among calls in order to ease reduction */ - uint64_t _global_sum_squared; /**< Variable that holds the global sum of squared values among calls in order to ease reduction */ + std::unique_ptr<NEMeanStdDevKernel> _mean_stddev_kernel; /**< Kernel that standard deviation calculation. */ + std::unique_ptr<NEFillBorderKernel> _fill_border_kernel; /**< Kernel that fills tensor's borders with zeroes.
*/ + uint64_t _global_sum; /**< Variable that holds the global sum among calls in order to ease reduction */ + uint64_t _global_sum_squared; /**< Variable that holds the global sum of squared values among calls in order to ease reduction */ }; } #endif /*ARM_COMPUTE_NEMEANSTDDEV_H */ diff --git a/arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h index 132ab8a01b..31e376191c 100644 --- a/arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,11 +30,24 @@ namespace arm_compute { class ITensor; +class ITensorInfo; /** Basic function to execute mean and standard deviation normalization by calling @ref NEMeanStdDevNormalizationKernel */ class NEMeanStdDevNormalizationLayer : public INESimpleFunctionNoBorder { public: + /** Constructor */ + NEMeanStdDevNormalizationLayer() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEMeanStdDevNormalizationLayer(const NEMeanStdDevNormalizationLayer &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEMeanStdDevNormalizationLayer &operator=(const NEMeanStdDevNormalizationLayer &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEMeanStdDevNormalizationLayer(NEMeanStdDevNormalizationLayer &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEMeanStdDevNormalizationLayer &operator=(NEMeanStdDevNormalizationLayer &&) = delete; + /** Default destructor */ + ~NEMeanStdDevNormalizationLayer(); /** Initialise the function's input and outputs. * * @note If the output tensor is a nullptr, the normalization will be performed in-place. diff --git a/arm_compute/runtime/NEON/functions/NEMedian3x3.h b/arm_compute/runtime/NEON/functions/NEMedian3x3.h index 8d860e2103..7e1ec905c6 100644 --- a/arm_compute/runtime/NEON/functions/NEMedian3x3.h +++ b/arm_compute/runtime/NEON/functions/NEMedian3x3.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -38,6 +38,8 @@ class ITensor; * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE) * -# @ref NEMedian3x3Kernel * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class NEMedian3x3 : public INESimpleFunction { diff --git a/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h b/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h index caa66a0c16..312d1cb668 100644 --- a/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h +++ b/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. 
diff --git a/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h b/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h
index caa66a0c16..312d1cb668 100644
--- a/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h
+++ b/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -25,27 +25,42 @@
 #define ARM_COMPUTE_NEMINMAXLOCATION_H

 #include "arm_compute/core/IArray.h"
-#include "arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h"
 #include "arm_compute/runtime/Array.h"
 #include "arm_compute/runtime/IFunction.h"

 #include <cstdint>
+#include <memory>

 namespace arm_compute
 {
 class ITensor;
+class NEMinMaxKernel;
+class NEMinMaxLocationKernel;
 using IImage = ITensor;

 /** Basic function to execute min and max location. This function calls the following NEON kernels:
 *
 * -# NEMinMaxKernel
 * -# NEMinMaxLocationKernel
+ *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
 */
 class NEMinMaxLocation : public IFunction
 {
 public:
     /** Constructor */
     NEMinMaxLocation();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMinMaxLocation(const NEMinMaxLocation &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMinMaxLocation &operator=(const NEMinMaxLocation &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEMinMaxLocation(NEMinMaxLocation &&) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEMinMaxLocation &operator=(NEMinMaxLocation &&) = delete;
+    /** Default destructor */
+    ~NEMinMaxLocation();
     /** Initialise the kernel's inputs and outputs.
      *
      * @param[in] input Input image. Data types supported: U8/S16/F32.
@@ -64,8 +79,8 @@ class NEMinMaxLocation : public IFunction
     void run() override;

 private:
-    NEMinMaxKernel         _min_max;     /**< Kernel that performs min/max */
-    NEMinMaxLocationKernel _min_max_loc; /**< Kernel that extracts min/max locations */
+    std::unique_ptr<NEMinMaxKernel>         _min_max;     /**< Kernel that performs min/max */
+    std::unique_ptr<NEMinMaxLocationKernel> _min_max_loc; /**< Kernel that extracts min/max locations */
 };
 }
 #endif /*ARM_COMPUTE_NEMINMAXLOCATION_H */
diff --git a/arm_compute/runtime/NEON/functions/NENonLinearFilter.h b/arm_compute/runtime/NEON/functions/NENonLinearFilter.h
index d2a85837fd..8642350736 100644
--- a/arm_compute/runtime/NEON/functions/NENonLinearFilter.h
+++ b/arm_compute/runtime/NEON/functions/NENonLinearFilter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -39,6 +39,9 @@ class ITensor;
 * -# @ref NENonLinearFilterKernel
 *
 * @note Supported mask dimensions squares of sizes 3, 5
+ *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
 */
 class NENonLinearFilter : public INESimpleFunction
 {
diff --git a/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h b/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h
index 07d4b16cf1..5b71d52e3e 100644
--- a/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h
+++ b/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -36,6 +36,8 @@ class ITensor;
 * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
 * -# @ref NENonMaximaSuppression3x3Kernel
 *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
 */
 class NENonMaximaSuppression3x3 : public INESimpleFunction
 {
diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
index fcdba12046..6519f9b4e6 100644
--- a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
@@ -26,8 +26,6 @@

 #include "arm_compute/runtime/IFunction.h"

-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
@@ -39,6 +37,7 @@ namespace arm_compute
 {
 class ITensor;
+class NENormalizationLayerKernel;

 /** Basic function to compute a normalization layer. This function calls the following NEON kernels:
 *
@@ -52,6 +51,16 @@ class NENormalizationLayer : public IFunction
 public:
     /** Default constructor */
     NENormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NENormalizationLayer(const NENormalizationLayer &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NENormalizationLayer &operator=(const NENormalizationLayer &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NENormalizationLayer(NENormalizationLayer &&) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NENormalizationLayer &operator=(NENormalizationLayer &&) = delete;
+    /** Default destructor */
+    ~NENormalizationLayer();
     /** Set the input and output tensors.
      *
      * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
@@ -75,10 +84,10 @@ class NENormalizationLayer : public IFunction
     void run() override;

 private:
-    MemoryGroup                _memory_group;  /**< Function memory group */
-    NENormalizationLayerKernel _norm_kernel;   /**< Normalization layer kernel */
-    NEPixelWiseMultiplication  _multiply_f;    /**< Pixel multiplication function */
-    Tensor                     _input_squared; /**< The intermediate buffer which stores results of squaring input */
+    MemoryGroup                                 _memory_group;  /**< Function memory group */
+    std::unique_ptr<NENormalizationLayerKernel> _norm_kernel;   /**< Normalization layer kernel */
+    NEPixelWiseMultiplication                   _multiply_f;    /**< Pixel multiplication function */
+    Tensor                                      _input_squared; /**< The intermediate buffer which stores results of squaring input */
 };
 }
 #endif /* ARM_COMPUTE_NENORMALIZATIONLAYER_H */
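The copy and move operations are deleted throughout these headers because configured kernels keep raw pointers back into the owning function object; duplicating or relocating it would leave those pointers dangling. A small illustration of what the deletions rule out, using NENormalizationLayer from the diff above (a hypothetical snippet, not taken from the patch):

#include <utility>

arm_compute::NENormalizationLayer norm; // fine: construct once, then configure()/run() in place
// auto copied = norm;                  // ill-formed: copy constructor is deleted
// auto moved  = std::move(norm);       // ill-formed: move constructor is deleted as well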
diff --git a/arm_compute/runtime/NEON/functions/NEOpticalFlow.h b/arm_compute/runtime/NEON/functions/NEOpticalFlow.h
index 141ee7ea41..d1624ec68a 100644
--- a/arm_compute/runtime/NEON/functions/NEOpticalFlow.h
+++ b/arm_compute/runtime/NEON/functions/NEOpticalFlow.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -25,7 +25,7 @@
 #define ARM_COMPUTE_NEOPTICALFLOW_H

 #include "arm_compute/core/IArray.h"
-#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
+#include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/Array.h"
 #include "arm_compute/runtime/IFunction.h"
@@ -41,6 +41,7 @@ namespace arm_compute
 {
 class Pyramid;
+class NELKTrackerKernel;

 /** Array of LK Internal Keypoints */
 using LKInternalKeypointArray = Array<NELKInternalKeypoint>;
@@ -49,6 +50,9 @@ using LKInternalKeypointArray = Array<NELKInternalKeypoint>;
 * -# @ref NEScharr3x3
 * -# @ref NELKTrackerKernel
 *
+ *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
 */
 class NEOpticalFlow : public IFunction
 {
@@ -62,6 +66,8 @@ class NEOpticalFlow : public IFunction
     NEOpticalFlow(const NEOpticalFlow &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEOpticalFlow &operator=(const NEOpticalFlow &) = delete;
+    /** Default destructor */
+    ~NEOpticalFlow();
     /** Initialise the function input and output
      *
      * @param[in] old_pyramid Pointer to the pyramid for the old tensor. Data type supported U8
@@ -86,17 +92,17 @@ class NEOpticalFlow : public IFunction
     void run() override;

 private:
-    MemoryGroup                    _memory_group;
-    std::vector<NEScharr3x3>       _func_scharr;
-    std::vector<NELKTrackerKernel> _kernel_tracker;
-    std::vector<Pyramid>           _scharr_gx;
-    std::vector<Pyramid>           _scharr_gy;
-    IKeyPointArray                *_new_points;
-    const IKeyPointArray          *_new_points_estimates;
-    const IKeyPointArray          *_old_points;
-    LKInternalKeypointArray        _new_points_internal;
-    LKInternalKeypointArray        _old_points_internal;
-    unsigned int                   _num_levels;
+    MemoryGroup                                     _memory_group;
+    std::vector<NEScharr3x3>                        _func_scharr;
+    std::vector<std::unique_ptr<NELKTrackerKernel>> _kernel_tracker;
+    std::vector<Pyramid>                            _scharr_gx;
+    std::vector<Pyramid>                            _scharr_gy;
+    IKeyPointArray                                 *_new_points;
+    const IKeyPointArray                           *_new_points_estimates;
+    const IKeyPointArray                           *_old_points;
+    LKInternalKeypointArray                         _new_points_internal;
+    LKInternalKeypointArray                         _old_points_internal;
+    unsigned int                                    _num_levels;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NEOPTICALFLOW_H */
diff --git a/arm_compute/runtime/NEON/functions/NEPReluLayer.h b/arm_compute/runtime/NEON/functions/NEPReluLayer.h
index 756058b5ec..358e633000 100644
--- a/arm_compute/runtime/NEON/functions/NEPReluLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPReluLayer.h
@@ -31,6 +31,7 @@
 namespace arm_compute
 {
 class ITensor;
+class ITensorInfo;

 namespace experimental
 {
diff --git a/arm_compute/runtime/NEON/functions/NEPadLayer.h b/arm_compute/runtime/NEON/functions/NEPadLayer.h
index fcb7c36312..3fdbb0d73c 100644
--- a/arm_compute/runtime/NEON/functions/NEPadLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPadLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -29,13 +29,15 @@
 #include "arm_compute/runtime/NEON/functions/NEStridedSlice.h"
 #include "arm_compute/runtime/SubTensor.h"

-#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPadLayerKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/Tensor.h"

+#include <memory>

 namespace arm_compute
 {
+class NECopyKernel;
+class NEPadLayerKernel;
+
 /** Basic function to pad a tensor. This function calls the following NEON functions/kernels:
 *
 * - For padding mode = PaddingMode::CONSTANT:
@@ -49,8 +51,18 @@ namespace arm_compute
 class NEPadLayer : public IFunction
 {
 public:
-    /** Default constructor*/
+    /** Default Constructor */
     NEPadLayer();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPadLayer(const NEPadLayer &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPadLayer &operator=(const NEPadLayer &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEPadLayer(NEPadLayer &&) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEPadLayer &operator=(NEPadLayer &&) = delete;
+    /** Default destructor */
+    ~NEPadLayer();
     /** Initialize the function
      *
      * @param[in] input Source tensor. Data types supported: All.
@@ -97,15 +109,15 @@ class NEPadLayer : public IFunction
     void configure_reflect_symmetric_mode(ITensor *input, ITensor *output);

 private:
-    NECopyKernel                    _copy_kernel;
-    NEPadLayerKernel                _pad_kernel;
-    PaddingMode                     _mode;
-    PaddingList                     _padding;
-    uint32_t                        _num_dimensions;
-    std::vector<NEStridedSlice>     _slice_functions;
-    std::vector<NEConcatenateLayer> _concat_functions;
-    std::vector<Tensor>             _slice_results;
-    std::vector<Tensor>             _concat_results;
+    std::unique_ptr<NECopyKernel>     _copy_kernel;
+    std::unique_ptr<NEPadLayerKernel> _pad_kernel;
+    PaddingMode                       _mode;
+    PaddingList                       _padding;
+    uint32_t                          _num_dimensions;
+    std::vector<NEStridedSlice>       _slice_functions;
+    std::vector<NEConcatenateLayer>   _concat_functions;
+    std::vector<Tensor>               _slice_results;
+    std::vector<Tensor>               _concat_results;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NEPADLAYER_H */
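For reference, a sketch of how the NEPadLayer changes above are driven from client code. It assumes src and dst are already-configured, allocated Tensors and that the (input, output, padding, constant_value, mode) configure() signature is unchanged by this patch:

// Pad one element on each side of the two innermost dimensions,
// filling the new elements with zero (PaddingMode::CONSTANT).
arm_compute::NEPadLayer pad;
arm_compute::PaddingList padding = { { 1, 1 }, { 1, 1 } }; // (before, after) per dimension
pad.configure(&src, &dst, padding, arm_compute::PixelValue(0), arm_compute::PaddingMode::CONSTANT);
pad.run(); // executes NEPadLayerKernel, now held behind the unique_ptr member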
diff --git a/arm_compute/runtime/NEON/functions/NEPermute.h b/arm_compute/runtime/NEON/functions/NEPermute.h
index 3be42c8346..ef8854b360 100644
--- a/arm_compute/runtime/NEON/functions/NEPermute.h
+++ b/arm_compute/runtime/NEON/functions/NEPermute.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -32,6 +32,7 @@ namespace arm_compute
 {
 // Forward declarations
 class ITensor;
+class ITensorInfo;

 /** Basic function to run @ref NEPermuteKernel */
 class NEPermute : public INESimpleFunctionNoBorder
diff --git a/arm_compute/runtime/NEON/functions/NEPhase.h b/arm_compute/runtime/NEON/functions/NEPhase.h
index c492073e22..1202f1878d 100644
--- a/arm_compute/runtime/NEON/functions/NEPhase.h
+++ b/arm_compute/runtime/NEON/functions/NEPhase.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,13 +24,19 @@
 #ifndef ARM_COMPUTE_NEPHASE_H
 #define ARM_COMPUTE_NEPHASE_H

+#include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"

 namespace arm_compute
 {
 class ITensor;
+class ITensorInfo;

-/** Basic function to run @ref NEMagnitudePhaseKernel */
+/** Basic function to run @ref NEMagnitudePhaseKernel
+ *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
+*/
 class NEPhase : public INESimpleFunctionNoBorder
 {
 public:
diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
index 3c1aa5220c..91cf44ff2e 100644
--- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
+++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
@@ -31,6 +31,7 @@
 namespace arm_compute
 {
 class ITensor;
+class ITensorInfo;

 namespace experimental
 {
@@ -42,29 +43,30 @@ class NEPixelWiseMultiplication : public INEOperator
      *
      * Valid configurations (Input1,Input2) -> Output :
      *
-     * - (U8,U8)                         -> U8
-     * - (U8,U8)                         -> S16
-     * - (U8,S16)                        -> S16
-     * - (S16,U8)                        -> S16
-     * - (S16,S16)                       -> S16
-     * - (F16,F16)                       -> F16
-     * - (F32,F32)                       -> F32
-     * - (QASYMM8,QASYMM8)               -> QASYMM8
-     * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
-     * - (QSYMM16,QSYMM16)               -> QSYMM16
-     * - (QSYMM16,QSYMM16)               -> S32
+     *                                                       Support: Broadcast? Scale=1/255?
+     * - (U8,U8)                         -> U8, S16                    N          Y
+     * - (U8,S16)                        -> S16                        N          Y
+     * - (S16,U8)                        -> S16                        N          Y
+     * - (S16,S16)                       -> S16                        N          Y
+     * - (S32,S32)                       -> S32                        Y          N
+     * - (F16,F16)                       -> F16                        N          Y
+     * - (F32,F32)                       -> F32                        Y          Y
+     * - (QASYMM8,QASYMM8)               -> QASYMM8                    Y          Y
+     * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED             Y          Y
+     * - (QSYMM16,QSYMM16)               -> QSYMM16, S32               N          Y
      *
      * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
      *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
      *
-     * @param[in, out] input1          First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+     * @param[in, out] input1          First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
      *                                 This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[in, out] input2          Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+     * @param[in, out] input2          Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
      *                                 This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
      * @param[out]     output          Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
      * @param[in]      scale           Scale to apply after multiplication.
      *                                 Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
-     * @param[in]      overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
+     *                                 If both @p input1, @p input2 and @p output are of datatype S32, scale cannot be 1/255
+     * @param[in]      overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
      * @param[in]      rounding_policy Rounding policy.
      * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation. Currently not supported.
      */
@@ -74,27 +76,28 @@ class NEPixelWiseMultiplication : public INEOperator
      *
      * Valid configurations (Input1,Input2) -> Output :
      *
-     * - (U8,U8)                         -> U8
-     * - (U8,U8)                         -> S16
-     * - (U8,S16)                        -> S16
-     * - (S16,U8)                        -> S16
-     * - (S16,S16)                       -> S16
-     * - (F16,F16)                       -> F16
-     * - (F32,F32)                       -> F32
-     * - (QASYMM8,QASYMM8)               -> QASYMM8
-     * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
-     * - (QSYMM16,QSYMM16)               -> QSYMM16
-     * - (QSYMM16,QSYMM16)               -> S32
+     *                                                       Support: Broadcast? Scale=1/255?
+     * - (U8,U8)                         -> U8, S16                    N          Y
+     * - (U8,S16)                        -> S16                        N          Y
+     * - (S16,U8)                        -> S16                        N          Y
+     * - (S16,S16)                       -> S16                        N          Y
+     * - (S32,S32)                       -> S32                        Y          N
+     * - (F16,F16)                       -> F16                        N          Y
+     * - (F32,F32)                       -> F32                        Y          Y
+     * - (QASYMM8,QASYMM8)               -> QASYMM8                    Y          Y
+     * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED             Y          Y
+     * - (QSYMM16,QSYMM16)               -> QSYMM16, S32               N          Y
      *
      * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
      *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
      *
-     * @param[in] input1          First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
-     * @param[in] input2          Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+     * @param[in] input1          First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+     * @param[in] input2          Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
      * @param[in] output          Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
      * @param[in] scale           Scale to apply after multiplication.
      *                            Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
-     * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
+     *                            If both @p input1, @p input2 and @p output are of datatype S32, scale cannot be 1/255
+     * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
      * @param[in] rounding_policy Rounding policy.
      * @param[in] act_info        (Optional) Activation layer information in case of a fused activation. Currently not supported.
      *
@@ -150,9 +153,9 @@ class NEPixelWiseMultiplication : public IFunction
      * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
      *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
      *
-     * @param[in, out] input1          An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+     * @param[in, out] input1          An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
      *                                 This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[in, out] input2          An input tensor. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+     * @param[in, out] input2          An input tensor. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, S32, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
      *                                 This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
      * @param[out]     output          Output tensor. Data types supported:
      *                                 - U8, only if both inputs are U8.
@@ -160,12 +163,13 @@ class NEPixelWiseMultiplication : public IFunction
      *                                 - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED.
      *                                 - S16.
      *                                 - QSYMM16, only if both inputs are QSYMM16.
-     *                                 - S32, only if both inputs are QSYMM16.
+     *                                 - S32, only if both inputs are S32 or both are QSYMM16.
      *                                 - F16, only if @p input1 is F16.
      *                                 - F32, only if both inputs are F32.
      * @param[in]      scale           Scale to apply after multiplication.
      *                                 Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
-     * @param[in]      overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
+     *                                 If both @p input1, @p input2 and @p output are of datatype S32, scale cannot be 1/255
+     * @param[in]      overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
      * @param[in]      rounding_policy Rounding policy.
      * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation. Currently not supported.
      */
@@ -176,20 +180,21 @@ class NEPixelWiseMultiplication : public IFunction
      * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
      *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
      *
-     * @param[in] input1          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
-     * @param[in] input2          An input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, QSYMM16 (only if both inputs are QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+     * @param[in] input1          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+     * @param[in] input2          An input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, S32, QSYMM16 (only if both inputs are QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
      * @param[in] output          Output tensor info. Data types supported:
      *                            - U8, only if both inputs are U8.
      *                            - QASYMM8, only if both inputs are QASYMM8.
      *                            - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED.
      *                            - S16.
      *                            - QSYMM16, only if both inputs are QSYMM16.
-     *                            - S32, only if both inputs are QSYMM16.
+     *                            - S32, only if both inputs are S32 or both are QSYMM16.
      *                            - F16, only if @p input1 is F16.
      *                            - F32, only if both inputs are F32.
      * @param[in] scale           Scale to apply after multiplication.
      *                            Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
-     * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
+     *                            If both @p input1, @p input2 and @p output are of datatype S32, scale cannot be 1/255
+     * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
      * @param[in] rounding_policy Rounding policy.
      * @param[in] act_info        (Optional) Activation layer information in case of a fused activation. Currently not supported.
      *
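A usage sketch consistent with the constraints documented above (scale restricted to 1/255 or 1/2^n, and round-to-zero required for any scale other than 1/255). It assumes a, b and out are configured F32 tensors:

arm_compute::NEPixelWiseMultiplication mul;
// scale = 1.0f (i.e. 1/2^0), so per the notes above the rounding policy must be TO_ZERO;
// SATURATE is always a safe overflow policy and is mandatory for quantized inputs.
mul.configure(&a, &b, &out, 1.0f, arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO);
mul.run();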
diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
index 000c754ec8..b45290fb46 100644
--- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
@@ -26,13 +26,15 @@

 #include "arm_compute/runtime/IFunction.h"

-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
 #include "arm_compute/core/Types.h"

+#include <memory>

 namespace arm_compute
 {
 class ITensor;
+class ITensorInfo;
+class NEPoolingLayerKernel;
+class NEFillBorderKernel;

 /** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following NEON kernels:
 *
@@ -44,6 +46,16 @@ class NEPoolingLayer : public IFunction
 public:
     /** Constructor */
     NEPoolingLayer();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPoolingLayer(const NEPoolingLayer &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPoolingLayer &operator=(const NEPoolingLayer &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEPoolingLayer(NEPoolingLayer &&) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEPoolingLayer &operator=(NEPoolingLayer &&) = delete;
+    /** Default destructor */
+    ~NEPoolingLayer();
     /** Set the input and output tensors.
      *
      * @note F16 is supported for pool sizes 2 and 3 only
@@ -71,10 +83,10 @@ class NEPoolingLayer : public IFunction
     void run() override;

 private:
-    NEPoolingLayerKernel _pooling_layer_kernel;
-    NEFillBorderKernel   _border_handler;
-    bool                 _is_global_pooling_layer;
-    DataLayout           _data_layout;
+    std::unique_ptr<NEPoolingLayerKernel> _pooling_layer_kernel;
+    std::unique_ptr<NEFillBorderKernel>   _border_handler;
+    bool                                  _is_global_pooling_layer;
+    DataLayout                            _data_layout;
 };
 }
 #endif /* ARM_COMPUTE_NEPOOLINGLAYER_H */
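Sketch of driving the pooling function above. The global-pooling PoolingLayerInfo constructor shown here is assumed from the 20.x-era API (constructors vary between releases), and src/dst are assumed to be allocated NHWC tensors:

arm_compute::NEPoolingLayer pool;
// global average pooling: the pool window spans the whole spatial extent
pool.configure(&src, &dst,
               arm_compute::PoolingLayerInfo(arm_compute::PoolingType::AVG,
                                             arm_compute::DataLayout::NHWC));
pool.run(); // border handling and pooling run via the unique_ptr-held kernels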
diff --git a/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h b/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h
index d4bb42fd07..3cc79fa28e 100644
--- a/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,13 +24,13 @@
 #ifndef ARM_COMPUTE_NEPRIORBOXLAYER_H
 #define ARM_COMPUTE_NEPRIORBOXLAYER_H

-#include "arm_compute/core/NEON/kernels/NEPriorBoxLayerKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"

 namespace arm_compute
 {
 class ITensor;
+class ITensorInfo;

 /** Basic function to run @ref NEPriorBoxLayerKernel. */
 class NEPriorBoxLayer : public INESimpleFunctionNoBorder
diff --git a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
index 59dd567987..fcabc1d0c4 100644
--- a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
@@ -24,24 +24,27 @@
 #ifndef ARM_COMPUTE_NEQLSTMLAYER_H
 #define ARM_COMPUTE_NEQLSTMLAYER_H

-#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
+#include "arm_compute/runtime/NEON/functions/NECopy.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
 #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
 #include "arm_compute/runtime/NEON/functions/NETranspose.h"
+#include "support/MemorySupport.h"

 #include "arm_compute/runtime/common/LSTMParams.h"
+#include <memory>

 namespace arm_compute
 {
 // Forward declarations
 class ITensor;
+class ITensorInfo;
+class NEQLSTMLayerNormalizationKernel;
+class NEGEMMLowpMatrixAReductionKernel;

 /** Basic function to run @ref NEQLSTMLayer
 *
@@ -64,12 +67,14 @@ class NEQLSTMLayer : public IFunction
     NEQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEQLSTMLayer(const NEQLSTMLayer &) = delete;
-    /** Default move constructor */
-    NEQLSTMLayer(NEQLSTMLayer &&) = default;
+    /** Prevent instances of this class from being moved (As this class contains pointers) */
+    NEQLSTMLayer(NEQLSTMLayer &&) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEQLSTMLayer &operator=(const NEQLSTMLayer &) = delete;
-    /** Default move assignment operator */
-    NEQLSTMLayer &operator=(NEQLSTMLayer &&) = default;
+    /** Prevent instances of this class from being moved (As this class contains pointers) */
+    NEQLSTMLayer &operator=(NEQLSTMLayer &&) = delete;
+    /** Default destructor */
+    ~NEQLSTMLayer();
     /** Initialize function's tensors.
      *
      * @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8_SIGNED.
@@ -115,7 +120,7 @@ class NEQLSTMLayer : public IFunction
                    const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
                    const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
                    const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
-                   const ITensor *cell_state_in, const ITensor *output_state_in,
+                   const ITensor *cell_state_in, ITensor *output_state_in,
                    ITensor *cell_state_out, ITensor *output_state_out, ITensor *output,
                    const LSTMParams<ITensor> &lstm_params);
@@ -204,7 +209,7 @@ class NEQLSTMLayer : public IFunction
                            Tensor *outstage_res, float gemmlowp_scale,
                            const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info);

-    MemoryGroup _memory_group{};
+    MemoryGroup _memory_group;

     /** A small internal kernel to do the copy between two tensors */
     class TensorCopyKernel
@@ -217,6 +222,8 @@ class NEQLSTMLayer : public IFunction
         Window _window{};

     public:
+        /** Destructor */
+        ~TensorCopyKernel();
        /** Static function to check if given info will lead to a valid configuration of @ref NEQLSTMLayer::TensorCopyKernel
         *
         * @param[in] src Source tensor info.
@@ -236,79 +243,79 @@ class NEQLSTMLayer : public IFunction
     };

     // Functions used
-    NETranspose _transpose_input_to_forget_weights{};
-    NETranspose _transpose_input_to_cell_weights{};
-    NETranspose _transpose_input_to_output_weights{};
-    NETranspose _transpose_input_to_input_weights{};
-    NETranspose _transpose_recurrent_to_forget_weights{};
-    NETranspose _transpose_recurrent_to_cell_weights{};
-    NETranspose _transpose_recurrent_to_output_weights{};
-    NETranspose _transpose_recurrent_to_input_weights{};
-    NETranspose _transpose_projection_weights{};
-    NEGEMMLowpMatrixAReductionKernel _input_to_input_reduction{};
-    NEGEMMLowpMatrixAReductionKernel _recurrent_to_input_reduction{};
-    NEGEMMLowpMatrixAReductionKernel _input_to_forget_reduction{};
-    NEGEMMLowpMatrixAReductionKernel _recurrent_to_forget_reduction{};
-    NEGEMMLowpMatrixAReductionKernel _input_to_cell_reduction{};
-    NEGEMMLowpMatrixAReductionKernel _recurrent_to_cell_reduction{};
-    NEGEMMLowpMatrixAReductionKernel _input_to_output_reduction{};
-    NEGEMMLowpMatrixAReductionKernel _recurrent_to_output_reduction{};
-    NEGEMMLowpMatrixAReductionKernel _projection_reduction{};
-    NEArithmeticAddition _projection_bias_add{};
-    NEGEMMLowpMatrixMultiplyCore _mm_input_to_forget{};
-    NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget{};
-    NEPixelWiseMultiplication _pixelwise_mul_cell_to_forget{};
-    NEGEMMLowpOutputStage _input_to_forget_outstage{};
-    NEGEMMLowpOutputStage _recurrent_to_forget_outstage{};
-    NEGEMMLowpOutputStage _cell_to_forget_outstage{};
-    NEArithmeticAddition _accumulate_input_recurrent_forget{};
-    NEArithmeticAddition _accumulate_cell_forget{};
-    NEActivationLayer _forget_gate_sigmoid{};
-    NEGEMMLowpMatrixMultiplyCore _mm_input_to_cell{};
-    NEGEMMLowpOutputStage _input_to_cell_outstage{};
-    NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell{};
-    NEGEMMLowpOutputStage _recurrent_to_cell_outstage{};
-    NEArithmeticAddition _accumulate_input_recurrent_modulation{};
-    NEActivationLayer _cell_gate_tanh{};
-    NEArithmeticSubtraction _input_gate_sub{};
-    NEGEMMLowpMatrixMultiplyCore _mm_input_to_input{};
-    NEGEMMLowpOutputStage _input_to_input_outstage{};
-    NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input{};
-    NEGEMMLowpOutputStage _recurrent_to_input_outstage{};
-    NEArithmeticAddition _accumulate_input_recurrent_input{};
-    NEPixelWiseMultiplication _pixelwise_mul_cell_to_input{};
-    NEGEMMLowpOutputStage _cell_to_input_outstage{};
-    NEArithmeticAddition _accumulate_cell_input{};
-    NEActivationLayer _input_gate_sigmoid{};
-    NEPixelWiseMultiplication _pixelwise_mul_forget_cell{};
-    NEPixelWiseMultiplication _pixelwise_mul_input_cell{};
-    NEArithmeticAddition _add_forget_cell{};
-    NEActivationLayer _cell_clip{};
-    NEGEMMLowpMatrixMultiplyCore _mm_input_to_output{};
-    NEGEMMLowpOutputStage _input_to_output_outstage{};
-    NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output{};
-    NEGEMMLowpOutputStage _recurrent_to_output_outstage{};
-    NEArithmeticAddition _accumulate_input_recurrent_output{};
-    NEPixelWiseMultiplication _pixelwise_mul_cell_to_output{};
-    NEGEMMLowpOutputStage _cell_to_output_outstage{};
-    NEArithmeticAddition _accumulate_cell_to_output{};
-    NEActivationLayer _output_gate_sigmoid{};
-    NEActivationLayer _hidden_tanh{};
-    NEPixelWiseMultiplication _pixelwise_mul_hidden{};
-    NEGEMMLowpOutputStage _hidden_outstage{};
-    NEGEMMLowpMatrixMultiplyCore _mm_projection{};
-    NEGEMMLowpOutputStage _projection_outstage{};
-    NEArithmeticAddition _accumulate_projection{};
-    NEActivationLayer _projection_clip{};
+    NETranspose _transpose_input_to_forget_weights;
+    NETranspose _transpose_input_to_cell_weights;
+    NETranspose _transpose_input_to_output_weights;
+    NETranspose _transpose_input_to_input_weights;
+    NETranspose _transpose_recurrent_to_forget_weights;
+    NETranspose _transpose_recurrent_to_cell_weights;
+    NETranspose _transpose_recurrent_to_output_weights;
+    NETranspose _transpose_recurrent_to_input_weights;
+    NETranspose _transpose_projection_weights;
+    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_input_reduction;
+    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_input_reduction;
+    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_forget_reduction;
+    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_forget_reduction;
+    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_cell_reduction;
+    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_cell_reduction;
+    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_output_reduction;
+    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_output_reduction;
+    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _projection_reduction;
+    NEArithmeticAddition _projection_bias_add;
+    NEGEMMLowpMatrixMultiplyCore _mm_input_to_forget;
+    NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget;
+    NEPixelWiseMultiplication _pixelwise_mul_cell_to_forget;
+    NEGEMMLowpOutputStage _input_to_forget_outstage;
+    NEGEMMLowpOutputStage _recurrent_to_forget_outstage;
+    NEGEMMLowpOutputStage _cell_to_forget_outstage;
+    NEArithmeticAddition _accumulate_input_recurrent_forget;
+    NEArithmeticAddition _accumulate_cell_forget;
+    NEActivationLayer _forget_gate_sigmoid;
+    NEGEMMLowpMatrixMultiplyCore _mm_input_to_cell;
+    NEGEMMLowpOutputStage _input_to_cell_outstage;
+    NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell;
+    NEGEMMLowpOutputStage _recurrent_to_cell_outstage;
+    NEArithmeticAddition _accumulate_input_recurrent_modulation;
+    NEActivationLayer _cell_gate_tanh;
+    NEArithmeticSubtraction _input_gate_sub;
+    NEGEMMLowpMatrixMultiplyCore _mm_input_to_input;
+    NEGEMMLowpOutputStage _input_to_input_outstage;
+    NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input;
+    NEGEMMLowpOutputStage _recurrent_to_input_outstage;
+    NEArithmeticAddition _accumulate_input_recurrent_input;
+    NEPixelWiseMultiplication _pixelwise_mul_cell_to_input;
+    NEGEMMLowpOutputStage _cell_to_input_outstage;
+    NEArithmeticAddition _accumulate_cell_input;
+    NEActivationLayer _input_gate_sigmoid;
+    NEPixelWiseMultiplication _pixelwise_mul_forget_cell;
+    NEPixelWiseMultiplication _pixelwise_mul_input_cell;
+    NEArithmeticAddition _add_forget_cell;
+    NEActivationLayer _cell_clip;
+    NEGEMMLowpMatrixMultiplyCore _mm_input_to_output;
+    NEGEMMLowpOutputStage _input_to_output_outstage;
+    NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output;
+    NEGEMMLowpOutputStage _recurrent_to_output_outstage;
+    NEArithmeticAddition _accumulate_input_recurrent_output;
+    NEPixelWiseMultiplication _pixelwise_mul_cell_to_output;
+    NEGEMMLowpOutputStage _cell_to_output_outstage;
+    NEArithmeticAddition _accumulate_cell_to_output;
+    NEActivationLayer _output_gate_sigmoid;
+    NEActivationLayer _hidden_tanh;
+    NEPixelWiseMultiplication _pixelwise_mul_hidden;
+    NEGEMMLowpOutputStage _hidden_outstage;
+    NEGEMMLowpMatrixMultiplyCore _mm_projection;
+    NEGEMMLowpOutputStage _projection_outstage;
+    NEArithmeticAddition _accumulate_projection;
+    NEActivationLayer _projection_clip;

-    TensorCopyKernel _projection_bias_copy{};
-    TensorCopyKernel _projection_output_to_accumulate_copy{};
-    TensorCopyKernel _projection_accumulate_to_output_copy{};
-    TensorCopyKernel _hidden_to_output_copy{};
+    TensorCopyKernel _projection_bias_copy;
+    TensorCopyKernel _projection_output_to_accumulate_copy;
+    TensorCopyKernel _projection_accumulate_to_output_copy;
+    TensorCopyKernel _hidden_to_output_copy;

-    std::array<NEQLSTMLayerNormalizationKernel, _layer_norm_count> _layer_norms{ {} };
+    std::array<std::unique_ptr<NEQLSTMLayerNormalizationKernel>, _layer_norm_count> _layer_norms;

-    NECopyKernel _copy_output{};
+    NECopy _copy_output;

     // Tensor pointers
     const ITensor *_input_to_input_weights{ nullptr };
@@ -324,8 +331,8 @@ class NEQLSTMLayer : public IFunction
     const ITensor *_recurrent_to_cell_weights{ nullptr };
     const ITensor *_recurrent_to_output_weights{ nullptr };
     const ITensor *_projection_weights{ nullptr };
-    std::array<const ITensor *, _layer_norm_count> _layer_norm_weights{ {} };
-    std::array<const ITensor *, _layer_norm_count> _layer_norm_bias{ {} };
+    std::array<const ITensor *, _layer_norm_count> _layer_norm_weights{};
+    std::array<const ITensor *, _layer_norm_count> _layer_norm_bias{};

     using LayerNormIndexType = typename std::underlying_type<LayerNormGate>::type;
     inline LayerNormIndexType getGateIndex(LayerNormGate g)
@@ -353,32 +360,13 @@ class NEQLSTMLayer : public IFunction
         return _layer_norm_bias[getGateIndex(g)];
     }

-    inline NEQLSTMLayerNormalizationKernel &get_layer_norm(LayerNormGate g)
+    inline std::unique_ptr<NEQLSTMLayerNormalizationKernel> &get_layer_norm(LayerNormGate g)
     {
         return _layer_norms[getGateIndex(g)];
     }

-    inline void configure_layer_norm(LayerNormGate g, const ITensor *in)
-    {
-        ARM_COMPUTE_ERROR_ON(!_has_layer_norm);
-
-        Tensor &out = get_layer_norm_output(g);
-        _memory_group.manage(&out);
-        out.allocator()->init(*(in->info()));
-
-        get_layer_norm(g).configure(in, &out, get_layer_norm_weight(g), get_layer_norm_bias(g));
-    }
-
-    inline static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias)
-    {
-        // Output quantization scale will be different, but ignored here
-        // since it will be configured at configure() stage.
-        const TensorInfo out
-        {
-            in
-        };
-        return NEQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias);
-    }
+    void configure_layer_norm(LayerNormGate g, const ITensor *in);
+    static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias);

     // Temporary tensors
     Tensor _input_to_forget_weights_transposed{ nullptr };
@@ -434,7 +422,7 @@ class NEQLSTMLayer : public IFunction
     Tensor _projection_out_res{ nullptr };
     Tensor _projection_accumulate_res{ nullptr };
     Tensor _ones{ nullptr };
-    std::array<Tensor, _layer_norm_count> _layer_norm_output{ {} };
+    std::array<Tensor, _layer_norm_count> _layer_norm_output{};

     inline Tensor &get_layer_norm_output(LayerNormGate g)
     {
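configure_layer_norm() and validate_layer_norm() lose their inline bodies in the hunk above and become out-of-line declarations. A plausible .cpp-side definition for the configure half, reconstructed from the deleted inline body plus the new unique_ptr member type (the make_unique helper comes with the newly added support/MemorySupport.h include; this is a sketch, not verbatim from the patch):

void NEQLSTMLayer::configure_layer_norm(NEQLSTMLayer::LayerNormGate g, const ITensor *in)
{
    ARM_COMPUTE_ERROR_ON(!_has_layer_norm);

    Tensor &out = get_layer_norm_output(g);
    _memory_group.manage(&out);
    out.allocator()->init(*(in->info()));

    // The kernel is now created lazily instead of living by value in the header
    get_layer_norm(g) = support::cpp14::make_unique<NEQLSTMLayerNormalizationKernel>();
    get_layer_norm(g)->configure(in, &out, get_layer_norm_weight(g), get_layer_norm_bias(g));
}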
diff --git a/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h b/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
index 266b3df87a..36302f4741 100644
--- a/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
@@ -26,7 +26,6 @@

 #include "arm_compute/runtime/IFunction.h"

-#include "arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h"
 #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
 #include "arm_compute/core/Types.h"

@@ -34,6 +33,7 @@ namespace arm_compute
 {
 class ITensor;
+class ITensorInfo;

 /** Basic function to simulate a quantization layer. This function calls the following NEON kernels:
 *
@@ -44,8 +44,6 @@ class ITensor;
 class NEQuantizationLayer : public INESimpleFunctionNoBorder
 {
 public:
-    /** Default constructor */
-    NEQuantizationLayer() = default;
     /** Set the input and output tensors.
      *
      * @param[in] input Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
diff --git a/arm_compute/runtime/NEON/functions/NERNNLayer.h b/arm_compute/runtime/NEON/functions/NERNNLayer.h
index 12e3ef9c57..c42b303a89 100644
--- a/arm_compute/runtime/NEON/functions/NERNNLayer.h
+++ b/arm_compute/runtime/NEON/functions/NERNNLayer.h
@@ -24,8 +24,6 @@
 #ifndef ARM_COMPUTE_NERNNLAYER_H
 #define ARM_COMPUTE_NERNNLAYER_H

-#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
-
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
@@ -36,6 +34,7 @@ namespace arm_compute
 {
 // Forward declarations
 class ITensor;
+class NECopyKernel;

 /** Basic function to run @ref NERNNLayer */
 class NERNNLayer : public IFunction
@@ -45,12 +44,14 @@ class NERNNLayer : public IFunction
     NERNNLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NERNNLayer(const NERNNLayer &) = delete;
-    /** Default move constructor */
-    NERNNLayer(NERNNLayer &&) = default;
+    /** Prevent instances of this class from being moved (As this class contains pointers) */
+    NERNNLayer(NERNNLayer &&) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NERNNLayer &operator=(const NERNNLayer &) = delete;
-    /** Default move assignment operator */
-    NERNNLayer &operator=(NERNNLayer &&) = default;
+    /** Prevent instances of this class from being moved (As this class contains pointers) */
+    NERNNLayer &operator=(NERNNLayer &&) = delete;
+    /** Default destructor */
+    ~NERNNLayer();
     /** Initialize the function
      *
      * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data types supported: F16/F32
@@ -82,16 +83,16 @@ class NERNNLayer : public IFunction
     void prepare() override;

 private:
-    MemoryGroup           _memory_group;
-    NEGEMM                _gemm_state_f;
-    NEArithmeticAddition  _add_f;
-    NEActivationLayer     _activation;
-    NEFullyConnectedLayer _fully_connected;
-    NECopyKernel          _copy_kernel;
-    Tensor                _fully_connected_out;
-    Tensor                _gemm_output;
-    Tensor                _add_output;
-    bool                  _is_prepared;
+    MemoryGroup                   _memory_group;
+    NEGEMM                        _gemm_state_f;
+    NEArithmeticAddition          _add_f;
+    NEActivationLayer             _activation;
+    NEFullyConnectedLayer         _fully_connected;
+    std::unique_ptr<NECopyKernel> _copy_kernel;
+    Tensor                        _fully_connected_out;
+    Tensor                        _gemm_output;
+    Tensor                        _add_output;
+    bool                          _is_prepared;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NERNNLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h b/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h
index 3e8db55f99..ea3be18932 100644
--- a/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h
@@ -24,11 +24,13 @@
 #ifndef ARM_COMPUTE_NEROIALIGNLAYER_H
 #define ARM_COMPUTE_NEROIALIGNLAYER_H

-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"

 namespace arm_compute
 {
 class ITensor;
+class ITensorInfo;

 /** Basic function to run @ref NEROIAlignLayerKernel.
 *
@@ -36,12 +38,12 @@ class ITensor;
 * -# @ref NEROIAlignLayerKernel
 *
 */
-class NEROIAlignLayer : public INESimpleFunction
+class NEROIAlignLayer : public INESimpleFunctionNoBorder
 {
 public:
     /** Set the input and output tensors.
      *
-     * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32.
+     * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in] rois  ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner
      *                  as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ].
      *                  Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8, otherwise same as @p input
@@ -56,7 +58,7 @@ class NEROIAlignLayer : public INESimpleFunctionNoBorder
     void configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info);
     /** Static function to check if given info will lead to a valid configuration of @ref NEROIAlignLayerKernel
      *
-     * @param[in] input  Source tensor info. Data types supported: QASYMM8/F16/F32.
+     * @param[in] input  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in] rois   ROIs tensor info. Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8,
      *                   otherwise same as @p input
      * @param[in] output Destination tensor info. Data types supported: Same as @p input.
diff --git a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
index 08885d0e58..0b9b4f75fc 100644
--- a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -27,11 +27,13 @@
 #include "arm_compute/runtime/IFunction.h"

 #include "arm_compute/core/IArray.h"
-#include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h"
+#include <memory>

 namespace arm_compute
 {
 class ITensor;
+class NEROIPoolingLayerKernel;
+class ROIPoolingLayerInfo;

 /** Basic function to run @ref NEROIPoolingLayerKernel.
 *
@@ -44,6 +46,16 @@ class NEROIPoolingLayer : public IFunction
 public:
     /** Constructor */
     NEROIPoolingLayer();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEROIPoolingLayer(const NEROIPoolingLayer &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEROIPoolingLayer &operator=(const NEROIPoolingLayer &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEROIPoolingLayer(NEROIPoolingLayer &&) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEROIPoolingLayer &operator=(NEROIPoolingLayer &&) = delete;
+    /** Default destructor */
+    ~NEROIPoolingLayer();
     /** Set the input and output tensors.
      *
      * @param[in] input Source tensor. Data types supported: F32.
@@ -63,7 +75,7 @@ class NEROIPoolingLayer : public IFunction
     void run() override;

 private:
-    NEROIPoolingLayerKernel _roi_kernel;
+    std::unique_ptr<NEROIPoolingLayerKernel> _roi_kernel;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEROIPOOLINGLAYER_H */
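Both ROI functions above consume the same rois layout: a 2D tensor of shape [5, N] whose columns hold { batch_id, x1, y1, x2, y2 } for one region each. A sketch of building such a tensor for an F32 input (num_rois is an illustrative value, not from the patch):

const unsigned int num_rois = 16; // illustrative
arm_compute::Tensor rois;
rois.allocator()->init(arm_compute::TensorInfo(
    arm_compute::TensorShape(5U, num_rois), 1, arm_compute::DataType::F32));
rois.allocator()->allocate();
// each column is then filled with { batch_id, x1, y1, x2, y2 } before run()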
diff --git a/arm_compute/runtime/NEON/functions/NERange.h b/arm_compute/runtime/NEON/functions/NERange.h
index 04889d4d6f..28976001d7 100644
--- a/arm_compute/runtime/NEON/functions/NERange.h
+++ b/arm_compute/runtime/NEON/functions/NERange.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,24 +24,37 @@
 #ifndef ARM_COMPUTE_NERANGE_H
 #define ARM_COMPUTE_NERANGE_H

-#include "arm_compute/core/NEON/kernels/NERangeKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"

+#include <memory>

 namespace arm_compute
 {
 class ITensor;
+class ITensorInfo;
+class NERangeKernel;

 /** Basic function to run @ref NERangeKernel
 *
 * @note The tensor data type for the output must be U8/S8/U16/S16/U32/S32/F16/F32.
 * @note The function generates a sequence with the given start, end and step.
+ *
 */
 class NERange : public IFunction
 {
 public:
     /** Default constructor */
     NERange();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NERange(const NERange &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NERange &operator=(const NERange &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NERange(NERange &&) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NERange &operator=(NERange &&) = delete;
+    /** Default destructor */
+    ~NERange();
     /** Initialize the kernel's start, end, step and output tensor.
      *
      * @param[out] output Output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
@@ -65,7 +78,7 @@ class NERange : public IFunction
     void run() override;

 private:
-    NERangeKernel _kernel;
+    std::unique_ptr<NERangeKernel> _kernel;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NERANGE_H */
diff --git a/arm_compute/runtime/NEON/functions/NEReduceMean.h b/arm_compute/runtime/NEON/functions/NEReduceMean.h
index a1b6e348df..89cd09812b 100644
--- a/arm_compute/runtime/NEON/functions/NEReduceMean.h
+++ b/arm_compute/runtime/NEON/functions/NEReduceMean.h
@@ -26,9 +26,10 @@

 #include "arm_compute/runtime/IFunction.h"

-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
 #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
 #include "arm_compute/runtime/Tensor.h"
@@ -41,6 +42,16 @@ class NEReduceMean : public IFunction
 public:
     /** Constructor */
     NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEReduceMean(const NEReduceMean &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEReduceMean &operator=(const NEReduceMean &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEReduceMean(NEReduceMean &&) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+    NEReduceMean &operator=(NEReduceMean &&) = delete;
+    /** Default destructor */
+    ~NEReduceMean();
     /** Configure kernel
      *
      * @note Supported tensor rank: up to 4
@@ -71,8 +82,13 @@ class NEReduceMean : public IFunction
     std::vector<NEReductionOperation> _reduction_kernels;
     std::vector<Tensor>               _reduced_outs;
     NEReshapeLayer                    _reshape;
+    NEDequantizationLayer             _dequant;
+    NEQuantizationLayer               _requant;
     int                               _reduction_ops;
     bool                              _keep_dims;
+    bool                              _do_requant;
+    Tensor                            _input_no_quant;
+    Tensor                            _output_no_quant;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEON_REDUCE_MEAN_H */
diff --git a/arm_compute/runtime/NEON/functions/NEReductionOperation.h b/arm_compute/runtime/NEON/functions/NEReductionOperation.h
index ab6928b241..8186e2e355 100644
--- a/arm_compute/runtime/NEON/functions/NEReductionOperation.h
+++ b/arm_compute/runtime/NEON/functions/NEReductionOperation.h
@@ -26,18 +26,18 @@

 #include "arm_compute/runtime/IFunction.h"

-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEReductionOperationKernel.h"
 #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
 #include "arm_compute/runtime/Tensor.h"

+#include <memory>

 namespace arm_compute
 {
 class ITensor;
+class NEReductionOperationKernel;

 /** Basic function to simulate a reduction operation. This function calls the following NEON kernels:
 *
-* -# @ref NEFillBorderKernel
+* -# @ref NEReshapeLayer
 * -# @ref NEReductionOperationKernel
 *
 */
@@ -46,6 +46,16 @@ class NEReductionOperation : public IFunction
 public:
     /** Default constructor */
     NEReductionOperation(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEReductionOperation(const NEReductionOperation &) = delete;
+    /** Default move constructor */
+    NEReductionOperation(NEReductionOperation &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEReductionOperation &operator=(const NEReductionOperation &) = delete;
+    /** Default move assignment operator */
+    NEReductionOperation &operator=(NEReductionOperation &&) = default;
+    /** Default destructor */
+    ~NEReductionOperation();
     /** Set the input and output tensors.
      *
      * @param[in, out] input Source tensor. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32. Data layouts supported: NCHW. (Written to only for border_size != 0)
@@ -72,14 +82,13 @@ class NEReductionOperation : public IFunction
     void run() override;

 private:
-    MemoryGroup                _memory_group;
-    NEReductionOperationKernel _reduction_kernel;
-    NEFillBorderKernel         _fill_border_kernel;
-    NEReshapeLayer             _reshape;
-    Tensor                     _output_internal;
-    size_t                     _window_split;
-    int                        _reduction_axis;
-    bool                       _is_reshape_required;
+    MemoryGroup                                 _memory_group;
+    std::unique_ptr<NEReductionOperationKernel> _reduction_kernel;
+    NEReshapeLayer                              _reshape;
+    Tensor                                      _output_internal;
+    size_t                                      _window_split;
+    int                                         _reduction_axis;
+    bool                                        _is_reshape_required;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEREDUCTIONOPERATION_H */
diff --git a/arm_compute/runtime/NEON/functions/NERemap.h b/arm_compute/runtime/NEON/functions/NERemap.h
index f087bd2e3c..86f366a697 100644
--- a/arm_compute/runtime/NEON/functions/NERemap.h
+++ b/arm_compute/runtime/NEON/functions/NERemap.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -38,6 +38,9 @@ class ITensor;
 *
 * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
 * -# @ref NERemapKernel
+ *
+ * @deprecated This function is deprecated and is intended to be removed in 21.05 release
+ *
 */
 class NERemap : public INESimpleFunction
 {
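The new _dequant/_requant members in the NEReduceMean changes above mean quantized inputs are internally dequantized, reduced in float, then requantized; client code is unchanged. A usage sketch assuming the (input, reduction_axis, keep_dims, output) configure() signature and configured src/dst tensors:

arm_compute::NEReduceMean mean;
// reduce over axes 0 and 1 (axis meaning follows the tensor's data layout),
// keeping the reduced dimensions with size 1 in the output shape
mean.configure(&src, arm_compute::Coordinates(0, 1), /* keep_dims = */ true, &dst);
mean.run();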
diff --git a/arm_compute/runtime/NEON/functions/NEReorgLayer.h b/arm_compute/runtime/NEON/functions/NEReorgLayer.h
index 19385e1b74..f76d1d252c 100644
--- a/arm_compute/runtime/NEON/functions/NEReorgLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEReorgLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -31,6 +31,7 @@ namespace arm_compute
 {
 // Forward declarations
 class ITensor;
+class ITensorInfo;

 /** Basic function to run @ref NEReorgLayerKernel */
 class NEReorgLayer : public INESimpleFunctionNoBorder
diff --git a/arm_compute/runtime/NEON/functions/NEReshapeLayer.h b/arm_compute/runtime/NEON/functions/NEReshapeLayer.h
index 2ca6660139..641a96e0f9 100644
--- a/arm_compute/runtime/NEON/functions/NEReshapeLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEReshapeLayer.h
@@ -24,7 +24,6 @@
 #ifndef ARM_COMPUTE_NERESHAPELAYER_H
 #define ARM_COMPUTE_NERESHAPELAYER_H

-#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/NEON/INEOperator.h"
@@ -81,6 +80,18 @@ namespace experimental
 class NEReshape : public INEOperator
 {
 public:
+    /** Default Constructor */
+    NEReshape() = default;
+    /** Default Destructor */
+    ~NEReshape();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEReshape(const NEReshape &) = delete;
+    /** Default move constructor */
+    NEReshape(NEReshape &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEReshape &operator=(const NEReshape &) = delete;
+    /** Default move assignment operator */
+    NEReshape &operator=(NEReshape &&);
     /** Initialise the kernel's inputs and outputs
      *
      * @param[in] input Input tensor info. Data type supported: All
diff --git a/arm_compute/runtime/NEON/functions/NEReverse.h b/arm_compute/runtime/NEON/functions/NEReverse.h
index 7a4566db28..2048dafcb5 100644
--- a/arm_compute/runtime/NEON/functions/NEReverse.h
+++ b/arm_compute/runtime/NEON/functions/NEReverse.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -31,6 +31,7 @@
 namespace arm_compute
 {
 class ITensor;
+class ITensorInfo;

 /** Basic function to run @ref NEReverseKernel */
 class NEReverse : public INESimpleFunctionNoBorder
diff --git a/arm_compute/runtime/NEON/functions/NEScale.h b/arm_compute/runtime/NEON/functions/NEScale.h
index f149e3bbb9..fceda83510 100644
--- a/arm_compute/runtime/NEON/functions/NEScale.h
+++ b/arm_compute/runtime/NEON/functions/NEScale.h
@@ -24,20 +24,17 @@
 #ifndef ARM_COMPUTE_NESCALEIMAGE_H
 #define ARM_COMPUTE_NESCALEIMAGE_H

-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
+#include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
 #include "arm_compute/runtime/Tensor.h"

-#include <cstdint>
-
 namespace arm_compute
 {
 class ITensor;

 /** Basic function to run @ref NEScaleKernel */
-class NEScale : public IFunction
+class NEScale : public INESimpleFunctionNoBorder
 {
 public:
     /** Constructor
      *
      * Initialize NEScale
      */
     NEScale();
-    /** Initialize the function's source, destination, interpolation type and border_mode.
-     *
-     * @param[in, out] input                 Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]     output                Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in]      policy                The interpolation type.
-     * @param[in]      border_mode           Strategy to use for borders.
-     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     * @param[in]      sampling_policy       (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
-     * @param[in]      use_padding           (Optional) Is padding in use or not. Defaults to true.
-     * @param[in]      align_corners         (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
-     */
-    ARM_COMPUTE_DEPRECATED_REL(20.08)
-    void configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue(),
-                   SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool use_padding = true, bool align_corners = false);
     /** Initialize the function's source, destination, interpolation type and border_mode.
      *
      * @param[in, out] input  Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
      * @param[out]     output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
      * @param[in]      info   @ref ScaleKernelInfo to be used for configuration
      */
     void configure(ITensor *input, ITensor *output, const ScaleKernelInfo &info);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEScale
-     *
-     * @param[in] input                 Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
-     * @param[in] output                Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in] policy                The interpolation type.
-     * @param[in] border_mode           Strategy to use for borders.
-     * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     * @param[in] sampling_policy       (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
-     * @param[in] use_padding           (Optional) Is padding in use or not. Defaults to true.
-     * @param[in] align_corners         (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
-     *
-     * @return a status
-     */
-    ARM_COMPUTE_DEPRECATED_REL(20.08)
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, BorderMode border_mode,
-                           PixelValue constant_border_value = PixelValue(), SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool use_padding = true, bool align_corners = false);
     /** Static function to check if given info will lead to a valid configuration of @ref NEScale
      *
      * @param[in] input  Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
@@ -92,16 +59,10 @@ class NEScale : public IFunction
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info);

-    // Inherited methods overridden:
-    void run() override;
-
 private:
-    Tensor             _offsets;        /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */
-    Tensor             _dx;             /**< Element's distance between the X real coordinate and the smallest X following integer */
-    Tensor             _dy;             /**< Element's distance between the Y real coordinate and the smallest Y following integer */
-    NEScaleKernel      _scale_kernel;   /**< Kernel to perform the scaling */
-    NEFillBorderKernel _border_handler; /**< kernel to handle tensor borders */
-    bool               _use_padding;    /**< Is padding used on the tensors */
+    Tensor _offsets; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */
+    Tensor _dx;      /**< Element's distance between the X real coordinate and the smallest X following integer */
+    Tensor _dy;      /**< Element's distance between the Y real coordinate and the smallest Y following integer */
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NESCALEIMAGE_H */
(Written to only for @p border_mode != UNDEFINED) @@ -92,16 +59,10 @@ class NEScale : public IFunction */ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info); - // Inherited methods overridden: - void run() override; - private: - Tensor _offsets; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */ - Tensor _dx; /**< Element's distance between the X real coordinate and the smallest X following integer */ - Tensor _dy; /**< Element's distance between the Y real coordinate and the smallest Y following integer */ - NEScaleKernel _scale_kernel; /**< Kernel to perform the scaling */ - NEFillBorderKernel _border_handler; /**< kernel to handle tensor borders */ - bool _use_padding; /**< Is padding used on the tensors */ + Tensor _offsets; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */ + Tensor _dx; /**< Element's distance between the X real coordinate and the smallest X following integer */ + Tensor _dy; /**< Element's distance between the Y real coordinate and the smallest Y following integer */ }; } // namespace arm_compute #endif /*ARM_COMPUTE_NESCALEIMAGE_H */ diff --git a/arm_compute/runtime/NEON/functions/NEScharr3x3.h b/arm_compute/runtime/NEON/functions/NEScharr3x3.h index 0113104caf..8dd8a80287 100644 --- a/arm_compute/runtime/NEON/functions/NEScharr3x3.h +++ b/arm_compute/runtime/NEON/functions/NEScharr3x3.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -38,6 +38,8 @@ class ITensor; * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE) * -# @ref NEScharr3x3Kernel * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class NEScharr3x3 : public INESimpleFunction { diff --git a/arm_compute/runtime/NEON/functions/NESelect.h b/arm_compute/runtime/NEON/functions/NESelect.h index 258ac5d64d..c66fbfa7d4 100644 --- a/arm_compute/runtime/NEON/functions/NESelect.h +++ b/arm_compute/runtime/NEON/functions/NESelect.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,15 +25,16 @@ #define ARM_COMPUTE_NESELECT_H #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/INESimpleFunction.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" namespace arm_compute { // Forward declarations class ITensor; +class ITensorInfo; /** Basic function to run @ref NESelect */ -class NESelect : public INESimpleFunction +class NESelect : public INESimpleFunctionNoBorder { public: /** Initialise the kernel's inputs and output. diff --git a/arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h b/arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h deleted file mode 100644 index a814802ead..0000000000 --- a/arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_NESIMPLEASSEMBLYFUNCTION_H -#define ARM_COMPUTE_NESIMPLEASSEMBLYFUNCTION_H - -#include "arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h" -#include "arm_compute/runtime/IFunction.h" - -#include <memory> - -namespace arm_compute -{ -/** Basic interface for functions which have a single NEON GEMM wrapper kernel to run */ -class NESimpleAssemblyFunction : public IFunction -{ -public: - /** Constructor */ - NESimpleAssemblyFunction(); - - /** Configure the function with the kernel to run - * - * @param[in] kernel GEMM Wrapper kernel configured and ready to run - * - * @note The kernel is expected to have a 1D window. The function will multi-thread this window across the X dimension. - */ - void configure(std::unique_ptr<INEGEMMWrapperKernel> kernel); - - // Inherited methods overridden: - void run() override final; - -protected: - std::unique_ptr<INEGEMMWrapperKernel> _kernel; /**< Kernel to run */ -}; -} //namespace arm_compute -#endif /*ARM_COMPUTE_NESIMPLEASSEMBLYFUNCTION_H */ diff --git a/arm_compute/runtime/NEON/functions/NESobel3x3.h b/arm_compute/runtime/NEON/functions/NESobel3x3.h index 4dbdfd223b..89a2e07570 100644 --- a/arm_compute/runtime/NEON/functions/NESobel3x3.h +++ b/arm_compute/runtime/NEON/functions/NESobel3x3.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -38,6 +38,8 @@ class ITensor; * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE) * -# @ref NESobel3x3Kernel * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class NESobel3x3 : public INESimpleFunction { diff --git a/arm_compute/runtime/NEON/functions/NESobel5x5.h b/arm_compute/runtime/NEON/functions/NESobel5x5.h index b5365bc1b7..79e653b395 100644 --- a/arm_compute/runtime/NEON/functions/NESobel5x5.h +++ b/arm_compute/runtime/NEON/functions/NESobel5x5.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -24,8 +24,6 @@ #ifndef ARM_COMPUTE_NESOBEL5x5_H #define ARM_COMPUTE_NESOBEL5x5_H -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" -#include "arm_compute/core/NEON/kernels/NESobel5x5Kernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" @@ -38,6 +36,9 @@ namespace arm_compute { class ITensor; +class NESobel5x5HorKernel; +class NESobel5x5VertKernel; +class NEFillBorderKernel; /** Basic function to execute sobel 5x5 filter. This function calls the following NEON kernels: * @@ -45,12 +46,24 @@ class ITensor; * -# @ref NESobel5x5HorKernel * -# @ref NESobel5x5VertKernel * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class NESobel5x5 : public IFunction { public: /** Default constructor */ NESobel5x5(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESobel5x5(const NESobel5x5 &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESobel5x5 &operator=(const NESobel5x5 &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NESobel5x5(NESobel5x5 &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NESobel5x5 &operator=(NESobel5x5 &&) = delete; + /** Default destructor */ + ~NESobel5x5(); /** Initialise the function's source, destinations and border mode. * * @note At least one of output_x or output_y must be not NULL. @@ -68,12 +81,12 @@ class NESobel5x5 : public IFunction void run() override; protected: - MemoryGroup _memory_group; /**< Function memory group */ - NESobel5x5HorKernel _sobel_hor; /**< Sobel Horizontal 5x5 kernel */ - NESobel5x5VertKernel _sobel_vert; /**< Sobel Vertical 5x5 kernel */ - Tensor _tmp_x; /**< Temporary buffer for Sobel X */ - Tensor _tmp_y; /**< Temporary buffer for Sobel Y */ - NEFillBorderKernel _border_handler; /**< Kernel to handle tensor borders */ + MemoryGroup _memory_group; /**< Function memory group */ + std::unique_ptr<NESobel5x5HorKernel> _sobel_hor; /**< Sobel Horizontal 5x5 kernel */ + std::unique_ptr<NESobel5x5VertKernel> _sobel_vert; /**< Sobel Vertical 5x5 kernel */ + Tensor _tmp_x; /**< Temporary buffer for Sobel X */ + Tensor _tmp_y; /**< Temporary buffer for Sobel Y */ + std::unique_ptr<NEFillBorderKernel> _border_handler; /**< Kernel to handle tensor borders */ }; } #endif /*ARM_COMPUTE_NESOBEL5x5_H */ diff --git a/arm_compute/runtime/NEON/functions/NESobel7x7.h b/arm_compute/runtime/NEON/functions/NESobel7x7.h index 925444d85b..7395bb0198 100644 --- a/arm_compute/runtime/NEON/functions/NESobel7x7.h +++ b/arm_compute/runtime/NEON/functions/NESobel7x7.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,8 +24,6 @@ #ifndef ARM_COMPUTE_NESOBEL7x7_H #define ARM_COMPUTE_NESOBEL7x7_H -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" -#include "arm_compute/core/NEON/kernels/NESobel7x7Kernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" @@ -38,6 +36,9 @@ namespace arm_compute { class ITensor; +class NESobel7x7HorKernel; +class NESobel7x7VertKernel; +class NEFillBorderKernel; /** Basic function to execute sobel 7x7 filter.
This function calls the following NEON kernels: * @@ -45,12 +46,24 @@ class ITensor; * -# @ref NESobel7x7HorKernel * -# @ref NESobel7x7VertKernel * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * */ class NESobel7x7 : public IFunction { public: /** Default constructor */ NESobel7x7(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESobel7x7(const NESobel7x7 &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESobel7x7 &operator=(const NESobel7x7 &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NESobel7x7(NESobel7x7 &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NESobel7x7 &operator=(NESobel7x7 &&) = delete; + /** Default destructor */ + ~NESobel7x7(); /** Initialise the function's source, destinations and border mode. * * @note At least one of output_x or output_y must be not NULL. @@ -68,12 +81,12 @@ class NESobel7x7 : public IFunction void run() override; protected: - MemoryGroup _memory_group; /**< Function memory group */ - NESobel7x7HorKernel _sobel_hor; /**< Sobel Horizontal 7x7 kernel */ - NESobel7x7VertKernel _sobel_vert; /**< Sobel Vertical 7x7 kernel */ - Tensor _tmp_x; /**< Temporary buffer for Sobel X */ - Tensor _tmp_y; /**< Temporary buffer for Sobel Y */ - NEFillBorderKernel _border_handler; /**< Kernel to handle tensor borders */ + MemoryGroup _memory_group; /**< Function memory group */ + std::unique_ptr<NESobel7x7HorKernel> _sobel_hor; /**< Sobel Horizontal 7x7 kernel */ + std::unique_ptr<NESobel7x7VertKernel> _sobel_vert; /**< Sobel Vertical 7x7 kernel */ + Tensor _tmp_x; /**< Temporary buffer for Sobel X */ + Tensor _tmp_y; /**< Temporary buffer for Sobel Y */ + std::unique_ptr<NEFillBorderKernel> _border_handler; /**< Kernel to handle tensor borders */ }; } #endif /*ARM_COMPUTE_NESOBEL7x7_H */ diff --git a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h index 9fb4d85262..40fa38afde 100644 --- a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h +++ b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h @@ -24,17 +24,19 @@ #ifndef ARM_COMPUTE_NESOFTMAXLAYER_H #define ARM_COMPUTE_NESOFTMAXLAYER_H -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" -#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h" -#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" +#include "arm_compute/runtime/NEON/functions/NEPermute.h" #include "arm_compute/runtime/Tensor.h" +#include <memory> namespace arm_compute { class ITensor; +class NELogits1DMaxKernel; +template <bool IS_LOG> +class NELogits1DSoftmaxKernel; +class NEFillBorderKernel; /** Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer.
* @@ -44,7 +46,9 @@ class ITensor; * Log Softmax is calculated by : * @f[ out = (x - max(x) * beta) - log(\sum{e^{x - max(x) * beta}}) @f] * - * This function runs the following kernels: + * This function runs the following function/kernels: + * -# If axis is not 0: + * -# @ref NEPermute * -# @ref NEFillBorderKernel * -# @ref NELogits1DMaxKernel * -# @ref NELogits1DSoftmaxKernel @@ -63,6 +67,8 @@ class NESoftmaxLayerGeneric : public IFunction NESoftmaxLayerGeneric &operator=(const NESoftmaxLayerGeneric &) = delete; /** Default move assignment operator */ NESoftmaxLayerGeneric &operator=(NESoftmaxLayerGeneric &&) = default; + /** Default destructor */ + ~NESoftmaxLayerGeneric(); /** Set the input and output tensors. * * @param[in,out] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. If the width is not a @@ -70,7 +76,8 @@ class NESoftmaxLayerGeneric : public IFunction * last value of each row to the nearest multiple. * @param[out] output Destination tensor. Data types supported: same as @p input. * @param[in] beta (Optional) A scaling factor for the exponent. - * @param[in] axis (Optional) The last axis of the first n dimensions (inclusive)to reduce. Only supports axis 0. + * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and + * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0 */ void configure(ITensor *input, ITensor *output, float beta = 1.0f, int32_t axis = 0); /** Static function to check if given info will lead to a valid configuration of @ref NESoftmaxLayer @@ -78,7 +85,8 @@ class NESoftmaxLayerGeneric : public IFunction * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] output Destination tensor info. Data types supported: same as @p input * @param[in] beta (Optional) A scaling factor for the exponent. - * @param[in] axis (Optional) The last axis of the first n dimensions (inclusive)to reduce. Only supports axis 0. + * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and + * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0 * * @return a status */ @@ -88,30 +96,17 @@ class NESoftmaxLayerGeneric : public IFunction void run() override; private: - /** Utility method to configure the kernels needed to flatten the input - * tensor. - * - * @note This function changes the internal state of this class. In particular, - * it initializes the kernel @p _flatten_kernel and the tensors @p _input_flat and - * @p _output_flat - * - * @param[in] input Original source tensor. - * @param[in] output Original destination tensor. - * @param[in] axis (Optional) The last axis of the first n dimensions (inclusive)to reduce. Only supports axis 0. 
- */ - void configure_reshape_input_kernel(const ITensor *input, const ITensor *output, int32_t axis); - - MemoryGroup _memory_group; - NELogits1DMaxKernel _max_kernel; - NELogits1DSoftmaxKernel<IS_LOG> _softmax_kernel; - std::unique_ptr<IFunction> _flat_or_reshape_ptr; - NEFillBorderKernel _fill_border_kernel; - NEReshapeLayer _reshape; - Tensor _max; - Tensor _tmp; - Tensor _input_flattened; - Tensor _output_flattened; - bool _needs_flattening; + MemoryGroup _memory_group; + NEPermute _permute_input; + NEPermute _permute_output; + std::unique_ptr<NELogits1DMaxKernel> _max_kernel; + std::unique_ptr<NELogits1DSoftmaxKernel<IS_LOG>> _softmax_kernel; + std::unique_ptr<NEFillBorderKernel> _fill_border_kernel; + Tensor _max; + Tensor _tmp; + Tensor _input_permuted; + Tensor _output_permuted; + bool _needs_permute; }; using NESoftmaxLayer = NESoftmaxLayerGeneric<false>; diff --git a/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h b/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h index 6f339e8d52..62af092c40 100644 --- a/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h +++ b/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,13 +26,15 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h" -#include "arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h" #include "arm_compute/core/Types.h" +#include <memory> namespace arm_compute { class ITensor; +class ITensorInfo; +class NESpaceToBatchLayerKernel; +class NEMemsetKernel; /** Basic function to spatial divide a tensor. This function calls the following NEON kernels/functions: * @@ -53,12 +55,12 @@ class NESpaceToBatchLayer : public IFunction /** Allow instances of this class to be moved */ NESpaceToBatchLayer &operator=(NESpaceToBatchLayer &&) = default; /** Default destructor */ - virtual ~NESpaceToBatchLayer() = default; + ~NESpaceToBatchLayer(); /** Set the input and output tensors. * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. - * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32 - * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32 + * @param[in] block_shape 1-D tensor with shape [M]. Supported M: 2. Data types supported: S32 + * @param[in] paddings 2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32 * @param[out] output Tensor output.
Data types supported: same as @p input */ void configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output); @@ -67,16 +69,16 @@ class NESpaceToBatchLayer : public IFunction * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. * @param[in] block_shape_x Block shape x value. * @param[in] block_shape_y Block shape y value. - * @param[in] padding_left The left padding of the output tensor. - * @param[in] padding_right The right padding of the output tensor. + * @param[in] padding_left The padding at the beginning of every dimension of the output tensor. + * @param[in] padding_right The padding at the end of every dimension of the output tensor. * @param[out] output Tensor output. Data types supported: same as @p input */ void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayer * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. - * @param[in] block_shape block shape tensor info with shape [M]. Data types supported: S32 - * @param[in] paddings paddings tensor info with shape [2, M]. Data types supported: S32 + * @param[in] block_shape 1-D tensor with shape [M]. Supported M: 2. Data types supported: S32 + * @param[in] paddings 2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32 * @param[in] output Tensor output info. Data types supported: same as @p input * * @return a status @@ -87,8 +89,8 @@ class NESpaceToBatchLayer : public IFunction * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. * @param[in] block_shape_x Block shape x value. * @param[in] block_shape_y Block shape y value. - * @param[in] padding_left The left padding of the output tensor. - * @param[in] padding_right The right padding of the output tensor. + * @param[in] padding_left The padding at the beginning of every dimension of the output tensor. + * @param[in] padding_right The padding at the end of every dimension of the output tensor. * @param[in] output Tensor output info. Data types supported: same as @p input * * @return a status @@ -99,9 +101,9 @@ class NESpaceToBatchLayer : public IFunction void run() override; private: - NESpaceToBatchLayerKernel _space_to_batch_kernel; /**< SpaceToBatch kernel to run */ - NEMemsetKernel _memset_kernel; /**< Memset kernel to run */ - bool _has_padding; /**< Flag to check if the output has padding */ + std::unique_ptr<NESpaceToBatchLayerKernel> _space_to_batch_kernel; /**< SpaceToBatch kernel to run */ + std::unique_ptr<NEMemsetKernel> _memset_kernel; /**< Memset kernel to run */ + bool _has_padding; /**< Flag to check if the output has padding */ }; } // namespace arm_compute #endif /* ARM_COMPUTE_NESPACETOBATCHLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h b/arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h index 16a9c80d44..1e7aae215d 100644 --- a/arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h +++ b/arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,15 +24,16 @@ #ifndef ARM_COMPUTE_NESPACETODEPTHLAYER_H #define ARM_COMPUTE_NESPACETODEPTHLAYER_H +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h" -#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h" -#include "arm_compute/core/Types.h" +#include <memory> namespace arm_compute { class ITensor; +class ITensorInfo; +class NESpaceToDepthLayerKernel; /** This function calls the following NEON kernels/functions: * @@ -52,7 +53,7 @@ class NESpaceToDepthLayer : public IFunction /** Allow instances of this class to be moved */ NESpaceToDepthLayer &operator=(NESpaceToDepthLayer &&) = default; /** Default destructor */ - virtual ~NESpaceToDepthLayer() = default; + ~NESpaceToDepthLayer(); /** Set the input and output tensors. * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -74,7 +75,7 @@ class NESpaceToDepthLayer : public IFunction void run() override; private: - NESpaceToDepthLayerKernel _space_to_depth_kernel; /**< SpaceToDepth kernel to run */ + std::unique_ptr<NESpaceToDepthLayerKernel> _space_to_depth_kernel; /**< SpaceToDepth kernel to run */ }; } // namespace arm_compute #endif /* ARM_COMPUTE_NESPACETODEPTHLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEStackLayer.h b/arm_compute/runtime/NEON/functions/NEStackLayer.h index 4180b6da08..f6fa4f2eb3 100644 --- a/arm_compute/runtime/NEON/functions/NEStackLayer.h +++ b/arm_compute/runtime/NEON/functions/NEStackLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,14 +27,14 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/NEON/kernels/NEStackLayerKernel.h" - #include <memory> #include <vector> namespace arm_compute { class ITensor; +class ITensorInfo; +class NEStackLayerKernel; /** Basic function to stack tensors along an axis. This function calls the following kernel: * @@ -46,6 +46,16 @@ class NEStackLayer : public IFunction public: /** Default constructor */ NEStackLayer(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEStackLayer(const NEStackLayer &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEStackLayer &operator=(const NEStackLayer &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEStackLayer(NEStackLayer &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEStackLayer &operator=(NEStackLayer &&) = delete; + /** Default destructor */ + ~NEStackLayer(); /** Initialise the kernel's inputs vector and output. * * @note Supported input tensor rank: up to 4 @@ -73,9 +83,9 @@ class NEStackLayer : public IFunction void run() override; private: - std::vector<ITensor *> _input; - std::vector<NEStackLayerKernel> _stack_kernels; - unsigned int _num_inputs; + std::vector<ITensor *> _input; + std::vector<std::unique_ptr<NEStackLayerKernel>> _stack_kernels; + unsigned int _num_inputs; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NESTACKLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NETableLookup.h b/arm_compute/runtime/NEON/functions/NETableLookup.h index fb08274761..03674cd297 100644 --- a/arm_compute/runtime/NEON/functions/NETableLookup.h +++ b/arm_compute/runtime/NEON/functions/NETableLookup.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited.
* * SPDX-License-Identifier: MIT * diff --git a/arm_compute/runtime/NEON/functions/NEThreshold.h b/arm_compute/runtime/NEON/functions/NEThreshold.h index cb9b696769..9860abf835 100644 --- a/arm_compute/runtime/NEON/functions/NEThreshold.h +++ b/arm_compute/runtime/NEON/functions/NEThreshold.h @@ -34,8 +34,13 @@ namespace arm_compute { // Forward declarations class ITensor; +class ITensorInfo; -/** Basic function to run @ref NEThresholdKernel */ +/** Basic function to run @ref NEThresholdKernel + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * +*/ class NEThreshold : public INESimpleFunctionNoBorder { public: diff --git a/arm_compute/runtime/NEON/functions/NETile.h b/arm_compute/runtime/NEON/functions/NETile.h index 53a94db583..d5ce76c9cf 100644 --- a/arm_compute/runtime/NEON/functions/NETile.h +++ b/arm_compute/runtime/NEON/functions/NETile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -31,6 +31,7 @@ namespace arm_compute { class ITensor; +class ITensorInfo; /** Basic function to run @ref NETileKernel */ class NETile : public INESimpleFunctionNoBorder diff --git a/arm_compute/runtime/NEON/functions/NETranspose.h b/arm_compute/runtime/NEON/functions/NETranspose.h index 1169459f0f..2651bdd727 100644 --- a/arm_compute/runtime/NEON/functions/NETranspose.h +++ b/arm_compute/runtime/NEON/functions/NETranspose.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,6 +30,7 @@ namespace arm_compute { class ITensor; +class ITensorInfo; /** Basic function to transpose a matrix on NEON. This function calls the following NEON kernel: * diff --git a/arm_compute/runtime/NEON/functions/NEUnstack.h b/arm_compute/runtime/NEON/functions/NEUnstack.h index 2e3a679664..c8e85115f7 100644 --- a/arm_compute/runtime/NEON/functions/NEUnstack.h +++ b/arm_compute/runtime/NEON/functions/NEUnstack.h @@ -45,6 +45,16 @@ class NEUnstack : public IFunction public: /** Default constructor */ NEUnstack(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEUnstack(const NEUnstack &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEUnstack &operator=(const NEUnstack &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEUnstack(NEUnstack &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEUnstack &operator=(NEUnstack &&) = delete; + /** Default destructor */ + ~NEUnstack() = default; /** Set the input, output and unstacking axis. * * @param[in] input A tensor to be unstacked. Data type supported: All. 
diff --git a/arm_compute/runtime/NEON/functions/NEUpsampleLayer.h b/arm_compute/runtime/NEON/functions/NEUpsampleLayer.h index f9145f1612..168845d203 100644 --- a/arm_compute/runtime/NEON/functions/NEUpsampleLayer.h +++ b/arm_compute/runtime/NEON/functions/NEUpsampleLayer.h @@ -24,15 +24,17 @@ #ifndef ARM_COMPUTE_NEUPSAMPLELAYER_H #define ARM_COMPUTE_NEUPSAMPLELAYER_H -#include "arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/Tensor.h" +#include <memory> + namespace arm_compute { class ITensor; +class NEUpsampleLayerKernel; /** Function to run upsample layer */ class NEUpsampleLayer : public IFunction @@ -40,6 +42,16 @@ class NEUpsampleLayer : public IFunction public: /** Constructor */ NEUpsampleLayer(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEUpsampleLayer(const NEUpsampleLayer &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEUpsampleLayer &operator=(const NEUpsampleLayer &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEUpsampleLayer(NEUpsampleLayer &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEUpsampleLayer &operator=(NEUpsampleLayer &&) = delete; + /** Default destructor */ + ~NEUpsampleLayer(); /** Set the input output tensors. * * @param[in] input Source tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. @@ -66,8 +78,8 @@ class NEUpsampleLayer : public IFunction void run() override; private: - NEUpsampleLayerKernel _kernel; - DataLayout _data_layout; + std::unique_ptr<NEUpsampleLayerKernel> _kernel; + DataLayout _data_layout; }; } // arm_compute #endif /* ARM_COMPUTE_NEUPSAMPLELAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEWarpAffine.h b/arm_compute/runtime/NEON/functions/NEWarpAffine.h index eb7492b71f..0aedb87aa2 100644 --- a/arm_compute/runtime/NEON/functions/NEWarpAffine.h +++ b/arm_compute/runtime/NEON/functions/NEWarpAffine.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -33,7 +33,11 @@ namespace arm_compute { class ITensor; -/** Basic function to run @ref NEWarpAffineKernel */ +/** Basic function to run @ref NEWarpAffineKernel + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * +*/ class NEWarpAffine : public INESimpleFunction { public: diff --git a/arm_compute/runtime/NEON/functions/NEWarpPerspective.h b/arm_compute/runtime/NEON/functions/NEWarpPerspective.h index c439e82db5..31a1477dca 100644 --- a/arm_compute/runtime/NEON/functions/NEWarpPerspective.h +++ b/arm_compute/runtime/NEON/functions/NEWarpPerspective.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -33,7 +33,11 @@ namespace arm_compute { class ITensor; -/** Basic function to run @ref NEWarpPerspectiveKernel */ +/** Basic function to run @ref NEWarpPerspectiveKernel + * + * @deprecated This function is deprecated and is intended to be removed in 21.05 release + * +*/ class NEWarpPerspective : public INESimpleFunction { public: diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h index 4090c8c409..6b61e7031b 100644 --- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h @@ -26,7 +26,6 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CPP/functions/CPPPermute.h" #include "arm_compute/runtime/MemoryGroup.h" @@ -41,6 +40,7 @@ namespace arm_compute { // Forward declarations class ITensor; +class ICPPKernel; /** Basic function to simulate a convolution layer. This function calls the following NEON kernels: * -# @ref NEWinogradLayerTransformWeightsKernel (executed only once in the first call to the run() method ) @@ -56,6 +56,12 @@ class NEWinogradConvolutionLayer : public IFunction public: /** Constructor */ NEWinogradConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr); + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEWinogradConvolutionLayer(NEWinogradConvolutionLayer &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEWinogradConvolutionLayer &operator=(NEWinogradConvolutionLayer &&) = delete; + /** Default destructor */ + ~NEWinogradConvolutionLayer() = default; /** Set the input and output tensors. * @@ -105,12 +111,12 @@ class NEWinogradConvolutionLayer : public IFunction NEWinogradConvolutionLayer &operator=(const NEWinogradConvolutionLayer &) = delete; private: - MemoryGroup _memory_group; - NEGEMM _gemm_function; - std::unique_ptr<INEKernel> _transform_input_kernel; - std::unique_ptr<INEKernel> _transform_output_kernel; - std::unique_ptr<INEKernel> _transform_weights_kernel; - NEActivationLayer _activationlayer_function; + MemoryGroup _memory_group; + NEGEMM _gemm_function; + std::unique_ptr<ICPPKernel> _transform_input_kernel; + std::unique_ptr<ICPPKernel> _transform_output_kernel; + std::unique_ptr<ICPPKernel> _transform_weights_kernel; + NEActivationLayer _activationlayer_function; CPPPermute _permute_input; CPPPermute _permute_weights; diff --git a/arm_compute/runtime/NEON/functions/NEYOLOLayer.h b/arm_compute/runtime/NEON/functions/NEYOLOLayer.h index 88219602c1..4c9a5bf6e4 100644 --- a/arm_compute/runtime/NEON/functions/NEYOLOLayer.h +++ b/arm_compute/runtime/NEON/functions/NEYOLOLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -26,12 +26,12 @@ #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" -#include "arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h" #include "arm_compute/core/Types.h" namespace arm_compute { class ITensor; +class ITensorInfo; /** Basic function to run @ref NEYOLOLayerKernel */ class NEYOLOLayer : public INESimpleFunctionNoBorder diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox index bb1dfec69e..8eb0762f9f 100644 --- a/docs/00_introduction.dox +++ b/docs/00_introduction.dox @@ -69,158 +69,7 @@ This archive contains: - A @ref utils folder containing headers with some boiler plate code used by the examples. - This documentation. -You should have the following file organisation: - - . - ├── arm_compute --> All the arm_compute headers - │ ├── graph.h --> Includes all the Graph headers at once. - │   ├── core - │   │   ├── CL - │   │   │   ├── CLKernelLibrary.h --> Manages all the OpenCL kernels compilation and caching, provides accessors for the OpenCL Context. - │   │   │   ├── CLKernels.h --> Includes all the OpenCL kernels at once - │   │   │   ├── CL specialisation of all the generic interfaces (ICLTensor, ICLArray, etc.) - │   │   │   ├── gemm --> Folder containing all the configuration files for GEMM - │   │   │   ├── kernels --> Folder containing all the OpenCL kernels - │   │   │   │   └── CL*Kernel.h - │   │   │   └── OpenCL.h --> Wrapper to configure the Khronos OpenCL C++ header - │   │ ├── CPP - │   │   │   ├── CPPKernels.h --> Includes all the CPP kernels at once - │   │ │   └── kernels --> Folder containing all the CPP kernels - │   │   │      └── CPP*Kernel.h - │   │   ├── GLES_COMPUTE - │   │   │   ├── GCKernelLibrary.h --> Manages all the GLES kernels compilation and caching, provides accessors for the GLES Context. - │   │   │   ├── GCKernels.h --> Includes all the GLES kernels at once - │   │   │   ├── GLES specialisation of all the generic interfaces (IGCTensor etc.) - │   │   │   ├── kernels --> Folder containing all the GLES kernels - │   │   │   │   └── GC*Kernel.h - │   │   │   └── OpenGLES.h --> Wrapper to configure the Khronos EGL and OpenGL ES C header - │   │   ├── NEON - │   │   │   ├── kernels --> Folder containing all the NEON kernels - │   │   │   │ ├── assembly --> headers for assembly optimised NEON kernels. - │   │   │   │ ├── convolution --> headers for convolution assembly optimised NEON kernels. - │   │   │   │   │   ├── common --> headers for code which is common to several convolution implementations. - │   │   │   │   │   ├── depthwise --> headers for Depthwise convolution assembly implementation - │   │   │   │   │   └── winograd --> headers for Winograd convolution assembly implementation - │   │   │   │ ├── detail --> Common code for several intrinsics implementations. - │   │   │   │   └── NE*Kernel.h - │   │   │   ├── wrapper --> NEON wrapper used to simplify code - │   │   │   │ ├── intrinsics --> NEON intrinsics wrappers - │   │   │   │ ├── scalar --> Scalar operations - │   │   │   │ ├── traits.h --> Traits defined on NEON vectors - │   │   │   │   └── wrapper.h --> Includes all wrapper headers at once - │   │   │   └── NEKernels.h --> Includes all the NEON kernels at once - │   │   ├── All common basic types (Types.h, Window, Coordinates, Iterator, etc.) - │   │   ├── All generic interfaces (ITensor, IArray, etc.) 
- │   │   └── Objects metadata classes (TensorInfo, MultiImageInfo) - │   ├── graph - │   │   ├── algorithms --> Generic algorithms used by the graph backend (e.g Order of traversal) - │   │   ├── backends --> The backend specific code - │   │   │   ├── CL --> OpenCL specific operations - │   │   │   ├── GLES --> OpenGLES Compute Shaders specific operations - │   │   │   └── NEON --> NEON specific operations - │   │   ├── detail --> Collection of internal utilities. - │   │   ├── frontend --> Code related to the stream frontend interface. - │   │   ├── mutators --> Used to modify / optimise the Graph intermediate representation(Operator fusion, in place operations, etc.) - │   │   ├── nodes --> The various nodes supported by the graph API - │   │   ├── printers --> Debug printers - │   │   └── Graph objects interfaces (INode, ITensorAccessor, Graph, etc.) - │   └── runtime - │   ├── common - │ │ └── Common utility code used by all backends - │   ├── CL - │   │   ├── CL objects & allocators (CLArray, CLTensor, etc.) - │   │   ├── functions --> Folder containing all the OpenCL functions - │   │   │   └── CL*.h - │   │   ├── CLScheduler.h --> Interface to enqueue OpenCL kernels and get/set the OpenCL CommandQueue and ICLTuner. - │   │   ├── CLFunctions.h --> Includes all the OpenCL functions at once - │   │   ├── ICLTuner.h --> Interface used to tune the local work-group size of OpenCL kernels - │   │   └── tuners - │   │      └── Local workgroup size tuners for specific architectures / GPUs - │   ├── CPP - │      │   ├── CPPKernels.h --> Includes all the CPP functions at once. - │   │   ├── CPPScheduler.h --> Basic pool of threads to execute CPP/NEON code on several cores in parallel - │   │   └── functions --> Folder containing all the CPP functions - │   │      └── CPP*.h - │   ├── GLES_COMPUTE - │   │   ├── GLES objects & allocators (GCArray, GCTensor, etc.) - │   │   ├── functions --> Folder containing all the GLES functions - │   │   │   └── GC*.h - │   │   ├── GCScheduler.h --> Interface to enqueue GLES kernels and get/set the GLES CommandQueue. - │   │   └── GCFunctions.h --> Includes all the GLES functions at once - │   ├── NEON - │   │ ├── functions --> Folder containing all the NEON functions - │   │ │   └── NE*.h - │   │ └── NEFunctions.h --> Includes all the NEON functions at once - │   ├── OMP - │   │   └── OMPScheduler.h --> OpenMP scheduler (Alternative to the CPPScheduler) - │ ├── Memory & weights manager files (LifetimeManager, PoolManager, etc.) - │   └── Basic implementations of the generic object interfaces (Array, Tensor, etc.) - ├── data --> Contains test images and reference data dumps used by validation tests - ├── docs --> Contains Doxyfile and Doxygen sources used to generate the HTML pages. 
- ├── examples - │   ├── gemm_tuner - │   │ └── OpenCL GEMM tuner utility - │   ├── cl_*.cpp --> OpenCL examples - │   ├── gc_*.cpp --> GLES compute shaders examples - │   ├── graph_*.cpp --> Graph examples - │   ├── neoncl_*.cpp --> NEON / OpenCL interoperability examples - │   └── neon_*.cpp --> NEON examples - ├── include - │   ├── CL - │   │ └── Khronos OpenCL C headers and C++ wrapper - │   ├── half --> FP16 library available from http://half.sourceforge.net - │   ├── libnpy --> Library to load / write npy buffers, available from https://github.com/llohse/libnpy - │  ├── linux --> Headers only needed for Linux builds - │   │ └── Khronos EGL and OpenGLES headers - │ └── stb - │ └── stb_image.h --> Single header library to load image files, available from https://github.com/nothings/stb - ├── scripts - │   ├── caffe_data_extractor.py --> Basic script to export weights from Caffe to npy files - │   └── tensorflow_data_extractor.py --> Basic script to export weights from Tensor Flow to npy files - ├── src - │   ├── core - │ │ └── ... (Same structure as headers) - │   │ ├── CL - │   │ │ └── cl_kernels --> All the OpenCL kernels - │   │ └── GLES_COMPUTE - │   │ └── cs_shaders --> All the OpenGL ES Compute Shaders - │   ├── graph - │ │ └── ... (Same structure as headers) - │ └── runtime - │ └── ... (Same structure as headers) - ├── support - │ └── Various headers to work around toolchains / platform issues. - ├── tests - │   ├── All test related files shared between validation and benchmark - │   ├── benchmark --> Sources for benchmarking - │ │ ├── Benchmark specific files - │   │ ├── fixtures - │ │ │ └── Backend agnostic fixtures to initialise and run the functions to test. - │ │ ├── CL --> OpenCL benchmarking tests - │ │ ├── GLES_COMPUTE --> GLES benchmarking tests - │ │ └── NEON --> NEON benchmarking tests - │ ├── benchmark_examples --> Sources needed to wrap examples to run through our benchmarking framework. - │   ├── CL --> OpenCL accessors - │   ├── GLES_COMPUTE --> GLES accessors - │   ├── NEON --> NEON accessors - │   ├── datasets - │ │ └── Datasets for all the validation / benchmark tests, layer configurations for various networks, etc. - │   ├── framework - │ │ └── Boiler plate code for both validation and benchmark test suites (Command line parsers, instruments, output loggers, etc.) - │   ├── instruments --> User defined instruments that can be registered to the framework. - │ ├── validate_examples --> Sources needed to wrap examples to run through our validation framework. - │   └── validation --> Sources for validation - │ ├── Validation specific files - │   ├── fixtures - │ │ └── Backend agnostic fixtures to initialise and run the functions to test. - │   ├── reference - │ │ └── Reference implementation used to validate the results of the various backends. - │ ├── CL --> OpenCL validation tests - │ ├── GLES_COMPUTE --> GLES validation tests - │ ├── CPP --> C++ reference implementations - │ └── NEON --> NEON validation tests - └── utils --> Boiler plate code used by examples - └── Various utilities to print types, load / store assets, etc. + For detailed information about file organization, please refer to Files -> File List section of this documentation. @section S2_versions_changelog Release versions and changelog @@ -237,6 +86,308 @@ If there is more than one release in a month then an extra sequential number is @subsection S2_2_changelog Changelog +v20.11 Public major release + - Various bug fixes. + - Various optimisations. 
+ - Performance regressions may be observed when executing Depthwise Convolution on NEON with a depth multiplier > 1 for quantized data types. + This is planned to be resolved in the 21.02 release. + - Added new data type QASYMM8_SIGNED support for @ref NEROIAlignLayer. + - Added new data type S32 support for: + - @ref NEArithmeticSubtraction + - @ref NEArithmeticSubtractionKernel + - @ref NEPixelWiseMultiplication + - @ref NEPixelWiseMultiplicationKernel + - @ref NEElementwiseDivision + - @ref NEDivisionOperationKernel + - Interface change + - Properly support softmax axis to have the same meaning as other major frameworks. That is, axis now defines the dimension + on which Softmax/Logsoftmax is performed. E.g. for input of shape 4x5x6 and axis=1, softmax will be applied to 4x6=24 vectors of size 5. + The supported value range of axis is [-rank, rank). + This change applies to the following functions: + - @ref NESoftmaxLayer + - @ref NELogSoftmaxLayer + - @ref CLSoftmaxLayer + - @ref CLLogSoftmaxLayer + - @ref GCSoftmaxLayer + - New OpenCL kernels / functions: + - @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel + - @ref CLLogicalNot + - @ref CLLogicalAnd + - @ref CLLogicalOr + - New NEON kernels / functions: + - @ref NELogicalNot + - @ref NELogicalAnd + - @ref NELogicalOr + - Removed padding from NEON kernels: + - @ref NEComplexPixelWiseMultiplicationKernel + - @ref NENonMaximaSuppression3x3Kernel + - @ref NERemapKernel + - @ref NEGEMMInterleave4x4Kernel + - @ref NEDirectConvolutionLayerKernel + - @ref NEScaleKernel + - @ref NELocallyConnectedMatrixMultiplyKernel + - @ref NEGEMMLowpOffsetContributionKernel + - @ref NEGEMMTranspose1xWKernel + - @ref NEPoolingLayerKernel + - @ref NEConvolutionKernel + - @ref NEDepthwiseConvolutionLayerNativeKernel + - @ref NEGEMMLowpMatrixMultiplyKernel + - @ref NEGEMMMatrixMultiplyKernel + - @ref NEDirectConvolutionLayerOutputStageKernel + - @ref NEReductionOperationKernel + - @ref NEGEMMLowpMatrixAReductionKernel + - @ref NEGEMMLowpMatrixBReductionKernel + - Removed padding from OpenCL kernels: + - @ref CLBatchConcatenateLayerKernel + - @ref CLElementwiseOperationKernel + - @ref CLBatchNormalizationLayerKernel + - @ref CLPoolingLayerKernel + - @ref CLWinogradInputTransformKernel + - @ref CLGEMMLowpMatrixMultiplyNativeKernel + - @ref CLGEMMLowpMatrixAReductionKernel + - @ref CLGEMMLowpMatrixBReductionKernel + - @ref CLGEMMLowpOffsetContributionOutputStageKernel + - @ref CLGEMMLowpOffsetContributionKernel + - @ref CLWinogradOutputTransformKernel + - @ref CLGEMMLowpMatrixMultiplyReshapedKernel + - @ref CLFuseBatchNormalizationKernel + - @ref CLDepthwiseConvolutionLayerNativeKernel + - @ref CLDepthConvertLayerKernel + - @ref CLCopyKernel + - @ref CLDepthwiseConvolutionLayer3x3NHWCKernel + - @ref CLActivationLayerKernel + - @ref CLWinogradFilterTransformKernel + - @ref CLWidthConcatenateLayerKernel + - @ref CLWidthConcatenate4TensorsKernel + - @ref CLWidthConcatenate2TensorsKernel + - @ref CLLogits1DMaxShiftExpSumKernel + - @ref CLLogits1DNormKernel + - @ref CLHeightConcatenateLayerKernel + - @ref CLGEMMMatrixMultiplyKernel + - @ref CLGEMMLowpQuantizeDownInt32ScaleKernel + - @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel + - @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel + - @ref CLDepthConcatenateLayerKernel + - @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel + - Removed OpenCL kernels / functions: + - CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel + - CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel + - 
CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel + - Deprecated OpenCL kernels / functions (If a kernel is used only by the function that is being deprecated, the kernel is deprecated together): + - CLLocallyConnectedLayer + - CLLocallyConnectedMatrixMultiplyKernel + - CLAbsoluteDifference + - CLAbsoluteDifferenceKernel + - CLAccumulate + - CLAccumulateKernel + - CLAccumulateSquared + - CLAccumulateSquaredKernel + - CLAccumulateWeighted + - CLAccumulateWeightedKernel + - CLAccumulateWeightedFP16Kernel + - CLBox3x3 + - CLBox3x3Kernel + - CLBox3x3FP16Kernel + - CLCannyEdge + - CLChannelCombine + - CLChannelCombineKernel + - CLChannelExtract + - CLChannelExtractKernel + - CLColorConvert + - CLColorConvertKernel + - CLConvolution3x3 + - CLConvolutionRectangle + - CLConvolutionRectangleKernel + - CLConvolutionSquare + - CLConvolutionKernel + - CLDerivative + - CLDerivativeKernel + - CLDilate + - CLDilateKernel + - CLEqualizeHistogram + - CLErode + - CLErodeKernel + - CLFastCorners + - CLFastCornersKernel + - CLGaussian3x3 + - CLGaussian3x3Kernel + - CLGaussian5x5 + - CLGaussian5x5HorKernel + - CLGaussian5x5VertKernel + - CLGaussianPyramid + - CLGaussianPyramidHalf + - CLGaussianPyramidOrb + - CLHarrisCorners + - CLHarrisScoreKernel + - CLHarrisScoreFP16Kernel + - CLHistogram + - CLHistogramKernel + - CLHOGOrientationBinningKernel + - CLHOGBlockNormalizationKernel + - CLHOGDetectorKernel + - CLHOGNonMaximaSuppressionKernel + - CLHOGDescriptor + - CLHOGDetector + - CLHOGGradient + - CLHOGMultiDetection + - CLIntegralImage + - CLIntegralImageKernel + - CLLaplacianReconstruct + - CLLaplacianPyramid + - CLMagnitude + - CLMagnitudePhaseKernel + - CLMedian3x3 + - CLMedian3x3Kernel + - CLMinMaxLocation + - CLMinMaxLocationKernel + - CLNonLinearFilter + - CLNonLinearFilterKernel + - CLNonMaximaSuppression3x3 + - CLNonMaximaSuppression3x3FP16Kernel + - CLNonMaximaSuppression3x3Kernel + - CLOpticalFlow + - CLPhase + - CLRemap + - CLRemapKernel + - CLScharr3x3 + - CLScharr3x3Kernel + - CLSobel3x3 + - CLSobel3x3Kernel + - CLSobel5x5 + - CLSobel5x5HorKernel + - CLSobel5x5VertKernel + - CLSobel7x7 + - CLSobel7x7HorKernel + - CLSobel7x7VertKernel + - CLThreshold + - CLThresholdKernel + - CLWarpAffine + - CLWarpAffineKernel + - CLWarpPerspective + - CLWarpPerspectiveKernel + - Deprecated NEON kernels / functions (If a kernel is used only by the function that is being deprecated, the kernel is deprecated together): + - NELocallyConnectedLayer + - NELocallyConnectedMatrixMultiplyKernel + - NEAbsoluteDifference + - NEAbsoluteDifferenceKernel + - NEAccumulate + - NEAccumulateKernel + - NEAccumulateSquared + - NEAccumulateSquaredKernel + - NEAccumulateWeighted + - NEAccumulateWeightedKernel + - NEAccumulateWeightedFP16Kernel + - NEBox3x3 + - NEBox3x3Kernel + - NEBox3x3FP16Kernel + - NECannyEdge + - NEChannelCombine + - NEChannelCombineKernel + - NEChannelExtract + - NEChannelExtractKernel + - NEColorConvert + - NEColorConvertKernel + - NEConvolution3x3 + - NEConvolutionRectangle + - NEConvolutionRectangleKernel + - NEConvolutionSquare + - NEConvolutionKernel + - NEDerivative + - NEDerivativeKernel + - NEDilate + - NEDilateKernel + - NEEqualizeHistogram + - NEErode + - NEErodeKernel + - NEFastCorners + - NEFastCornersKernel + - NEGaussian3x3 + - NEGaussian3x3Kernel + - NEGaussian5x5 + - NEGaussian5x5HorKernel + - NEGaussian5x5VertKernel + - NEGaussianPyramid + - NEGaussianPyramidHalf + - NEGaussianPyramidOrb + - 
NEHarrisCorners + - NEHarrisScoreKernel + - NEHarrisScoreFP16Kernel + - NEHistogram + - NEHistogramKernel + - NEHOGOrientationBinningKernel + - NEHOGBlockNormalizationKernel + - NEHOGDetectorKernel + - NEHOGNonMaximaSuppressionKernel + - NEHOGDescriptor + - NEHOGDetector + - NEHOGGradient + - NEHOGMultiDetection + - NEIntegralImage + - NEIntegralImageKernel + - NELaplacianReconstruct + - NELaplacianPyramid + - NEMagnitude + - NEMagnitudePhaseKernel + - NEMedian3x3 + - NEMedian3x3Kernel + - NEMinMaxLocation + - NEMinMaxLocationKernel + - NENonLinearFilter + - NENonLinearFilterKernel + - NENonMaximaSuppression3x3 + - NENonMaximaSuppression3x3FP16Kernel + - NENonMaximaSuppression3x3Kernel + - NEOpticalFlow + - NEPhase + - NERemap + - NERemapKernel + - NEScharr3x3 + - NEScharr3x3Kernel + - NESobel3x3 + - NESobel3x3Kernel + - NESobel5x5 + - NESobel5x5HorKernel + - NESobel5x5VertKernel + - NESobel7x7 + - NESobel7x7HorKernel + - NESobel7x7VertKernel + - NEThreshold + - NEThresholdKernel + - NEWarpAffine + - NEWarpAffineKernel + - NEWarpPerspective + - NEWarpPerspectiveKernel + - Deprecated GLES kernels / functions (If a kernel is used only by the function that is being deprecated, the kernel is deprecated together): + - GCAbsoluteDifference + - GCActivationLayer + - GCArithmeticAddition + - GCBatchNormalizationLayer + - GCConcatenateLayer + - GCConvolutionLayer + - GCDepthwiseConvolutionLayer + - GCDirectConvolutionLayer + - GCDropoutLayer + - GCFillBorder + - GCFullyConnectedLayer + - GCGEMM + - GCGEMMInterleave4x4 + - GCGEMMTranspose1xW + - GCNormalizationLayer + - GCNormalizePlanarYUVLayer + - GCPixelWiseMultiplication + - GCPoolingLayer + - GCScale + - GCSoftmaxLayer + - GCTensorShift + - GCTranspose + + v20.08 Public major release - Various bug fixes. - Various optimisations.
@@ -284,7 +435,7 @@ v20.08 Public major release - @ref NEDepthConvertLayerKernel - @ref NERangeKernel - @ref NEPriorBoxLayer - - Removedd OpenCL kernels / functions: + - Removed OpenCL kernels / functions: - CLGEMMLowpQuantizeDownInt32ToUint8Scale - CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat - Removed NEON kernels / functions: @@ -394,7 +545,7 @@ v20.02 Public major release - @ref NEComparisonOperationKernel - @ref NEConvolutionLayer - @ref NEDepthwiseConvolutionLayer - - @ref NEDepthwiseConvolutionLayer3x3Kernel + - NEDepthwiseConvolutionLayer3x3Kernel - @ref NEDirectConvolutionLayerOutputStageKernel - @ref NEElementwiseComparison - @ref NEElementwiseMax @@ -406,13 +557,13 @@ v20.02 Public major release - @ref NEPoolingLayer - @ref NEPReluLayer - Added support for QSYMM8_PER_CHANNEL in: - - @ref NEDepthwiseConvolutionLayer3x3Kernel + - NEDepthwiseConvolutionLayer3x3Kernel - Added support for split sizes in: - @ref CLSplit - @ref NESplit - New OpenCL kernels / functions: - @ref CLFill - - @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel / @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint + - CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel / @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint - New NEON kernels / functions: - @ref NEFill - @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel / @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint @@ -546,7 +697,7 @@ v19.08 Public major release - @ref CLBatchConcatenateLayerKernel - @ref CLDepthToSpaceLayerKernel / @ref CLDepthToSpaceLayer - @ref CLGEMMLowpMatrixMultiplyNativeKernel - - @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel + - CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel - @ref CLGEMMMatrixMultiplyNativeKernel - @ref CLMeanStdDevNormalizationKernel / @ref CLMeanStdDevNormalizationLayer - @ref CLSpaceToDepthLayerKernel / @ref CLSpaceToDepthLayer @@ -774,11 +925,11 @@ v18.11 Public major release - @ref CLL2NormalizeLayer - Added QASYMM8 support to the following kernels: - @ref CLScaleKernel - - @ref NEDepthwiseConvolutionLayer3x3Kernel + - NEDepthwiseConvolutionLayer3x3Kernel - @ref CLPixelWiseMultiplicationKernel - Added FP16 support to the following kernels: - @ref CLDepthwiseConvolutionLayer3x3NHWCKernel - - @ref NEDepthwiseConvolutionLayer3x3Kernel + - NEDepthwiseConvolutionLayer3x3Kernel - @ref CLNormalizePlanarYUVLayerKernel - @ref CLWinogradConvolutionLayer (5x5 kernel) - More tests added to both validation and benchmarking suites. 
@@ -929,7 +1080,7 @@ v18.01 Public maintenance release - Refactored NEON Winograd (NEWinogradLayerKernel) - Added @ref NEDirectConvolutionLayerOutputStageKernel - Added QASYMM8 support to the following NEON kernels: - - @ref NEDepthwiseConvolutionLayer3x3Kernel + - NEDepthwiseConvolutionLayer3x3Kernel - @ref NEFillBorderKernel - @ref NEPoolingLayerKernel - Added new examples: @@ -973,14 +1124,14 @@ v17.12 Public major release - New NEON kernels / functions - arm_compute::NEGEMMLowpAArch64A53Kernel / arm_compute::NEGEMMLowpAArch64Kernel / arm_compute::NEGEMMLowpAArch64V8P4Kernel / arm_compute::NEGEMMInterleavedBlockedKernel / arm_compute::NEGEMMLowpAssemblyMatrixMultiplyCore - arm_compute::NEHGEMMAArch64FP16Kernel - - @ref NEDepthwiseConvolutionLayer3x3Kernel / NEDepthwiseIm2ColKernel / NEGEMMMatrixVectorMultiplyKernel / NEDepthwiseVectorToTensorKernel / @ref NEDepthwiseConvolutionLayer + - NEDepthwiseConvolutionLayer3x3Kernel / NEDepthwiseIm2ColKernel / NEGEMMMatrixVectorMultiplyKernel / NEDepthwiseVectorToTensorKernel / @ref NEDepthwiseConvolutionLayer - @ref NEGEMMLowpOffsetContributionKernel / @ref NEGEMMLowpMatrixAReductionKernel / @ref NEGEMMLowpMatrixBReductionKernel / @ref NEGEMMLowpMatrixMultiplyCore - @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel / @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint - NEWinogradLayer / NEWinogradLayerKernel - New OpenCL kernels / functions - @ref CLGEMMLowpOffsetContributionKernel / @ref CLGEMMLowpMatrixAReductionKernel / @ref CLGEMMLowpMatrixBReductionKernel / @ref CLGEMMLowpMatrixMultiplyCore - - @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel / @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint + - CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel / @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint - New graph nodes for NEON and OpenCL - graph::BranchLayer @@ -1328,7 +1479,7 @@ or simply remove the build parameter as build=cross_compile is the default value The examples get automatically built by scons as part of the build process of the library described above. This section just describes how you can build and link your own application against our library. -@note The following command lines assume the arm_compute binaries are present in the current directory or in the system library path. If this is not the case you can specify the location of the pre-built library with the compiler option -L. When building the OpenCL example the commands below assume that the CL headers are located in the include folder where the command is executed. +@note The following command lines assume the arm_compute libraries are present in the current directory or in the system library path. If this is not the case you can specify the location of the pre-built libraries with the compiler option -L. When building the OpenCL example the commands below assume that the CL headers are located in the include folder where the command is executed. To cross compile a NEON example for Linux 32bit: @@ -1433,9 +1584,9 @@ For Android, the library was successfully built and tested using Google's standa Here is a guide to create your Android standalone toolchains from the NDK -- Download the NDK r18b from here: https://developer.android.com/ndk/downloads/index.html +- Download the NDK r18b from https://developer.android.com/ndk/downloads/index.html and extract it into directory $NDK - Make sure you have Python 2.7 installed on your machine.
-- Generate the 32 and/or 64 toolchains by running the following commands: +- Generate the 32 and/or 64 toolchains by running the following commands in your toolchain directory $MY_TOOLCHAINS: $NDK/build/tools/make_standalone_toolchain.py --arch arm64 --install-dir $MY_TOOLCHAINS/aarch64-linux-android-ndk-r18b --stl libc++ --api 21 @@ -1465,7 +1616,7 @@ To cross-compile the library in asserts mode, with GLES_COMPUTE only support, fo The examples get automatically built by scons as part of the build process of the library described above. This section just describes how you can build and link your own application against our library. -@note The following command lines assume the arm_compute binaries are present in the current directory or in the system library path. If this is not the case you can specify the location of the pre-built library with the compiler option -L. When building the OpenCL example the commands below assume that the CL headers are located in the include folder where the command is executed. +@note The following command lines assume the arm_compute libraries are present in the current directory or in the system library path. If this is not the case you can specify the location of the pre-built libraries with the compiler option -L. When building the OpenCL example the commands below assume that the CL headers are located in the include folder where the command is executed. Once you've got your Android standalone toolchain built and added to your path you can do the following: @@ -1649,7 +1800,7 @@ CLTuner looks for the optimal LWS for each unique OpenCL kernel configuration. S @subsubsection S3_7_1_cl_tuner_how_to How to use it -All the graph examples in the ACL's folder "examples" and the arm_compute_benchmark accept an argument to enable the OpenCL tuner and an argument to export/import the LWS values to/from a file +All the graph examples in the Compute Library's folder "examples" and the arm_compute_benchmark accept an argument to enable the OpenCL tuner and an argument to export/import the LWS values to/from a file #Enable CL tuner ./graph_mobilenet --enable-tuner --target=CL diff --git a/docs/01_library.dox b/docs/01_library.dox index ea29b75cd3..742a246582 100644 --- a/docs/01_library.dox +++ b/docs/01_library.dox @@ -43,7 +43,34 @@ The Runtime library is a very basic wrapper around the Core library which can be For maximum performance, it is expected that the users would re-implement an equivalent to the runtime library which suits better their needs (With a more clever multi-threading strategy, load-balancing between NEON and OpenCL, etc.) -@section S4_1_2 Thread-safety +@section S4_1_2 Data-type and Data-layout support + +Compute Library supports a wide list of data-types; this information can be found directly in the documentation of each kernel/function.
+The main data-types that the Machine Learning functions support are the following: +- BFLOAT16: 16-bit non-standard brain floating point +- F16: 16-bit half precision floating point +- F32: 32-bit single precision floating point +- QASYMM8: 8-bit unsigned asymmetric quantized +- QASYMM8_SIGNED: 8-bit signed asymmetric quantized +- QSYMM8_PER_CHANNEL: 8-bit signed symmetric quantized (used for the weights) + +Moreover, Compute Library supports the following data layouts (the rightmost dimension is the fastest changing): +- NHWC: The native layout of Compute Library that delivers the best performance; channels are in the fastest changing dimension +- NCHW: Legacy layout where width is in the fastest changing dimension +where N = batches, C = channels, H = height, W = width + +@section S4_1_3 Fast-math support + +Compute Library supports different types of convolution methods; the fast-math flag is only used for the Winograd algorithm. +When the fast-math flag is enabled, both NEON and CL convolution layers will try to dispatch the fastest implementation available, which may introduce a drop in accuracy. The different scenarios involving the fast-math flag are presented below (see also the usage sketch below): +- For FP32: + - no-fast-math: Only supports Winograd 3x3,3x1,1x3,5x1,1x5,7x1,1x7 + - fast-math: Supports Winograd 3x3,3x1,1x3,5x1,1x5,7x1,1x7,5x5,7x7 +- For FP16: + - no-fast-math: No Winograd support + - fast-math: Supports Winograd 3x3,3x1,1x3,5x1,1x5,7x1,1x7,5x5,7x7 + +@section S4_1_4 Thread-safety Although the library supports multi-threading during workload dispatch, thus parallelizing the execution of the workload across multiple threads, the current runtime module implementation is not thread-safe in the sense of executing different functions from separate threads. This is due to the fact that the provided scheduling mechanism wasn't designed with thread-safety in mind. diff --git a/docs/02_tests.dox b/docs/02_tests.dox index a813844403..c46e1f5663 100644 --- a/docs/02_tests.dox +++ b/docs/02_tests.dox @@ -45,28 +45,6 @@ information is needed within the test (e.g. to validate the results). @note Tests are not included in the pre-built binary archive, you have to build them from sources. -@subsection tests_overview_structure Directory structure - - . - `-- tests <- Top level test directory. All files in here are shared among validation and benchmark. - |-- framework <- Underlying test framework. - |-- CL \ - |-- GLES_COMPUTE \ - |-- NEON -> Backend specific files with helper functions etc. - |-- benchmark <- Top level directory for the benchmarking files. - | |-- fixtures <- Fixtures for benchmark tests. - | |-- CL <- OpenCL backend test cases on a function level. - | |-- GLES_COMPUTE <- Same of OpenGL ES - | `-- NEON <- Same for NEON - |-- datasets <- Datasets for benchmark and validation tests. - |-- main.cpp <- Main entry point for the tests. Currently shared between validation and benchmarking. - `-- validation -> Top level directory for validation files. - |-- CPP -> C++ reference code - |-- CL \ - |-- GLES_COMPUTE \ - |-- NEON -> Backend specific test cases - `-- fixtures -> Fixtures shared among all backends. Used to setup target function and tensors.
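Referring back to the fast-math support described in 01_library.dox above, here is a minimal, hedged usage sketch of opting into fast-math on a NEON convolution. NEConvolutionLayer and its enable_fast_math flag are part of the public API; the tensor shapes, strides and the surrounding setup are illustrative assumptions only, and includes are omitted:

@code{.cpp}
// Minimal sketch, assuming the NEConvolutionLayer::configure() overload that
// takes an enable_fast_math flag. Shapes are invented for illustration.
Tensor src{}, weights{}, dst{};
src.allocator()->init(TensorInfo(TensorShape(56U, 56U, 64U), 1, DataType::F32));
weights.allocator()->init(TensorInfo(TensorShape(5U, 5U, 64U, 64U), 1, DataType::F32));
dst.allocator()->init(TensorInfo(TensorShape(56U, 56U, 64U), 1, DataType::F32));

NEConvolutionLayer conv{};
// With enable_fast_math == true, a Winograd implementation may be selected
// for this 5x5 kernel in FP32, trading a little accuracy for speed.
conv.configure(&src, &weights, nullptr, &dst, PadStrideInfo(1, 1, 2, 2),
               WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo(),
               true /* enable_fast_math */);
// ... allocate the tensors, fill src and weights, then call conv.run().
@endcode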
- @subsection tests_overview_fixtures Fixtures Fixtures can be used to share common setup, teardown or even run tasks among diff --git a/docs/03_scripts.dox b/docs/03_scripts.dox index efa6fa9813..7e16edfb0d 100644 --- a/docs/03_scripts.dox +++ b/docs/03_scripts.dox @@ -143,6 +143,11 @@ The arm_compute::utils::load_trained_data shows how one could load the weights and biases into a tensor from the .npy file with the help of Accessor. @section validate_examples Validating examples + +Compute Library provides a list of graph examples that are used in the context of integration and performance testing. +The provenance of each model is part of its documentation and no structural or data alterations have been applied to any +of them unless explicitly specified otherwise in the documentation. + Using one of the provided scripts will generate files containing the trainable parameters. You can validate a given graph example on a list of inputs by running: diff --git a/docs/04_adding_operator.dox b/docs/04_adding_operator.dox index c40aaa3828..13be712549 100644 --- a/docs/04_adding_operator.dox +++ b/docs/04_adding_operator.dox @@ -30,7 +30,7 @@ namespace arm_compute @tableofcontents @section S4_1_introduction Introduction -In ACL there are two main parts or modules: +In Compute Library there are two main parts or modules: - The core library consists of a low-level collection of algorithms implemented in C++ and optimized for Arm CPUs and GPUs. The core module is designed to be embedded in other projects and it doesn't perform any memory management or scheduling. - The runtime library is a wrapper of the core library and provides other additional features like memory management, multithreaded execution of workloads and allocation of the intermediate tensors. @@ -41,7 +41,7 @@ Apart from these components that get linked into the application, the sources al @section S4_1_supporting_new_operators Supporting new operators -Following are the steps involved in adding support for a new operator in ACL +Following are the steps involved in adding support for a new operator in Compute Library: - Add new data types (if required) - Add the kernel to the core library. - Add the function to the runtime library. @@ -52,7 +52,7 @@ Following are the steps involved in adding support for a new operator in ACL @subsection S4_1_1_add_datatypes Adding new data types -The ACL declares a few new datatypes related to ACL's domain, kernels, and functions in the library process Tensors and Images (Computer Vision functions). Tensors are multi-dimensional arrays with a maximum of Coordinates::num_max_dimensions dimensions; depending on the number of dimensions tensors can be interpreted as various objects. A scalar can be represented as a zero-dimensional tensor and a vector of numbers can be represented as a one-dimensional tensor. +Compute Library declares a few new datatypes related to its domain. Kernels and functions in the library process Tensors and Images (Computer Vision functions). Tensors are multi-dimensional arrays with a maximum of Coordinates::num_max_dimensions dimensions; depending on the number of dimensions tensors can be interpreted as various objects. A scalar can be represented as a zero-dimensional tensor and a vector of numbers can be represented as a one-dimensional tensor.
Furthermore, an image is just a 2D tensor, a 3D tensor can be seen as an array of images and a 4D tensor as a 2D array of images, etc. All the datatype classes or structures are grouped in the core library folder arm_compute/core, like @ref ITensor and @ref ITensorInfo (all the information of a tensor); TensorShape and simpler types are in arm_compute/core/Types.h. If an operator handles a new datatype, it must be added to the library. While adding a new data type to the library, it's necessary to implement the functions that enable printing: the to_string() method and the output stream insertion (<<) operator. Every datatype implements these two functions in utils/TypePrinter.h @@ -65,13 +65,13 @@ And for printing: @snippet utils/TypePrinter.h Print DataLayout type -In the ACL library, we use namespaces to group all the operators, functions, classes and interfaces. The main namespace to use is arm_compute. In the test suite, the test framework and the individual tests use nested namespaces like @ref test::validation or @ref test::benchmark to group the different purposes of various parts of the suite. +In Compute Library, we use namespaces to group all the operators, functions, classes and interfaces. The main namespace to use is arm_compute. In the test suite, the test framework and the individual tests use nested namespaces like @ref test::validation or @ref test::benchmark to group the different purposes of various parts of the suite. Utility functions, like conversion or type cast operators, that are shared by multiple operators are in arm_compute/core/Utils.h. Non-inlined function definitions go in the corresponding .cpp files in the src folder. Similarly, all common functions that process shapes, like calculating output shapes of an operator or shape conversions etc., are in arm_compute/core/utils/misc/ShapeCalculator.h. @subsection S4_1_2_add_kernel Add a kernel -As we mentioned at the beginning, the kernel is the implementation of the operator or algorithm partially using a specific programming language related to the backend we want to use. Adding a kernel in the library means implementing the algorithm in a SIMD technology like NEON or OpenCL. All kernels in ACL must implement a common interface IKernel or one of the specific subinterfaces. +As we mentioned at the beginning, the kernel is the implementation of the operator or algorithm partially using a specific programming language related to the backend we want to use. Adding a kernel in the library means implementing the algorithm in a SIMD technology like NEON or OpenCL. All kernels in Compute Library must implement a common interface IKernel or one of the specific subinterfaces. IKernel is the common interface for all the kernels in the core library; it contains the main methods to configure and run the kernel itself, such as window(), which returns the maximum window the kernel can be executed on, or is_parallelisable(), which indicates whether or not the kernel is parallelizable. If the kernel is parallelizable then the window returned by the window() method can be split into sub-windows which can then be run in parallel; otherwise, only the window returned by window() can be passed to the run method. There are specific interfaces for OpenCL and Neon: @ref ICLKernel, INEKernel (using INEKernel = @ref ICPPKernel).
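To make that contract concrete, below is a deliberately simplified sketch of a dispatch loop over a parallelisable kernel. window(), is_parallelisable(), Window::split_window() and ICPPKernel::run() are real interface entry points; the helper function itself, its name and the sequential loop are illustrative assumptions (a real scheduler such as @ref CPPScheduler::schedule hands each sub-window to a worker thread), and includes are omitted:

@code{.cpp}
// Hypothetical helper, for illustration only: split the kernel's maximum
// window along DimY and execute each sub-window in turn.
void naive_dispatch(arm_compute::ICPPKernel &kernel, int num_threads)
{
    using namespace arm_compute;
    const Window &max_window = kernel.window(); // maximum execution window
    const int     splits     = kernel.is_parallelisable() ? num_threads : 1;

    ThreadInfo info{};
    info.num_threads = splits;
    for(int t = 0; t < splits; ++t)
    {
        info.thread_id = t;
        // Each sub-window respects the kernel's start/end/step constraints.
        kernel.run(max_window.split_window(Window::DimY, t, splits), info);
    }
}
@endcode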
@@ -80,7 +80,7 @@ There are specific interfaces for OpenCL and Neon: @ref ICLKernel, INEKernel (us There are two other implementations of @ref IKernel, called @ref ICLSimpleKernel and INESimpleKernel; they are the interfaces for simple kernels that have just one input tensor and one output tensor. Creating a new kernel implies adding new files: -- arm_compute/core/CL/kernels/CLReshapeLayerKernel.h +- src/core/CL/kernels/CLReshapeLayerKernel.h - src/core/CL/cl_kernels/reshape_layer.cl - src/core/CL/kernels/CLReshapeLayerKernel.cpp - src/core/CL/CLKernelLibrary.cpp @@ -90,16 +90,16 @@ Neon kernel - src/core/NEON/kernels/NEReshapeLayerKernel.cpp We must register the new layer in the respective libraries: -- arm_compute/core/CL/CLKernels.h +- src/core/CL/CLKernels.h - arm_compute/core/NEON/NEKernels.h -These files contain the list of all kernels available in the corresponding ACL's backend, for example CLKernels: +These files contain the list of all kernels available in the corresponding Compute Library backend, for example CLKernels: @code{.cpp} ... -#include "arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h" +#include "src/core/CL/kernels/CLMinMaxLayerKernel.h" +#include "src/core/CL/kernels/CLMinMaxLocationKernel.h" ... -#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h" +#include "src/core/CL/kernels/CLReshapeLayerKernel.h" ... @endcode @@ -138,7 +138,7 @@ If the tests are already in place, the new kernel can be tested using the existi - (sub[n].end() - sub[n].start()) % max[n].step() == 0 @ref CPPScheduler::schedule provides a sample implementation that is used for NEON kernels. -%Memory management is the other aspect that the runtime layer is supposed to handle. %Memory management of the tensors is abstracted using TensorAllocator. Each tensor holds a pointer to a TensorAllocator object, which is used to allocate and free the memory at runtime. The implementation that is currently supported in ACL allows memory blocks, required to be fulfilled for a given operator, to be grouped together under a @ref MemoryGroup. Each group can be acquired and released. The underlying implementation of memory groups vary depending on whether NEON or CL is used. The memory group class uses memory pool to provide the required memory. It also uses the memory manager to manage the lifetime and a IPoolManager to manage the memory pools registered with the memory manager. +%Memory management is the other aspect that the runtime layer is supposed to handle. %Memory management of the tensors is abstracted using TensorAllocator. Each tensor holds a pointer to a TensorAllocator object, which is used to allocate and free the memory at runtime. The implementation that is currently supported in Compute Library allows memory blocks, required to be fulfilled for a given operator, to be grouped together under a @ref MemoryGroup. Each group can be acquired and released. The underlying implementation of memory groups varies depending on whether NEON or CL is used. The memory group class uses a memory pool to provide the required memory. It also uses the memory manager to manage the lifetimes and an IPoolManager to manage the memory pools registered with the memory manager. We have seen the various interfaces for a kernel in the core library; the same file structure design exists in the runtime module.
IFunction is the base class for all the functions; it has two child interfaces, ICLSimpleFunction and INESimpleFunction, that are used as base classes for functions which call a single kernel. @@ -268,7 +268,7 @@ The reference implementation consists of two files in the folder tests/validation where we will put the declaration and definition of the new operator, respectively. All the utility functions that are used ONLY in the tests are in test/validation/helpers.h; for all the others, as mentioned before, there are helpers in the library. -ACL and the tests do use templates, the reference implementation is a generic implementation independent from the datatype and we use the templates to generalize the datatype concept. +Compute Library and the tests make use of templates: the reference implementation is generic and independent of the datatype, and we use templates to generalize the datatype concept. Following the example, let's have a look at the ReshapeLayer operator: - tests/validation/reference/ReshapeLayer.h diff --git a/docs/05_contribution_guidelines.dox b/docs/05_contribution_guidelines.dox index abe0bc90b5..1cdd129733 100644 --- a/docs/05_contribution_guidelines.dox +++ b/docs/05_contribution_guidelines.dox @@ -358,6 +358,52 @@ std::memcpy(out.ptr(), in.ptr(), element_size); - **Sanitize data sent to other systems**. Sanitize all data passed to complex subsystems such as command shells, relational databases, and commercial off-the-shelf (COTS) components. Attackers may be able to invoke unused functionality in these components through the use of various injection attacks. This is not necessarily an input validation problem because the complex subsystem being invoked does not understand the context in which the call is made. Because the calling process understands the context, it is responsible for sanitizing the data before invoking the subsystem. - **Practice defense in depth**. Manage risk with multiple defensive strategies, so that if one layer of defense turns out to be inadequate, another layer of defense can prevent a security flaw from becoming an exploitable vulnerability and/or limit the consequences of a successful exploit. For example, combining secure programming techniques with secure runtime environments should reduce the likelihood that vulnerabilities remaining in the code at deployment time can be exploited in the operational environment. +@subsection S5_1_5_guidelines_for_stable_api_abi Guidelines for stable API/ABI + +The Application Programming Interface (API) and Application Binary Interface (ABI) are the interfaces exposed +to users so their programs can interact with the library efficiently and effectively. Even though breaking API/ABI +backward compatibility is not necessarily bad if it improves other users' experience and the library, +contributions should be made with an awareness of API/ABI stability. If you'd like to make changes that affect +the library's API/ABI, please review and follow the guidelines shown in this section. Also, please note that +these guidelines are not an exhaustive list, but discuss things that might easily be overlooked. + +@subsubsection S5_1_5_1_guidelines_for_api Guidelines for API + +- When adding new arguments, consider grouping arguments (including the old ones) into a struct rather than adding arguments with default values (see the sketch at the end of this section). +Introducing a new struct might break the API/ABI once, but it will help keep them stable afterwards.
+- When new member variables are added, please make sure they are initialized. +- Avoid adding enum elements in the middle. +- When removing arguments, follow the deprecation process described in the following section. +- When changing behavior affecting API contracts, follow the deprecation process described in the following section. + +@subsubsection S5_1_5_2_guidelines_for_abi Guidelines for ABI + +We recommend reading through this page +and double-checking your contributions to see if they include any of the changes listed. + +Also, for classes that require strong ABI stability, consider using the pImpl idiom. + +@subsubsection S5_1_5_3_api_deprecation_process API deprecation process + +In order to deprecate an existing API, these rules should be followed: + +- Removal of a deprecated API should wait for at least one official release. +- Deprecation of runtime APIs should strictly follow the aforementioned period, whereas core APIs can have more flexibility as they are mostly used internally rather than user-facing. +- Any API changes (update, addition and deprecation) in all components should be well documented by the contribution itself. + +Also, it is recommended to use the following utility macros, which are designed to work with both clang and gcc using C++11 and later: + +- ARM_COMPUTE_DEPRECATED: Just deprecate the wrapped function +- ARM_COMPUTE_DEPRECATED_REL: Deprecate the wrapped function and also capture the release in which it was deprecated +- ARM_COMPUTE_DEPRECATED_REL_REPLACE: Deprecate the wrapped function, capture the release in which it was deprecated and name a possible replacement candidate + +@code{.cpp} +ARM_COMPUTE_DEPRECATED_REL_REPLACE(20.08, DoNewThing) +void DoOldThing(); + +void DoNewThing(); +@endcode + @section S5_2_how_to_submit_a_patch How to submit a patch To be able to submit a patch to our development repository you need to have a GitHub account. With that, you will be able to sign in to Gerrit where your patch will be reviewed.
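Returning to the argument-grouping guideline above, the following is a hypothetical sketch of evolving a configure() signature without growing its argument list; the struct name and members are invented for illustration and are not Compute Library API:

@code{.cpp}
// Before: every new option forces another trailing default argument.
// void configure(ITensor *src, ITensor *dst, const PadStrideInfo &info,
//                const Size2D &dilation = Size2D(1U, 1U));

// After: group the options once. New members can be added to the struct
// later, arriving already initialized, without touching the signature.
struct Conv2dParams // hypothetical name, for illustration only
{
    PadStrideInfo pad_stride{};
    Size2D        dilation{ 1U, 1U }; // new members must be initialized
};

void configure(ITensor *src, ITensor *dst, const Conv2dParams &params);
@endcode

Grouping this way breaks the API once, when the struct is introduced, but subsequent additions no longer change the function's signature or its existing callers.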
diff --git a/docs/06_functions_list.dox b/docs/06_functions_list.dox index ac944610dc..c8006c6c3d 100644 --- a/docs/06_functions_list.dox +++ b/docs/06_functions_list.dox @@ -54,6 +54,9 @@ namespace arm_compute - @ref NEExpLayer - @ref NEGaussian3x3 - @ref NEIntegralImage + - @ref NELogicalAnd + - @ref NELogicalNot + - @ref NELogicalOr - @ref NEMedian3x3 - @ref NENonLinearFilter - @ref NENonMaximaSuppression3x3 @@ -141,8 +144,8 @@ namespace arm_compute - @ref NEGaussianPyramidOrb - @ref NEGEMM - @ref NEGEMMAssemblyDispatch + - @ref NEGEMMConv2d - @ref NEGEMMConvolutionLayer - - @ref NEGEMMLowpAssemblyMatrixMultiplyCore - @ref NEGEMMLowpMatrixMultiplyCore - @ref NEGenerateProposalsLayer - @ref NEHarrisCorners @@ -173,7 +176,6 @@ namespace arm_compute - @ref NERNNLayer - @ref NEROIPoolingLayer - @ref NEScale - - @ref NESimpleAssemblyFunction - @ref NESobel5x5 - @ref NESobel7x7 - @ref NESoftmaxLayerGeneric <IS_LOG> @@ -231,6 +233,9 @@ namespace arm_compute - @ref CLLaplacianPyramid - @ref CLLaplacianReconstruct - @ref CLLocallyConnectedLayer + - @ref CLLogicalAnd + - @ref CLLogicalNot + - @ref CLLogicalOr - @ref CLLSTMLayer - @ref CLLSTMLayerQuantized - @ref CLQLSTMLayer @@ -300,6 +305,7 @@ namespace arm_compute - @ref CLGaussian3x3 - @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint - @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint + - @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel - @ref CLMagnitude - @ref CLMeanStdDevNormalizationLayer - @ref CLMedian3x3 diff --git a/docs/07_errata.dox b/docs/07_errata.dox index 2d35e67986..994b8c5bd7 100644 --- a/docs/07_errata.dox +++ b/docs/07_errata.dox @@ -1,5 +1,5 @@ /// -/// Copyright (c) 2019 Arm Limited. +/// Copyright (c) 2019-2020 Arm Limited. /// /// SPDX-License-Identifier: MIT /// @@ -30,6 +30,11 @@ namespace arm_compute @section S7_1_errata Errata +- Under certain conditions, the validation test case 'CL/DirectConvolutionLayer/Float/FP32/RunSmall9x9\@InputShape=32x37x3x4:StrideX=1:StrideY=1:PadX=0:PadY=0:KernelSize=9:NumKernels=1:DataType=F32:ActivationInfo=LU_BOUNDED_RELU:DataLayout=NHWC' may fail. + - Versions Affected: >= v20.08 + - Conditions: + - The validation suite has to run in nightly mode and execute 40k+ test cases before the test mentioned above + - Under certain conditions, benchmark examples can hang when OpenCL profiling queues are enabled. - Versions Affected: >= v19.11 - OSs Affected: Linux diff --git a/docs/ComputeLibrary.dir b/docs/ComputeLibrary.dir new file mode 100644 index 0000000000..7733e531cd --- /dev/null +++ b/docs/ComputeLibrary.dir @@ -0,0 +1,360 @@ +// +// Copyright © 2020 Arm Ltd. All rights reserved. +// SPDX-License-Identifier: MIT +// + +/** @file Android.bp + * @brief Generation script for building AndroidNN driver. + */ + +/** @dir arm_compute + * @brief All the arm_compute headers. + */ + +/** @dir arm_compute/core + * @brief Core module: common basic types and kernels. + */ + +/** @dir arm_compute/core/CL + * @brief OpenCL backend core: kernels and utilities. + */ + +/** @file arm_compute/core/CL/CLKernelLibrary.h + * @brief Manages all the OpenCL kernels compilation and caching, provides accessors for the OpenCL Context. + */ + +/** @file arm_compute/core/CL/OpenCL.h + * @brief Wrapper to configure the Khronos OpenCL C++ header + */ + +/** @dir arm_compute/core/CPP + * @brief CPP backend core: kernels and utilities. 
+ */ + +/** @file arm_compute/core/CPP/CPPKernels.h + * @brief Includes all the CPP kernels at once + */ + +/** @dir arm_compute/core/CPP/kernels + * @brief Folder containing all the CPP kernels + */ + +/** @dir arm_compute/core/experimental + * @brief All experimental interfaces + */ + +/** @dir arm_compute/core/GLES_COMPUTE + * @brief OpenGLES backend core: kernels and utilities. + */ + +/** @file arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h + * @brief Manages all the GLES kernels compilation and caching, provides accessors for the GLES Context. + */ + +/** @file arm_compute/core/GLES_COMPUTE/GCKernels.h + * @brief Includes all the GLES kernels at once + */ + +/** @file arm_compute/core/GLES_COMPUTE/OpenGLES.h + * @brief Wrapper to configure the Khronos EGL and OpenGL ES C header + */ + +/** @dir arm_compute/core/GLES_COMPUTE/kernels + * @brief Folder containing all the GLES kernels + */ + +/** @dir src/core/NEON + * @brief NEON backend core: kernels and utilities. + */ + +/** @file src/core/NEON/NEKernels.h + * @brief Includes all the NEON kernels at once + */ + +/** @dir src/core/NEON/kernels + * @brief Folder containing all the NEON kernels + */ + +/** @dir arm_compute/core/utils + * @brief Common core utilities. + */ + +/** @dir arm_compute/graph + * @brief Graph API. + */ + +/** @dir arm_compute/graph/algorithms + * @brief Generic algorithms used by the graph backend (e.g. order of traversal) + */ + +/** @dir arm_compute/graph/backends + * @brief The backend specific code + */ + +/** @dir arm_compute/graph/backends/CL + * @brief OpenCL specific operations + */ + +/** @dir arm_compute/graph/backends/GLES + * @brief OpenGLES specific operations + */ + +/** @dir arm_compute/graph/backends/NEON + * @brief NEON specific operations + */ + +/** @dir arm_compute/graph/detail + * @brief Collection of internal utilities. + */ + +/** @dir arm_compute/graph/frontend + * @brief Code related to the stream frontend interface. + */ + +/** @dir arm_compute/graph/mutators + * @brief Used to modify / optimise the Graph intermediate representation (operator fusion, in-place operations, etc.) + */ + +/** @dir arm_compute/graph/nodes + * @brief The various nodes supported by the graph API. + */ + +/** @dir arm_compute/graph/printers + * @brief Debug printers. + */ + +/** @file arm_compute/graph.h + * @brief Includes all the Graph headers at once. + */ + +/** @dir arm_compute/runtime + * @brief Runtime interface: memory, scheduler, functions. + */ + +/** @dir arm_compute/runtime/CL + * @brief OpenCL backend runtime interface. + */ + +/** @file arm_compute/runtime/CL/CLFunctions.h + * @brief Includes all the OpenCL functions at once + */ + +/** @file arm_compute/runtime/CL/CLScheduler.h + * @brief Interface to enqueue OpenCL kernels and get/set the OpenCL CommandQueue and ICLTuner. + */ + +/** @file arm_compute/runtime/CL/ICLTuner.h + * @brief Interface used to tune the local work-group size of OpenCL kernels. + */ + +/** @dir arm_compute/runtime/CL/functions + * @brief Folder containing all the OpenCL functions. + */ + +/** @dir arm_compute/runtime/CL/tuners + * @brief Local workgroup size tuners for specific architectures / GPUs. + */ + +/** @dir arm_compute/runtime/CPP + * @brief CPP backend runtime interface. + */ + +/** @file arm_compute/runtime/CPP/CPPScheduler.h + * @brief Basic pool of threads to execute CPP/NEON code on several cores in parallel. + */ + +/** @dir arm_compute/runtime/CPP/functions + * @brief Folder containing all the CPP functions.
+ */ + +/** @dir arm_compute/runtime/experimental + * @brief Experimental runtime interface. + */ + +/** @dir arm_compute/runtime/GLES_COMPUTE + * @brief OpenGLES backend runtime interface. + */ + +/** @file arm_compute/runtime/GLES_COMPUTE/GCFunctions.h + * @brief Includes all the OpenGLES functions at once + */ + +/** @file arm_compute/runtime/GLES_COMPUTE/GCScheduler.h + * @brief Interface to enqueue GLES kernels and get/set the GLES CommandQueue. + */ + +/** @dir arm_compute/runtime/GLES_COMPUTE/functions + * @brief Folder containing all the GLES functions. + */ + +/** @dir arm_compute/runtime/NEON + * @brief NEON backend runtime interface. + */ + +/** @file arm_compute/runtime/NEON/NEFunctions.h + * @brief Includes all the NEON functions at once. + */ + +/** @dir arm_compute/runtime/NEON/functions + * @brief Folder containing all the NEON functions. + */ + +/** @dir arm_compute/runtime/OMP + * @brief OpenMP backend runtime interface. + */ + +/** @file arm_compute/runtime/OMP/OMPScheduler.h + * @brief OpenMP scheduler (Alternative to the CPPScheduler). + */ + +/** @dir arm_compute/runtime/common + * @brief Common utility code used by all backends. + */ + +/** @dir docs + * @brief Doxyfile and Doxygen sources used to generate this documentation. + */ + +/** @dir ./examples + * @brief Set of examples using the Compute Library + * + * @details Examples have the following structure: + * + * -# cl_*.cpp --> OpenCL examples + * -# gc_*.cpp --> GLES compute shaders examples + * -# graph_*.cpp --> Graph examples + * -# neoncl_*.cpp --> NEON / OpenCL interoperability examples + * -# neon_*.cpp --> NEON examples + */ + +/** @dir examples/gemm_tuner + * @brief OpenCL GEMM tuner utility. + */ + +/** @dir scripts + * @brief Utility scripts. + */ + +/** @file scripts/caffe_data_extractor.py + * @brief Basic script to export weights from Caffe to npy files. + */ + +/** @file scripts/tensorflow_data_extractor.py + * @brief Basic script to export weights from TensorFlow to npy files. + */ + +/** @dir src + * @brief Source code implementing all the arm_compute headers. + */ + +/** @dir src/core/NEON/kernels/detail + * @brief Common code for several intrinsics implementations. + */ + +/** @dir src/core/NEON/wrapper + * @brief NEON wrapper used to simplify code + */ + +/** @file src/core/NEON/wrapper/traits.h + * @brief Traits defined on NEON vectors + */ + +/** @file src/core/NEON/wrapper/wrapper.h + * @brief Includes all wrapper headers at once + */ + +/** @dir src/core/NEON/wrapper/intrinsics + * @brief NEON intrinsics wrappers + */ + +/** @dir src/core/NEON/wrapper/scalar + * @brief Scalar operations + */ + +/** @dir src/core/CL/gemm + * @brief Folder containing all the configuration files for GEMM + */ + +/** @dir src/core/CL/kernels + * @brief All the OpenCL kernels + */ + +/** @dir support + * @brief Various headers to work around toolchains / platform issues. + */ + +/** @dir tests + * @brief All test related files shared between validation and benchmark. + */ + +/** @file tests/main.cpp + * @brief Main entry point for the tests. Currently shared between validation and benchmarking. + */ + +/** @dir tests/CL + * @brief OpenCL accessors. + */ + +/** @dir tests/GLES_COMPUTE + * @brief GLES accessors. + */ + +/** @dir tests/NEON + * @brief NEON accessors. + */ + +/** @dir tests/benchmark + * @brief Sources for benchmarking. + */ + +/** @dir tests/benchmark/CL + * @brief OpenCL benchmarking tests. + */ + +/** @dir tests/benchmark/GLES_COMPUTE + * @brief GLES benchmarking tests. 
+ */ + +/** @dir tests/benchmark/NEON + * @brief NEON benchmarking tests. + */ + +/** @dir tests/benchmark_examples + * @brief Sources needed to wrap examples to run through our benchmarking framework. + */ + +/** @dir tests/framework + * @brief Boiler plate code for both validation and benchmark test suites (Command line parsers, instruments, output loggers, etc.) + */ + +/** @dir tests/instruments + * @brief User defined instruments that can be registered to the framework. + */ + +/** @dir tests/validate_examples + * @brief Sources needed to wrap examples to run through our validation framework. + */ + +/** @dir tests/validation + * @brief Source for validation. + */ + +/** @dir tests/validation/CL + * @brief OpenCL validation tests. + */ + +/** @dir tests/validation/CPP + * @brief C++ validation tests. + */ + +/** @dir tests/validation/GLES_COMPUTE + * @brief GLES validation tests. + */ + +/** @dir tests/validation/NEON + * @brief NEON validation tests. + */ + +/** @dir tests/validation/reference + * @brief Reference implementation used to validate the results of the various backends. + */ diff --git a/docs/Doxyfile b/docs/Doxyfile index ef8966c5be..323ed2101c 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "Compute Library" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 20.08 +PROJECT_NUMBER = 20.11 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a @@ -291,7 +291,7 @@ OPTIMIZE_OUTPUT_VHDL = NO # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. -EXTENSION_MAPPING = cl=C +EXTENSION_MAPPING = cl=C bp=C dir=C # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable @@ -776,11 +776,14 @@ INPUT = ./docs/00_introduction.dox \ ./docs/05_contribution_guidelines.dox \ ./docs/06_functions_list.dox \ ./docs/07_errata.dox \ + ./docs/ComputeLibrary.dir \ ./arm_compute/ \ ./src/ \ ./examples/ \ ./tests/ \ ./utils/ \ + ./Android.bp \ + ./scripts \ ./support/ # This tag can be used to specify the character encoding of the source files @@ -843,6 +846,8 @@ FILE_PATTERNS = *.c \ *.qsf \ *.as \ *.js \ + *.bp \ + *.dir \ *.cl # The RECURSIVE tag can be used to specify whether or not subdirectories should diff --git a/examples/SConscript b/examples/SConscript index e28761cd50..225abbb9b1 100644 --- a/examples/SConscript +++ b/examples/SConscript @@ -48,6 +48,10 @@ else: arm_compute_dependency = arm_compute_so graph_dependency = [arm_compute_graph_so] +extra_link_flags = [] +if env['os'] != 'bare_metal': + extra_link_flags += ['-fstack-protector-strong'] + # Build graph examples graph_utils = examples_env.Object("../utils/GraphUtils.cpp") graph_utils += examples_env.Object("../utils/CommonGraphOptions.cpp") @@ -57,7 +61,7 @@ for file in Glob("./graph_*.cpp"): prog = None if env['os'] in ['android', 'bare_metal'] or env['standalone']: - prog = examples_env.Program(example, ["{}.cpp".format(example), utils, graph_utils], LIBS = examples_libs + arm_compute_graph_libs, LINKFLAGS=examples_env["LINKFLAGS"]+['-Wl,--whole-archive',graph_dependency,'-Wl,--no-whole-archive', '-fstack-protector-strong']) + prog = examples_env.Program(example, ["{}.cpp".format(example), utils, graph_utils], LIBS = examples_libs + arm_compute_graph_libs, 
LINKFLAGS=examples_env["LINKFLAGS"]+['-Wl,--whole-archive',graph_dependency,'-Wl,--no-whole-archive'] + extra_link_flags) Depends(prog, graph_dependency) prog = install_bin(prog) else: @@ -109,7 +113,7 @@ if env['neon']: prog = None if env['os'] in ['bare_metal']: - prog = examples_env.Program(example, ["{}.cpp".format(example), utils], LINKFLAGS=examples_env["LINKFLAGS"]+['-fstack-protector'], LIBS = examples_libs + arm_compute_libs) + prog = examples_env.Program(example, ["{}.cpp".format(example), utils], LINKFLAGS=examples_env["LINKFLAGS"], LIBS = examples_libs + arm_compute_libs) else: prog = examples_env.Program(example, ["{}.cpp".format(example), utils], LIBS = examples_libs + arm_compute_libs) @@ -126,3 +130,4 @@ if env['gles_compute']: prog = install_bin(prog) alias = examples_env.Alias(example, prog) Default(alias) + diff --git a/examples/cl_cache.cpp b/examples/cl_cache.cpp index 37e1c270d7..6de62f7c5d 100644 --- a/examples/cl_cache.cpp +++ b/examples/cl_cache.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,12 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/runtime/CL/CLFunctions.h" - +#include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/Utils.h" +#include "arm_compute/runtime/CL/functions/CLPermute.h" #include "utils/Utils.h" using namespace arm_compute; diff --git a/examples/cl_convolution.cpp b/examples/cl_convolution.cpp index 34b3466f77..bfa53f3379 100644 --- a/examples/cl_convolution.cpp +++ b/examples/cl_convolution.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,9 +25,10 @@ #error "This example needs to be built with -DARM_COMPUTE_CL" #endif /* ARM_COMPUTE_CL */ +#include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/CL/CLFunctions.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/functions/CLConvolution.h" #include "utils/ImageLoader.h" #include "utils/Utils.h" diff --git a/examples/cl_events.cpp b/examples/cl_events.cpp index f578180869..27c063cbc9 100644 --- a/examples/cl_events.cpp +++ b/examples/cl_events.cpp @@ -26,8 +26,10 @@ #endif /* ARM_COMPUTE_CL */ #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/CL/CLFunctions.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" +#include "arm_compute/runtime/CL/functions/CLMedian3x3.h" +#include "arm_compute/runtime/CL/functions/CLScale.h" #include "utils/ImageLoader.h" #include "utils/Utils.h" diff --git a/examples/cl_sgemm.cpp b/examples/cl_sgemm.cpp index 7d3b4fe97f..27af228954 100644 --- a/examples/cl_sgemm.cpp +++ b/examples/cl_sgemm.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,9 +26,9 @@ #endif /* ARM_COMPUTE_CL */ #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/CL/CLFunctions.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTuner.h" +#include "arm_compute/runtime/CL/functions/CLGEMM.h" #include "utils/Utils.h" #include diff --git a/examples/gemm_tuner/CommonGemmExampleOptions.h b/examples/gemm_tuner/CommonGemmExampleOptions.h index 04a8f22be6..5c4be286d6 100644 --- a/examples/gemm_tuner/CommonGemmExampleOptions.h +++ b/examples/gemm_tuner/CommonGemmExampleOptions.h @@ -25,7 +25,7 @@ #define ARM_COMPUTE_EXAMPLES_GEMM_TUNER_COMMON_GEMM_EXAMPLE_OPTIONS #include "arm_compute/core/Types.h" -#include "arm_compute/graph/TypeLoader.h" +#include "arm_compute/core/Utils.h" #include "utils/TypePrinter.h" #include "utils/command_line/CommandLineOptions.h" #include "utils/command_line/CommandLineParser.h" diff --git a/examples/gemm_tuner/GemmTunerHelpers.h b/examples/gemm_tuner/GemmTunerHelpers.h new file mode 100644 index 0000000000..23cf14cf18 --- /dev/null +++ b/examples/gemm_tuner/GemmTunerHelpers.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef EXAMPLES_GEMMTUNERHELPERS_H +#define EXAMPLES_GEMMTUNERHELPERS_H + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" + +namespace examples +{ +namespace gemm_tuner_helpers +{ +// Extends the tensor's right-hand padding so that its row pitch satisfies the +// device's cl_image pitch alignment, allowing the buffer to be exported to a cl_image. +void update_padding_for_cl_image(arm_compute::ITensorInfo *tensor) +{ + constexpr unsigned int num_floats_per_pixel = 4; + + const unsigned int stride_y_in_elements = tensor->strides_in_bytes()[1] / tensor->element_size(); + const unsigned int pixel_alignment = arm_compute::get_cl_image_pitch_alignment( + arm_compute::CLKernelLibrary::get().get_device()); + // Round the row length (in elements) up to the required row-pitch alignment. + const unsigned int row_pitch_alignment = pixel_alignment * num_floats_per_pixel; + const unsigned int round_up_width = + ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment; + const unsigned int padding = round_up_width - stride_y_in_elements; + + tensor->extend_padding(arm_compute::PaddingSize(0, padding, 0, 0)); +} +} // namespace gemm_tuner_helpers +} // namespace examples + +#endif /* EXAMPLES_GEMMTUNERHELPERS_H */ diff --git a/examples/gemm_tuner/README.md b/examples/gemm_tuner/README.md index 1effd2f7e1..73bddc9239 100644 --- a/examples/gemm_tuner/README.md +++ b/examples/gemm_tuner/README.md @@ -34,7 +34,7 @@ what kernel and subsequently what configurations for that kernels are the most p ### Step1: Prepare the shape and configs files 1. We first need to identify the shapes that we are interested in and store them in a csv file, say *gemm_shapes.csv*. 2. Then we need to specify a set of good GEMMConfig candidates for each kernel in 3 separate csv files (this requires - some prior heuristics, but can be provided by the ACL developers upon requests, based on your target device). + some prior heuristics, but can be provided by the Compute Library developers upon request, based on your target device). Say we have *gemm_configs_native.csv", "gemm_configs_reshaped.csv" and "gemm_configs_reshaped_only_rhs.csv". @@ -42,9 +42,9 @@ what kernel and subsequently what configurations for that kernels are the most p ### Step2: Push relevant files to the target device All the files that need to be present on the target device are: -* benchmark script: \/examples/gemm_tuner/benchmark_gemm_examples.sh +* benchmark script: \/examples/gemm_tuner/benchmark_gemm_examples.sh * shapes and configs csv files: gemm_shapes.csv, gemm_configs_native.csv, gemm_configs_reshaped_only_rhs.csv, gemm_configs_reshaped.csv -* Example benchmark binaries: \/build/tests/gemm_tuner/benchmark_cl_gemm* +* Example benchmark binaries: \/build/tests/gemm_tuner/benchmark_cl_gemm* ### Step3: Collect benchmark data With these files on device, we can collect benchmark data using the script. Assume all the example binaries are pushed @@ -64,7 +64,7 @@ but you may need to change the output folder for each repeat 1. After benchmarking, we pull the benchmark data, the *results* folder, from the target device to our host machine 2. We use the GemmTuner.py script to give us the heuristics ``` - python3 /examples/gemm_tuner/GemmTuner.py -b ./results -o heuristics + python3 /examples/gemm_tuner/GemmTuner.py -b ./results -o heuristics ``` When it's finished, there should be 4 json files in the *heuristics* folder @@ -76,12 +76,12 @@ passing a lower value to *-t \* to the GemmTuner.py script.
* A target device to be tuned, plus the following on the device: * Android or Linux OS * Bash shell - * Built ACL with benchmark examples binaries + * Built Compute Library with benchmark examples binaries * benchmark_gemm_examples.sh script * gemm shape file A csv file containing the **GEMMParam search list**. This is the list of GEMMParams/gemm shapes that we're - interested in (For more details see Approach section). The default list is prepared by ACL developers in advance + interested in (for more details, see the Approach section). The default list is prepared by Compute Library developers in advance and can be provided on request. The format is described as: @@ -105,7 +105,7 @@ passing a lower value to *-t \* to the GemmTuner.py script. * gemm config file A csv file containing the **GEMMConfig search list**. This is the list of candidate GEMMConfigs among which we search for the optimal one. **Note that we have a different list for each strategy.** - The default lists are prepared by ACL developers in advance and can be provided on request. + The default lists are prepared by Compute Library developers in advance and can be provided on request. The format of the file for each strategy is the same: diff --git a/examples/gemm_tuner/cl_gemm_native.cpp b/examples/gemm_tuner/cl_gemm_native.cpp index 43035082a4..02f144ea12 100644 --- a/examples/gemm_tuner/cl_gemm_native.cpp +++ b/examples/gemm_tuner/cl_gemm_native.cpp @@ -26,14 +26,13 @@ #endif /* ARM_COMPUTE_CL */ #include "CommonGemmExampleOptions.h" -#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/CL/CLFunctions.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTuner.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h" #include "tests/CL/Helper.h" #include "utils/Utils.h" #include "utils/command_line/CommandLineOptions.h" diff --git a/examples/gemm_tuner/cl_gemm_reshaped.cpp b/examples/gemm_tuner/cl_gemm_reshaped.cpp index 9c6568cffb..a4d6203d5c 100644 --- a/examples/gemm_tuner/cl_gemm_reshaped.cpp +++ b/examples/gemm_tuner/cl_gemm_reshaped.cpp @@ -25,17 +25,16 @@ #error "This example needs to be built with -DARM_COMPUTE_CL" #endif /* ARM_COMPUTE_CL */ -#include "CommonGemmExampleOptions.h" -#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h" -#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/CL/CLFunctions.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTuner.h" +#include "examples/gemm_tuner/CommonGemmExampleOptions.h" +#include "examples/gemm_tuner/GemmTunerHelpers.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" #include "tests/CL/Helper.h" #include "utils/Utils.h" #include "utils/command_line/CommandLineOptions.h" @@ -254,14 +253,14 @@ class CLGEMMMatrixMultiplyReshapedExample : public Example kernel_info.activation_info = act_info; // Initialise lhs_reshaped tensor info - auto_init_if_empty(*lhs_reshaped.info(),
lhs.info()->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*lhs.info(), lhs_info))); + lhs_reshaped.allocator()->init(TensorInfo(compute_lhs_reshaped_shape(*lhs.info(), lhs_info), 1, params.data_type)); // Initialise rhs_reshaped tensor info - auto_init_if_empty(*rhs_reshaped.info(), rhs.info()->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*rhs.info(), rhs_info))); + rhs_reshaped.allocator()->init(TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); if(rhs_info.export_to_cl_image) { - arm_compute::cl_gemm::update_padding_for_cl_image(rhs_reshaped.info()); + examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info()); } // Validate arguments diff --git a/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp b/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp index f814c541c4..cf65d0dd33 100644 --- a/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp +++ b/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp @@ -26,15 +26,14 @@ #endif /* ARM_COMPUTE_CL */ #include "CommonGemmExampleOptions.h" -#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h" -#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "GemmTunerHelpers.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/CL/CLFunctions.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTuner.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" #include "tests/CL/Helper.h" #include "utils/Utils.h" #include "utils/command_line/CommandLineOptions.h" @@ -224,11 +223,11 @@ class CLGEMMMatrixMultiplyReshapedOnlyRHSExample : public Example kernel_info.activation_info = act_info; // Initialise rhs_reshaped tensor info - auto_init_if_empty(*rhs_reshaped.info(), rhs.info()->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*rhs.info(), rhs_info))); + rhs_reshaped.allocator()->init(TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); if(rhs_info.export_to_cl_image) { - arm_compute::cl_gemm::update_padding_for_cl_image(rhs_reshaped.info()); + examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info()); } // Validate arguments diff --git a/examples/graph_edsr.cpp b/examples/graph_edsr.cpp index 3868f0f7c4..77783d97ed 100644 --- a/examples/graph_edsr.cpp +++ b/examples/graph_edsr.cpp @@ -102,6 +102,27 @@ class GraphEdsrExample : public Example GraphEdsr model{}; }; +/** Internal implementation of UINT8 EDSR with some modifications from the paper. + * The sub-pixel convolution has been replaced with a deconvolution layer. This + * operation is mathematically the same. + * + * Convolution replaced by deconvolution: + * https://arxiv.org/abs/1609.07009 + * "Is the deconvolution layer the same as a convolutional layer?"
+ * Wenzhe Shi, Jose Caballero, Lucas Theis, Ferenc Huszar, Andrew Aitken, Christian Ledig, Zehan Wang + * + * Original model is: + * https://arxiv.org/abs/1707.02921 + * "Enhanced Deep Residual Networks for Single Image Super-Resolution" + * Bee Lim, Sanghyun Son, Heewon Kim, Seungjun Nah, Kyoung Mu Lee + * + * @note To list all the possible arguments execute the binary appended with the --help option + * + * @param[in] argc Number of arguments + * @param[in] argv Arguments + * + * @return Return code + */ int main(int argc, char **argv) { return run_example(argc, argv); diff --git a/examples/graph_edsr.h b/examples/graph_edsr.h index 42a2789861..72012afdcb 100644 --- a/examples/graph_edsr.h +++ b/examples/graph_edsr.h @@ -105,14 +105,10 @@ class GraphEdsr node_post_residual_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/post_residual_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC)); - TensorShape scalar_4d_shape{}; - - scalar_4d_shape.set(0, 1, false).set(1, 1, false).set(2, 1, false).set(3, 1, false); - NodeID id_mul_15_y = _graph.add_node( TensorDescriptor { - scalar_4d_shape, + TensorShape{ 1 }, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC }); @@ -146,7 +142,7 @@ class GraphEdsr NodeID id_mul_14_y = _graph.add_node( TensorDescriptor { - scalar_4d_shape, + TensorShape{ 1 }, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC }); @@ -180,7 +176,7 @@ class GraphEdsr NodeID id_mul_13_y = _graph.add_node( TensorDescriptor { - scalar_4d_shape, + TensorShape{ 1 }, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC }); @@ -214,7 +210,7 @@ class GraphEdsr NodeID id_mul_12_y = _graph.add_node( TensorDescriptor { - scalar_4d_shape, + TensorShape{ 1 }, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC }); @@ -248,7 +244,7 @@ class GraphEdsr NodeID id_mul_11_y = _graph.add_node( TensorDescriptor { - scalar_4d_shape, + TensorShape{ 1 }, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC }); @@ -282,7 +278,7 @@ class GraphEdsr NodeID id_mul_10_y = _graph.add_node( TensorDescriptor { - scalar_4d_shape, + TensorShape{ 1 }, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC }); @@ -316,7 +312,7 @@ class GraphEdsr NodeID id_mul_9_y = _graph.add_node( TensorDescriptor { - scalar_4d_shape, + TensorShape{ 1 }, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC }); @@ -350,7 +346,7 @@ class GraphEdsr NodeID id_mul_8_y = _graph.add_node( TensorDescriptor { - scalar_4d_shape, + TensorShape{ 1 }, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC }); @@ -384,7 +380,7 @@ class GraphEdsr NodeID id_mul_7_y = _graph.add_node( TensorDescriptor { - scalar_4d_shape, + TensorShape{ 1 }, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC }); @@ -418,7 +414,7 @@ class GraphEdsr NodeID id_mul_6_y = _graph.add_node( TensorDescriptor { - scalar_4d_shape, + TensorShape{ 1 }, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC }); @@ -452,7 +448,7 @@ class GraphEdsr NodeID id_mul_5_y = _graph.add_node( TensorDescriptor { - scalar_4d_shape, + TensorShape{ 1 }, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC }); @@ -486,7 +482,7 @@ class GraphEdsr NodeID id_mul_4_y = _graph.add_node( TensorDescriptor { - scalar_4d_shape, + TensorShape{ 1 }, DataType::QASYMM8, 
QuantizationInfo(0.0003921568568330258), DataLayout::NHWC }); @@ -520,7 +516,7 @@ class GraphEdsr NodeID id_mul_3_y = _graph.add_node<ConstNode>( TensorDescriptor { - scalar_4d_shape, + TensorShape{ 1 }, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC }); @@ -554,7 +550,7 @@ class GraphEdsr NodeID id_mul_2_y = _graph.add_node<ConstNode>( TensorDescriptor { - scalar_4d_shape, + TensorShape{ 1 }, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC }); @@ -588,7 +584,7 @@ class GraphEdsr NodeID id_mul_1_y = _graph.add_node<ConstNode>( TensorDescriptor { - scalar_4d_shape, + TensorShape{ 1 }, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC }); @@ -622,7 +618,7 @@ class GraphEdsr NodeID id_mul_y = _graph.add_node<ConstNode>( TensorDescriptor { - scalar_4d_shape, + TensorShape{ 1 }, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC }); diff --git a/examples/graph_yolov3_output_detector.cpp b/examples/graph_yolov3_output_detector.cpp deleted file mode 100644 index 6278565aa3..0000000000 --- a/examples/graph_yolov3_output_detector.cpp +++ /dev/null @@ -1,626 +0,0 @@ -/* - * Copyright (c) 2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/graph.h" -#include "arm_compute/graph/Utils.h" - -#include "support/ToolchainSupport.h" -#include "utils/CommonGraphOptions.h" -#include "utils/GraphUtils.h" -#include "utils/Utils.h" - -using namespace arm_compute::graph; -using namespace arm_compute::utils; - -class GraphYoloV3OutputDetector -{ -public: - GraphYoloV3OutputDetector() - : _graph(0, "GraphYoloV3OutputDetector") - { - } - - bool setup(const CommonGraphParams &common_params, const SimpleOption<std::string> &expected_output_filename) - { - using namespace arm_compute; - using namespace graph_utils; - - const DataLayout data_layout = common_params.data_layout; - const std::string data_path = common_params.data_path; - const Target target = common_params.target; - - const DataLayoutDimension x_dim = (data_layout == DataLayout::NHWC) ? DataLayoutDimension::CHANNEL : DataLayoutDimension::WIDTH; - const DataLayoutDimension y_dim = (data_layout == DataLayout::NHWC) ?
DataLayoutDimension::WIDTH : DataLayoutDimension::HEIGHT; - - NodeID id_ConstantFolding_truediv_1_recip = _graph.add_node<ConstNode>( - TensorDescriptor - { - TensorShape{ 1, 1, 1 }, - DataType::F32, - QuantizationInfo(), - data_layout }); - INode *node_ConstantFolding_truediv_1_recip = _graph.node(id_ConstantFolding_truediv_1_recip); - node_ConstantFolding_truediv_1_recip->set_common_node_parameters(NodeParams{ "ConstantFolding_truediv_1_recip", target }); - node_ConstantFolding_truediv_1_recip->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/ConstantFolding_truediv_1_recip.npy", data_layout)); - - NodeID id_ConstantFolding_truediv_recip = _graph.add_node<ConstNode>( - TensorDescriptor - { - TensorShape{ 1, 1, 1 }, - DataType::F32, - QuantizationInfo(), - data_layout }); - INode *node_ConstantFolding_truediv_recip = _graph.node(id_ConstantFolding_truediv_recip); - node_ConstantFolding_truediv_recip->set_common_node_parameters(NodeParams{ "ConstantFolding_truediv_recip", target }); - node_ConstantFolding_truediv_recip->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/ConstantFolding_truediv_recip.npy", data_layout)); - - NodeID id_detector_yolo_v3_mul_6_y = _graph.add_node<ConstNode>( - TensorDescriptor - { - TensorShape{ 2 }, - DataType::F32, - QuantizationInfo(), - data_layout }); - INode *node_detector_yolo_v3_mul_6_y = _graph.node(id_detector_yolo_v3_mul_6_y); - node_detector_yolo_v3_mul_6_y->set_common_node_parameters(NodeParams{ "detector_yolo_v3_mul_6_y", target }); - node_detector_yolo_v3_mul_6_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/detector_yolo-v3_mul_6_y.npy", data_layout)); - - NodeID id_detector_yolo_v3_mul_3_y = _graph.add_node<ConstNode>( - TensorDescriptor - { - TensorShape{ 2 }, - DataType::F32, - QuantizationInfo(), - data_layout }); - INode *node_detector_yolo_v3_mul_3_y = _graph.node(id_detector_yolo_v3_mul_3_y); - node_detector_yolo_v3_mul_3_y->set_common_node_parameters(NodeParams{ "detector_yolo_v3_mul_3_y", target }); - node_detector_yolo_v3_mul_3_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/detector_yolo-v3_mul_3_y.npy", data_layout)); - - NodeID id_detector_yolo_v3_mul_y = _graph.add_node<ConstNode>( - TensorDescriptor - { - TensorShape{ 2 }, - DataType::F32, - QuantizationInfo(), - data_layout }); - INode *node_detector_yolo_v3_mul_y = _graph.node(id_detector_yolo_v3_mul_y); - node_detector_yolo_v3_mul_y->set_common_node_parameters(NodeParams{ "detector_yolo_v3_mul_y", target }); - node_detector_yolo_v3_mul_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/detector_yolo-v3_mul_y.npy", data_layout)); - - NodeID id_detector_yolo_v3_mul_7 = _graph.add_node<ConstNode>( - TensorDescriptor - { - TensorShape{ 2, 8112 }, - DataType::F32, - QuantizationInfo(), - data_layout }); - INode *node_detector_yolo_v3_mul_7 = _graph.node(id_detector_yolo_v3_mul_7); - node_detector_yolo_v3_mul_7->set_common_node_parameters(NodeParams{ "detector_yolo_v3_mul_7", target }); - node_detector_yolo_v3_mul_7->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/detector_yolo-v3_mul_7.npy", data_layout)); - - NodeID id_detector_yolo_v3_Reshape_11 = _graph.add_node<ConstNode>( - TensorDescriptor - { - TensorShape{ 2, 8112 }, - DataType::F32, - QuantizationInfo(), - data_layout }); - INode *node_detector_yolo_v3_Reshape_11 = _graph.node(id_detector_yolo_v3_Reshape_11); - node_detector_yolo_v3_Reshape_11->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Reshape_11", target }); - node_detector_yolo_v3_Reshape_11->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/detector_yolo-v3_Reshape_11.npy", data_layout)); -
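Every node in this (now deleted) example follows the same four-step construction pattern. A minimal compilable sketch of that pattern, assuming the Arm Compute Library graph headers are available (graph and node names here are hypothetical, not taken from the example):

    #include "arm_compute/graph.h"

    using namespace arm_compute;
    using namespace arm_compute::graph;

    int main()
    {
        Graph g(0, "pattern_sketch");
        const Target target = Target::NEON;

        // 1) Create a typed node and get its NodeID.
        NodeID id_in = g.add_node<ConstNode>(
            TensorDescriptor{ TensorShape{ 2 }, DataType::F32, QuantizationInfo(), DataLayout::NHWC });
        // 2) Fetch the INode to set its common parameters (name, target).
        g.node(id_in)->set_common_node_parameters(NodeParams{ "in", target });

        NodeID id_sig = g.add_node<ActivationLayerNode>(
            ActivationLayerInfo{ ActivationLayerInfo::ActivationFunction::LOGISTIC, 0, 0 });
        g.node(id_sig)->set_common_node_parameters(NodeParams{ "sigmoid", target });

        // 3) Wire producer to consumer: (source node, output idx, sink node, input idx).
        g.add_connection(id_in, 0, id_sig, 0);
        // 4) Accessors (e.g. .npy readers) would be attached to node outputs here.
        return 0;
    }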
- NodeID id_detector_yolo_v3_mul_4 = _graph.add_node<ConstNode>( - TensorDescriptor - { - TensorShape{ 2, 2028 }, - DataType::F32, - QuantizationInfo(), - data_layout }); - INode *node_detector_yolo_v3_mul_4 = _graph.node(id_detector_yolo_v3_mul_4); - node_detector_yolo_v3_mul_4->set_common_node_parameters(NodeParams{ "detector_yolo_v3_mul_4", target }); - node_detector_yolo_v3_mul_4->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/detector_yolo-v3_mul_4.npy", data_layout)); - - NodeID id_detector_yolo_v3_Reshape_7 = _graph.add_node<ConstNode>( - TensorDescriptor - { - TensorShape{ 2, 2028 }, - DataType::F32, - QuantizationInfo(), - data_layout }); - INode *node_detector_yolo_v3_Reshape_7 = _graph.node(id_detector_yolo_v3_Reshape_7); - node_detector_yolo_v3_Reshape_7->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Reshape_7", target }); - node_detector_yolo_v3_Reshape_7->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/detector_yolo-v3_Reshape_7.npy", data_layout)); - - NodeID id_detector_yolo_v3_mul_1 = _graph.add_node<ConstNode>( - TensorDescriptor - { - TensorShape{ 2, 507 }, - DataType::F32, - QuantizationInfo(), - data_layout }); - INode *node_detector_yolo_v3_mul_1 = _graph.node(id_detector_yolo_v3_mul_1); - node_detector_yolo_v3_mul_1->set_common_node_parameters(NodeParams{ "detector_yolo_v3_mul_1", target }); - node_detector_yolo_v3_mul_1->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/detector_yolo-v3_mul_1.npy", data_layout)); - - NodeID id_detector_yolo_v3_Reshape_3 = _graph.add_node<ConstNode>( - TensorDescriptor - { - TensorShape{ 2, 507 }, - DataType::F32, - QuantizationInfo(), - data_layout }); - INode *node_detector_yolo_v3_Reshape_3 = _graph.node(id_detector_yolo_v3_Reshape_3); - node_detector_yolo_v3_Reshape_3->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Reshape_3", target }); - node_detector_yolo_v3_Reshape_3->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/detector_yolo-v3_Reshape_3.npy", data_layout)); - - NodeID id_input_to_detector_3 = _graph.add_node<ConstNode>( - TensorDescriptor - { - TensorShape{ 255, 52, 52, 1 }, - DataType::F32, - QuantizationInfo(), - data_layout }); - INode *node_input_to_detector_3 = _graph.node(id_input_to_detector_3); - node_input_to_detector_3->set_common_node_parameters(NodeParams{ "input_to_detector_3", target }); - node_input_to_detector_3->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/input_to_detector_3.npy", data_layout)); - - NodeID id_detector_yolo_v3_Reshape_10 = _graph.add_node<ReshapeLayerNode>( - TensorShape{ 85, 8112 }); - INode *node_detector_yolo_v3_Reshape_10 = _graph.node(id_detector_yolo_v3_Reshape_10); - node_detector_yolo_v3_Reshape_10->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Reshape_10", target }); - _graph.add_connection(id_input_to_detector_3, 0, id_detector_yolo_v3_Reshape_10, 0); - - NodeID id_detector_yolo_v3_split_2 = _graph.add_node<SplitLayerNode>( - 4, - 0, - std::vector<int> { 2, 2, 1, 80 }); - INode *node_detector_yolo_v3_split_2 = _graph.node(id_detector_yolo_v3_split_2); - node_detector_yolo_v3_split_2->set_common_node_parameters(NodeParams{
"detector_yolo_v3_split_2", target }); - _graph.add_connection(id_detector_yolo_v3_Reshape_10, 0, id_detector_yolo_v3_split_2, 0); - - NodeID id_detector_yolo_v3_Sigmoid_6 = _graph.add_node( - ActivationLayerInfo{ ActivationLayerInfo::ActivationFunction::LOGISTIC, 0, 0 }); - INode *node_detector_yolo_v3_Sigmoid_6 = _graph.node(id_detector_yolo_v3_Sigmoid_6); - node_detector_yolo_v3_Sigmoid_6->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Sigmoid_6", target }); - _graph.add_connection(id_detector_yolo_v3_split_2, 0, id_detector_yolo_v3_Sigmoid_6, 0); - - NodeID id_detector_yolo_v3_add_2 = _graph.add_node( - descriptors::EltwiseLayerDescriptor - { - EltwiseOperation::Add, - QuantizationInfo() }); - INode *node_detector_yolo_v3_add_2 = _graph.node(id_detector_yolo_v3_add_2); - node_detector_yolo_v3_add_2->set_common_node_parameters(NodeParams{ "detector_yolo_v3_add_2", target }); - _graph.add_connection(id_detector_yolo_v3_Sigmoid_6, 0, id_detector_yolo_v3_add_2, 0); - _graph.add_connection(id_detector_yolo_v3_Reshape_11, 0, id_detector_yolo_v3_add_2, 1); - - NodeID id_detector_yolo_v3_mul_6 = _graph.add_node( - descriptors::EltwiseLayerDescriptor - { - EltwiseOperation::Mul, - QuantizationInfo() }); - INode *node_detector_yolo_v3_mul_6 = _graph.node(id_detector_yolo_v3_mul_6); - node_detector_yolo_v3_mul_6->set_common_node_parameters(NodeParams{ "detector_yolo_v3_mul_6", target }); - _graph.add_connection(id_detector_yolo_v3_add_2, 0, id_detector_yolo_v3_mul_6, 0); - _graph.add_connection(id_detector_yolo_v3_mul_6_y, 0, id_detector_yolo_v3_mul_6, 1); - - NodeID id_detector_yolo_v3_Sigmoid_7 = _graph.add_node( - ActivationLayerInfo{ ActivationLayerInfo::ActivationFunction::LOGISTIC, 0, 0 }); - INode *node_detector_yolo_v3_Sigmoid_7 = _graph.node(id_detector_yolo_v3_Sigmoid_7); - node_detector_yolo_v3_Sigmoid_7->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Sigmoid_7", target }); - _graph.add_connection(id_detector_yolo_v3_split_2, 2, id_detector_yolo_v3_Sigmoid_7, 0); - - NodeID id_detector_yolo_v3_Exp_2 = _graph.add_node( - descriptors::UnaryEltwiseLayerDescriptor - { - UnaryEltwiseOperation::Exp, - QuantizationInfo() }); - INode *node_detector_yolo_v3_Exp_2 = _graph.node(id_detector_yolo_v3_Exp_2); - node_detector_yolo_v3_Exp_2->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Exp_2", target }); - _graph.add_connection(id_detector_yolo_v3_split_2, 1, id_detector_yolo_v3_Exp_2, 0); - - NodeID id_detector_yolo_v3_mul_8 = _graph.add_node( - descriptors::EltwiseLayerDescriptor - { - EltwiseOperation::Mul, - QuantizationInfo() }); - INode *node_detector_yolo_v3_mul_8 = _graph.node(id_detector_yolo_v3_mul_8); - node_detector_yolo_v3_mul_8->set_common_node_parameters(NodeParams{ "detector_yolo_v3_mul_8", target }); - _graph.add_connection(id_detector_yolo_v3_Exp_2, 0, id_detector_yolo_v3_mul_8, 0); - _graph.add_connection(id_detector_yolo_v3_mul_7, 0, id_detector_yolo_v3_mul_8, 1); - - NodeID id_detector_yolo_v3_Sigmoid_8 = _graph.add_node( - ActivationLayerInfo{ ActivationLayerInfo::ActivationFunction::LOGISTIC, 0, 0 }); - INode *node_detector_yolo_v3_Sigmoid_8 = _graph.node(id_detector_yolo_v3_Sigmoid_8); - node_detector_yolo_v3_Sigmoid_8->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Sigmoid_8", target }); - _graph.add_connection(id_detector_yolo_v3_split_2, 3, id_detector_yolo_v3_Sigmoid_8, 0); - - NodeID id_detector_yolo_v3_concat_8 = _graph.add_node( - 4, - descriptors::ConcatLayerDescriptor{ x_dim }); - INode *node_detector_yolo_v3_concat_8 = 
_graph.node(id_detector_yolo_v3_concat_8); - node_detector_yolo_v3_concat_8->set_common_node_parameters(NodeParams{ "detector_yolo_v3_concat_8", target }); - _graph.add_connection(id_detector_yolo_v3_mul_6, 0, id_detector_yolo_v3_concat_8, 0); - _graph.add_connection(id_detector_yolo_v3_mul_8, 0, id_detector_yolo_v3_concat_8, 1); - _graph.add_connection(id_detector_yolo_v3_Sigmoid_7, 0, id_detector_yolo_v3_concat_8, 2); - _graph.add_connection(id_detector_yolo_v3_Sigmoid_8, 0, id_detector_yolo_v3_concat_8, 3); - - NodeID id_input_to_detector_2 = _graph.add_node<ConstNode>( - TensorDescriptor - { - TensorShape{ 255, 26, 26, 1 }, - DataType::F32, - QuantizationInfo(), - data_layout }); - INode *node_input_to_detector_2 = _graph.node(id_input_to_detector_2); - node_input_to_detector_2->set_common_node_parameters(NodeParams{ "input_to_detector_2", target }); - node_input_to_detector_2->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/input_to_detector_2.npy", data_layout)); - - NodeID id_detector_yolo_v3_Reshape_6 = _graph.add_node<ReshapeLayerNode>( - TensorShape{ 85, 2028 }); - INode *node_detector_yolo_v3_Reshape_6 = _graph.node(id_detector_yolo_v3_Reshape_6); - node_detector_yolo_v3_Reshape_6->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Reshape_6", target }); - _graph.add_connection(id_input_to_detector_2, 0, id_detector_yolo_v3_Reshape_6, 0); - - NodeID id_detector_yolo_v3_split_1 = _graph.add_node<SplitLayerNode>( - 4, - 0, - std::vector<int> { 2, 2, 1, 80 }); - INode *node_detector_yolo_v3_split_1 = _graph.node(id_detector_yolo_v3_split_1); - node_detector_yolo_v3_split_1->set_common_node_parameters(NodeParams{ "detector_yolo_v3_split_1", target }); - _graph.add_connection(id_detector_yolo_v3_Reshape_6, 0, id_detector_yolo_v3_split_1, 0); - - NodeID id_detector_yolo_v3_Sigmoid_3 = _graph.add_node<ActivationLayerNode>( - ActivationLayerInfo{ ActivationLayerInfo::ActivationFunction::LOGISTIC, 0, 0 }); - INode *node_detector_yolo_v3_Sigmoid_3 = _graph.node(id_detector_yolo_v3_Sigmoid_3); - node_detector_yolo_v3_Sigmoid_3->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Sigmoid_3", target }); - _graph.add_connection(id_detector_yolo_v3_split_1, 0, id_detector_yolo_v3_Sigmoid_3, 0); - - NodeID id_detector_yolo_v3_add_1 = _graph.add_node<EltwiseLayerNode>( - descriptors::EltwiseLayerDescriptor - { - EltwiseOperation::Add, - QuantizationInfo() }); - INode *node_detector_yolo_v3_add_1 = _graph.node(id_detector_yolo_v3_add_1); - node_detector_yolo_v3_add_1->set_common_node_parameters(NodeParams{ "detector_yolo_v3_add_1", target }); - _graph.add_connection(id_detector_yolo_v3_Sigmoid_3, 0, id_detector_yolo_v3_add_1, 0); - _graph.add_connection(id_detector_yolo_v3_Reshape_7, 0, id_detector_yolo_v3_add_1, 1); - - NodeID id_detector_yolo_v3_mul_3 = _graph.add_node<EltwiseLayerNode>( - descriptors::EltwiseLayerDescriptor - { - EltwiseOperation::Mul, - QuantizationInfo() }); - INode *node_detector_yolo_v3_mul_3 = _graph.node(id_detector_yolo_v3_mul_3); - node_detector_yolo_v3_mul_3->set_common_node_parameters(NodeParams{ "detector_yolo_v3_mul_3", target }); - _graph.add_connection(id_detector_yolo_v3_add_1, 0, id_detector_yolo_v3_mul_3, 0); - _graph.add_connection(id_detector_yolo_v3_mul_3_y, 0, id_detector_yolo_v3_mul_3, 1); - - NodeID id_detector_yolo_v3_Sigmoid_4 = _graph.add_node<ActivationLayerNode>( - ActivationLayerInfo{ ActivationLayerInfo::ActivationFunction::LOGISTIC, 0, 0 }); - INode *node_detector_yolo_v3_Sigmoid_4 = _graph.node(id_detector_yolo_v3_Sigmoid_4); - node_detector_yolo_v3_Sigmoid_4->set_common_node_parameters(NodeParams{
"detector_yolo_v3_Sigmoid_4", target }); - _graph.add_connection(id_detector_yolo_v3_split_1, 2, id_detector_yolo_v3_Sigmoid_4, 0); - - NodeID id_detector_yolo_v3_Exp_1 = _graph.add_node( - descriptors::UnaryEltwiseLayerDescriptor - { - UnaryEltwiseOperation::Exp, - QuantizationInfo() }); - INode *node_detector_yolo_v3_Exp_1 = _graph.node(id_detector_yolo_v3_Exp_1); - node_detector_yolo_v3_Exp_1->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Exp_1", target }); - _graph.add_connection(id_detector_yolo_v3_split_1, 1, id_detector_yolo_v3_Exp_1, 0); - - NodeID id_detector_yolo_v3_mul_5 = _graph.add_node( - descriptors::EltwiseLayerDescriptor - { - EltwiseOperation::Mul, - QuantizationInfo() }); - INode *node_detector_yolo_v3_mul_5 = _graph.node(id_detector_yolo_v3_mul_5); - node_detector_yolo_v3_mul_5->set_common_node_parameters(NodeParams{ "detector_yolo_v3_mul_5", target }); - _graph.add_connection(id_detector_yolo_v3_Exp_1, 0, id_detector_yolo_v3_mul_5, 0); - _graph.add_connection(id_detector_yolo_v3_mul_4, 0, id_detector_yolo_v3_mul_5, 1); - - NodeID id_detector_yolo_v3_Sigmoid_5 = _graph.add_node( - ActivationLayerInfo{ ActivationLayerInfo::ActivationFunction::LOGISTIC, 0, 0 }); - INode *node_detector_yolo_v3_Sigmoid_5 = _graph.node(id_detector_yolo_v3_Sigmoid_5); - node_detector_yolo_v3_Sigmoid_5->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Sigmoid_5", target }); - _graph.add_connection(id_detector_yolo_v3_split_1, 3, id_detector_yolo_v3_Sigmoid_5, 0); - - NodeID id_detector_yolo_v3_concat_5 = _graph.add_node( - 4, - descriptors::ConcatLayerDescriptor{ x_dim }); - INode *node_detector_yolo_v3_concat_5 = _graph.node(id_detector_yolo_v3_concat_5); - node_detector_yolo_v3_concat_5->set_common_node_parameters(NodeParams{ "detector_yolo_v3_concat_5", target }); - _graph.add_connection(id_detector_yolo_v3_mul_3, 0, id_detector_yolo_v3_concat_5, 0); - _graph.add_connection(id_detector_yolo_v3_mul_5, 0, id_detector_yolo_v3_concat_5, 1); - _graph.add_connection(id_detector_yolo_v3_Sigmoid_4, 0, id_detector_yolo_v3_concat_5, 2); - _graph.add_connection(id_detector_yolo_v3_Sigmoid_5, 0, id_detector_yolo_v3_concat_5, 3); - - NodeID id_input_to_detector_1 = _graph.add_node( - TensorDescriptor - { - TensorShape{ 255, 13, 13, 1 }, - DataType::F32, - QuantizationInfo(), - data_layout }); - INode *node_input_to_detector_1 = _graph.node(id_input_to_detector_1); - node_input_to_detector_1->set_common_node_parameters(NodeParams{ "input_to_detector_1", target }); - node_input_to_detector_1->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/input_to_detector_1.npy", data_layout)); - - NodeID id_detector_yolo_v3_Reshape_2 = _graph.add_node( - TensorShape{ 85, 507 }); - INode *node_detector_yolo_v3_Reshape_2 = _graph.node(id_detector_yolo_v3_Reshape_2); - node_detector_yolo_v3_Reshape_2->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Reshape_2", target }); - _graph.add_connection(id_input_to_detector_1, 0, id_detector_yolo_v3_Reshape_2, 0); - - NodeID id_detector_yolo_v3_split = _graph.add_node( - 4, - 0, - std::vector { 2, 2, 1, 80 }); - INode *node_detector_yolo_v3_split = _graph.node(id_detector_yolo_v3_split); - node_detector_yolo_v3_split->set_common_node_parameters(NodeParams{ "detector_yolo_v3_split", target }); - _graph.add_connection(id_detector_yolo_v3_Reshape_2, 0, id_detector_yolo_v3_split, 0); - - NodeID id_detector_yolo_v3_Sigmoid = _graph.add_node( - ActivationLayerInfo{ ActivationLayerInfo::ActivationFunction::LOGISTIC, 0, 
0 }); - INode *node_detector_yolo_v3_Sigmoid = _graph.node(id_detector_yolo_v3_Sigmoid); - node_detector_yolo_v3_Sigmoid->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Sigmoid", target }); - _graph.add_connection(id_detector_yolo_v3_split, 0, id_detector_yolo_v3_Sigmoid, 0); - - NodeID id_detector_yolo_v3_add = _graph.add_node<EltwiseLayerNode>( - descriptors::EltwiseLayerDescriptor - { - EltwiseOperation::Add, - QuantizationInfo() }); - INode *node_detector_yolo_v3_add = _graph.node(id_detector_yolo_v3_add); - node_detector_yolo_v3_add->set_common_node_parameters(NodeParams{ "detector_yolo_v3_add", target }); - _graph.add_connection(id_detector_yolo_v3_Sigmoid, 0, id_detector_yolo_v3_add, 0); - _graph.add_connection(id_detector_yolo_v3_Reshape_3, 0, id_detector_yolo_v3_add, 1); - - NodeID id_detector_yolo_v3_mul = _graph.add_node<EltwiseLayerNode>( - descriptors::EltwiseLayerDescriptor - { - EltwiseOperation::Mul, - QuantizationInfo() }); - INode *node_detector_yolo_v3_mul = _graph.node(id_detector_yolo_v3_mul); - node_detector_yolo_v3_mul->set_common_node_parameters(NodeParams{ "detector_yolo_v3_mul", target }); - _graph.add_connection(id_detector_yolo_v3_add, 0, id_detector_yolo_v3_mul, 0); - _graph.add_connection(id_detector_yolo_v3_mul_y, 0, id_detector_yolo_v3_mul, 1); - - NodeID id_detector_yolo_v3_Sigmoid_1 = _graph.add_node<ActivationLayerNode>( - ActivationLayerInfo{ ActivationLayerInfo::ActivationFunction::LOGISTIC, 0, 0 }); - INode *node_detector_yolo_v3_Sigmoid_1 = _graph.node(id_detector_yolo_v3_Sigmoid_1); - node_detector_yolo_v3_Sigmoid_1->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Sigmoid_1", target }); - _graph.add_connection(id_detector_yolo_v3_split, 2, id_detector_yolo_v3_Sigmoid_1, 0); - - NodeID id_detector_yolo_v3_Exp = _graph.add_node<UnaryEltwiseLayerNode>( - descriptors::UnaryEltwiseLayerDescriptor - { - UnaryEltwiseOperation::Exp, - QuantizationInfo() }); - INode *node_detector_yolo_v3_Exp = _graph.node(id_detector_yolo_v3_Exp); - node_detector_yolo_v3_Exp->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Exp", target }); - _graph.add_connection(id_detector_yolo_v3_split, 1, id_detector_yolo_v3_Exp, 0); - - NodeID id_detector_yolo_v3_mul_2 = _graph.add_node<EltwiseLayerNode>( - descriptors::EltwiseLayerDescriptor - { - EltwiseOperation::Mul, - QuantizationInfo() }); - INode *node_detector_yolo_v3_mul_2 = _graph.node(id_detector_yolo_v3_mul_2); - node_detector_yolo_v3_mul_2->set_common_node_parameters(NodeParams{ "detector_yolo_v3_mul_2", target }); - _graph.add_connection(id_detector_yolo_v3_Exp, 0, id_detector_yolo_v3_mul_2, 0); - _graph.add_connection(id_detector_yolo_v3_mul_1, 0, id_detector_yolo_v3_mul_2, 1); - - NodeID id_detector_yolo_v3_Sigmoid_2 = _graph.add_node<ActivationLayerNode>( - ActivationLayerInfo{ ActivationLayerInfo::ActivationFunction::LOGISTIC, 0, 0 }); - INode *node_detector_yolo_v3_Sigmoid_2 = _graph.node(id_detector_yolo_v3_Sigmoid_2); - node_detector_yolo_v3_Sigmoid_2->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Sigmoid_2", target }); - _graph.add_connection(id_detector_yolo_v3_split, 3, id_detector_yolo_v3_Sigmoid_2, 0); - - NodeID id_detector_yolo_v3_concat_2 = _graph.add_node<ConcatenateLayerNode>( - 4, - descriptors::ConcatLayerDescriptor{ x_dim }); - INode *node_detector_yolo_v3_concat_2 = _graph.node(id_detector_yolo_v3_concat_2); - node_detector_yolo_v3_concat_2->set_common_node_parameters(NodeParams{ "detector_yolo_v3_concat_2", target }); - _graph.add_connection(id_detector_yolo_v3_mul, 0, id_detector_yolo_v3_concat_2, 0); - _graph.add_connection(id_detector_yolo_v3_mul_2, 0, id_detector_yolo_v3_concat_2, 1); - _graph.add_connection(id_detector_yolo_v3_Sigmoid_1, 0, id_detector_yolo_v3_concat_2, 2); - _graph.add_connection(id_detector_yolo_v3_Sigmoid_2, 0, id_detector_yolo_v3_concat_2, 3); -
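The Sigmoid/Add/Mul and Exp/Mul node chains built above implement the standard YOLO box decode: centre coordinates are sigmoid(t) plus a grid offset, scaled by a per-detector constant, widths/heights are exp(t) times an anchor prior, and objectness/class scores pass through a plain sigmoid. A scalar sketch of that arithmetic; the values below are illustrative and not taken from the model's .npy constants:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const float t_x = 0.2f, t_w = -0.5f; // raw network outputs for one box
        const float grid_x = 3.f;            // grid-cell offset (Reshape_* constant)
        const float cell = 32.f;             // per-detector scale (mul_*_y constant)
        const float anchor_w = 116.f;        // anchor prior (mul_7/mul_4/mul_1 constant)

        // Centre: sigmoid, shift into the right cell, scale to pixels.
        const float bx = (1.f / (1.f + std::exp(-t_x)) + grid_x) * cell;
        // Size: exponential, scaled by the anchor prior.
        const float bw = std::exp(t_w) * anchor_w;

        std::printf("bx = %.2f px, bw = %.2f px\n", bx, bw);
        return 0;
    }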
- NodeID id_detector_yolo_v3_concat_9 = _graph.add_node<ConcatenateLayerNode>( - 3, - descriptors::ConcatLayerDescriptor{ y_dim }); - INode *node_detector_yolo_v3_concat_9 = _graph.node(id_detector_yolo_v3_concat_9); - node_detector_yolo_v3_concat_9->set_common_node_parameters(NodeParams{ "detector_yolo_v3_concat_9", target }); - _graph.add_connection(id_detector_yolo_v3_concat_2, 0, id_detector_yolo_v3_concat_9, 0); - _graph.add_connection(id_detector_yolo_v3_concat_5, 0, id_detector_yolo_v3_concat_9, 1); - _graph.add_connection(id_detector_yolo_v3_concat_8, 0, id_detector_yolo_v3_concat_9, 2); - - NodeID id_split = _graph.add_node<SplitLayerNode>( - 5, - 0, - std::vector<int> { 1, 1, 1, 1, -1 }); - INode *node_split = _graph.node(id_split); - node_split->set_common_node_parameters(NodeParams{ "split", target }); - _graph.add_connection(id_detector_yolo_v3_concat_9, 0, id_split, 0); - - NodeID id_truediv = _graph.add_node<EltwiseLayerNode>( - descriptors::EltwiseLayerDescriptor - { - EltwiseOperation::Mul, - QuantizationInfo() }); - INode *node_truediv = _graph.node(id_truediv); - node_truediv->set_common_node_parameters(NodeParams{ "truediv", target }); - _graph.add_connection(id_split, 2, id_truediv, 0); - _graph.add_connection(id_ConstantFolding_truediv_recip, 0, id_truediv, 1); - - NodeID id_sub = _graph.add_node<EltwiseLayerNode>( - descriptors::EltwiseLayerDescriptor - { - EltwiseOperation::Sub, - QuantizationInfo() }); - INode *node_sub = _graph.node(id_sub); - node_sub->set_common_node_parameters(NodeParams{ "sub", target }); - _graph.add_connection(id_split, 0, id_sub, 0); - _graph.add_connection(id_truediv, 0, id_sub, 1); - - NodeID id_add = _graph.add_node<EltwiseLayerNode>( - descriptors::EltwiseLayerDescriptor - { - EltwiseOperation::Add, - QuantizationInfo() }); - INode *node_add = _graph.node(id_add); - node_add->set_common_node_parameters(NodeParams{ "add", target }); - _graph.add_connection(id_split, 0, id_add, 0); - _graph.add_connection(id_truediv, 0, id_add, 1); - - NodeID id_truediv_1 = _graph.add_node<EltwiseLayerNode>( - descriptors::EltwiseLayerDescriptor - { - EltwiseOperation::Mul, - QuantizationInfo() }); - INode *node_truediv_1 = _graph.node(id_truediv_1); - node_truediv_1->set_common_node_parameters(NodeParams{ "truediv_1", target }); - _graph.add_connection(id_split, 3, id_truediv_1, 0); - _graph.add_connection(id_ConstantFolding_truediv_1_recip, 0, id_truediv_1, 1); - - NodeID id_sub_1 = _graph.add_node<EltwiseLayerNode>( - descriptors::EltwiseLayerDescriptor - { - EltwiseOperation::Sub, - QuantizationInfo() }); - INode *node_sub_1 = _graph.node(id_sub_1); - node_sub_1->set_common_node_parameters(NodeParams{ "sub_1", target }); - _graph.add_connection(id_split, 1, id_sub_1, 0); - _graph.add_connection(id_truediv_1, 0, id_sub_1, 1); - - NodeID id_add_1 = _graph.add_node<EltwiseLayerNode>( - descriptors::EltwiseLayerDescriptor - { - EltwiseOperation::Add, - QuantizationInfo() }); - INode *node_add_1 = _graph.node(id_add_1); - node_add_1->set_common_node_parameters(NodeParams{ "add_1", target }); - _graph.add_connection(id_split, 1, id_add_1, 0); - _graph.add_connection(id_truediv_1, 0, id_add_1, 1); - - NodeID id_output_boxes = _graph.add_node<ConcatenateLayerNode>( - 5, - descriptors::ConcatLayerDescriptor{ x_dim }); - INode *node_output_boxes = _graph.node(id_output_boxes); - node_output_boxes->set_common_node_parameters(NodeParams{ "output_boxes", target }); - _graph.add_connection(id_sub, 0, id_output_boxes, 0); - _graph.add_connection(id_sub_1,
0, id_output_boxes, 1); - _graph.add_connection(id_add, 0, id_output_boxes, 2); - _graph.add_connection(id_add_1, 0, id_output_boxes, 3); - _graph.add_connection(id_split, 4, id_output_boxes, 4); - - NodeID id_output_140640247016360 = _graph.add_node<OutputNode>(); - INode *node_output_140640247016360 = _graph.node(id_output_140640247016360); - node_output_140640247016360->set_common_node_parameters(NodeParams{ "output_140640247016360", target }); - _graph.add_connection(id_output_boxes, 0, id_output_140640247016360, 0); - node_output_140640247016360->input(0)->set_accessor(get_npy_output_accessor(expected_output_filename.value(), TensorShape(85U, 10647U), DataType::F32, data_layout)); - - return true; - } - - Graph &graph() - { - return _graph; - } - -private: - Graph _graph; -}; -class GraphYoloV3OutputDetectorExample : public Example -{ -public: - GraphYoloV3OutputDetectorExample() - : cmd_parser(), common_opts(cmd_parser), common_params() - { - expected_output_filename = cmd_parser.add_option<SimpleOption<std::string>>("expected-output-filename", ""); - expected_output_filename->set_help("Name of npy file containing the expected output to validate the graph output."); - } - GraphYoloV3OutputDetectorExample(const GraphYoloV3OutputDetectorExample &) = delete; - GraphYoloV3OutputDetectorExample &operator=(const GraphYoloV3OutputDetectorExample &) = delete; - - bool do_setup(int argc, char **argv) override - { - // Parse arguments - cmd_parser.parse(argc, argv); - cmd_parser.validate(); - - // Consume common parameters - common_params = consume_common_graph_parameters(common_opts); - - // Return when help menu is requested - if(common_params.help) - { - cmd_parser.print_help(argv[0]); - return false; - } - - // Print parameter values - std::cout << common_params << std::endl; - - model.setup(common_params, *expected_output_filename); - - GraphConfig config; - config.num_threads = common_params.threads; - config.use_tuner = common_params.enable_tuner; - config.tuner_mode = common_params.tuner_mode; - config.tuner_file = common_params.tuner_file; - - context.set_config(config); - - auto pass_manager = create_default_pass_manager(common_params.target, config); - manager.finalize_graph(model.graph(), context, pass_manager, common_params.target); - - return true; - } - - void do_run() override - { - manager.execute_graph(model.graph()); - } - -private: - CommandLineParser cmd_parser; - CommonGraphOptions common_opts; - CommonGraphParams common_params; - - GraphContext context{}; - GraphManager manager{}; - - GraphYoloV3OutputDetector model{}; - - SimpleOption<std::string> *expected_output_filename{ nullptr }; -}; - -int main(int argc, char **argv) -{ - return run_example<GraphYoloV3OutputDetectorExample>(argc, argv); -} diff --git a/examples/neon_cartoon_effect.cpp b/examples/neon_cartoon_effect.cpp index dd33885add..24a689bee9 100644 --- a/examples/neon_cartoon_effect.cpp +++ b/examples/neon_cartoon_effect.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited.
* * SPDX-License-Identifier: MIT * diff --git a/examples/neon_scale.cpp b/examples/neon_scale.cpp index ac9d0620ea..f120ea7f96 100644 --- a/examples/neon_scale.cpp +++ b/examples/neon_scale.cpp @@ -60,7 +60,13 @@ class NEONScaleExample : public Example dst.allocator()->init(dst_tensor_info); // Configure Scale function object: - scale.configure(&src, &dst, ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED }); + scale.configure(&src, &dst, ScaleKernelInfo{ + InterpolationPolicy::NEAREST_NEIGHBOR, + BorderMode::UNDEFINED, + PixelValue(), + SamplingPolicy::CENTER, + false + }); // Allocate all the images src.allocator()->allocate(); diff --git a/examples/neoncl_scale_median_gaussian.cpp b/examples/neoncl_scale_median_gaussian.cpp index df0eb9620f..948aff23bb 100644 --- a/examples/neoncl_scale_median_gaussian.cpp +++ b/examples/neoncl_scale_median_gaussian.cpp @@ -26,8 +26,9 @@ #endif /* ARM_COMPUTE_CL */ #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/CL/CLFunctions.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" +#include "arm_compute/runtime/CL/functions/CLScale.h" #include "arm_compute/runtime/NEON/NEFunctions.h" #include "utils/ImageLoader.h" #include "utils/Utils.h" diff --git a/scripts/arm_compute_library_nn_driver.go b/scripts/arm_compute_library_nn_driver.go index 553503f8bf..9413edf843 100644 --- a/scripts/arm_compute_library_nn_driver.go +++ b/scripts/arm_compute_library_nn_driver.go @@ -8,6 +8,7 @@ package arm_compute_library_nn_driver import ( "android/soong/android" "android/soong/cc" + "strings" ) func globalFlags(ctx android.BaseContext) []string { @@ -21,6 +22,29 @@ func globalFlags(ctx android.BaseContext) []string { cppflags = append(cppflags, "-fno-addrsig") } + data_types := strings.Split(ctx.AConfig().GetenvWithDefault("COMPUTE_LIB_DATA_TYPE", "ALL"), ",") + + for _, x := range data_types { + if strings.ToUpper(x) == "ALL" || strings.ToUpper(x) == "QASYMM8" { + cppflags = append(cppflags, "-DENABLE_QASYMM8_KERNELS") + } + if strings.ToUpper(x) == "ALL" || strings.ToUpper(x) == "QASYMM8_SIGNED" { + cppflags = append(cppflags, "-DENABLE_QASYMM8_SIGNED_KERNELS") + } + if strings.ToUpper(x) == "ALL" || strings.ToUpper(x) == "QASYMM16" { + cppflags = append(cppflags, "-DENABLE_QASYMM16_KERNELS") + } + if strings.ToUpper(x) == "ALL" || strings.ToUpper(x) == "QSYMM16" { + cppflags = append(cppflags, "-DENABLE_QSYMM16_KERNELS") + } + if strings.ToUpper(x) == "ALL" || strings.ToUpper(x) == "FP16" { + cppflags = append(cppflags, "-DENABLE_FP16_KERNELS") + } + if strings.ToUpper(x) == "ALL" || strings.ToUpper(x) == "FP32" { + cppflags = append(cppflags, "-DENABLE_FP32_KERNELS") + } + } + return cppflags } diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py index 5e13aa04b4..ce467f8f55 100755 --- a/scripts/clang_tidy_rules.py +++ b/scripts/clang_tidy_rules.py @@ -111,11 +111,15 @@ def filter_clang_tidy_lines( lines ): ("NEWinogradLayerKernel.cpp" in line and "use '= default' to define a trivial destructor" in line) or ("NEGEMMLowpMatrixMultiplyCore.cpp" in line and "constructor does not initialize these fields" in line) or ("NEGEMMLowpAssemblyMatrixMultiplyCore" in line and "constructor does not initialize these fields" in line) or + ("NEDepthwiseConvolutionLayerNativeKernel" in line and re.search(r"parameter '[^']+' is unused", line)) or + ("NEDepthwiseConvolutionAssemblyDispatch" in line and re.search(r"parameter '[^']+' is unused", line)) or 
("CPUUtils.cpp" in line and "consider replacing 'unsigned long' with 'uint64'" in line) or ("CPUUtils.cpp" in line and "parameter 'cpusv' is unused" in line) or ("CPUUtils.cpp" in line and "warning: uninitialized record type" in line) or ("GCKernelLibrary.cpp" in line and "warning: do not declare C-style arrays" in line) or ("Utils.h" in line and "warning: Use of zero-allocated memory" in line) or + ("NEDepthwiseConvolutionLayerNativeKernel.cpp" in line and "misc-non-private-member-variables-in-classes" in line) or # This is to prevent false positive, should be reassessed with the newer clang-tidy + ("NEDepthwiseConvolutionLayerNativeKernel.cpp" in line and "cppcoreguidelines-pro-type-member-init" in line) or # This is to prevent false positive, should be reassessed with the newer clang-tidy "3rdparty" in line): print_context=False continue diff --git a/scripts/include_functions_kernels.py b/scripts/include_functions_kernels.py index 074f7949b8..4db47ea754 100755 --- a/scripts/include_functions_kernels.py +++ b/scripts/include_functions_kernels.py @@ -3,16 +3,30 @@ import collections import os -Target = collections.namedtuple('Target', 'name prefix') - -targets = [Target("NEON", "NE"), Target("CL", "CL"), Target("CPP", "CPP"), Target("GLES_COMPUTE", "GC")] - armcv_path = "arm_compute" -core_path = armcv_path + "/core/" -runtime_path = armcv_path + "/runtime/" +src_path ="src" + +Target = collections.namedtuple('Target', 'name prefix basepath') + +core_targets = [ + Target("NEON", "NE", src_path), # NEON kernels are under src + Target("CL", "CL", src_path), # CL kernels are under src + Target("CPP", "CPP", armcv_path), # CPP kernels are under arm_compute + Target("GLES_COMPUTE", "GC", armcv_path) # GLES kernels are under arm_compute + ] + +# All functions are under arm_compute +runtime_targets = [ + Target("NEON", "NE", armcv_path), + Target("CL", "CL", armcv_path), + Target("CPP", "CPP", armcv_path), + Target("GLES_COMPUTE", "GC", armcv_path) + ] + +core_path = "/core/" +runtime_path = "/runtime/" include_str = "#include \"" - def read_file(file): with open(file, "r") as f: lines = f.readlines() @@ -43,9 +57,9 @@ def create_include_list(folder): return updated_files -def include_components(path, header_prefix, folder, subfolders=None): - for t in targets: - target_path = path + t.name + "/" +def include_components(target, path, header_prefix, folder, subfolders=None): + for t in target: + target_path = t.basepath + path + t.name + "/" components_file = target_path + t.prefix + header_prefix if os.path.exists(components_file): include_list = create_include_list(target_path + folder) @@ -60,7 +74,7 @@ def include_components(path, header_prefix, folder, subfolders=None): if __name__ == "__main__": # Include kernels - include_components(core_path, "Kernels.h", "kernels", ["arm32", "arm64"]) + include_components(core_targets, core_path, "Kernels.h", "kernels", ["arm32", "arm64"]) # Include functions - include_components(runtime_path, "Functions.h", "functions") + include_components(runtime_targets, runtime_path, "Functions.h", "functions") diff --git a/src/core/AccessWindowAutoPadding.cpp b/src/core/AccessWindowAutoPadding.cpp index 85c5b27d82..ca2f7d238f 100644 --- a/src/core/AccessWindowAutoPadding.cpp +++ b/src/core/AccessWindowAutoPadding.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/AccessWindowAutoPadding.h" +#include "src/core/AccessWindowAutoPadding.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Window.h" diff --git a/arm_compute/core/AccessWindowAutoPadding.h b/src/core/AccessWindowAutoPadding.h similarity index 98% rename from arm_compute/core/AccessWindowAutoPadding.h rename to src/core/AccessWindowAutoPadding.h index 12d65532cb..b8d1508679 100644 --- a/arm_compute/core/AccessWindowAutoPadding.h +++ b/src/core/AccessWindowAutoPadding.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/AccessWindowStatic.cpp b/src/core/AccessWindowStatic.cpp index 10e88b8632..0607011bc5 100644 --- a/src/core/AccessWindowStatic.cpp +++ b/src/core/AccessWindowStatic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/AccessWindowStatic.h" +#include "src/core/AccessWindowStatic.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorInfo.h" diff --git a/arm_compute/core/AccessWindowStatic.h b/src/core/AccessWindowStatic.h similarity index 99% rename from arm_compute/core/AccessWindowStatic.h rename to src/core/AccessWindowStatic.h index 1f2ca1b470..f7d43cbb55 100644 --- a/arm_compute/core/AccessWindowStatic.h +++ b/src/core/AccessWindowStatic.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/AccessWindowTranspose.cpp b/src/core/AccessWindowTranspose.cpp index 4c03ca16c7..d8bd4c4de1 100644 --- a/src/core/AccessWindowTranspose.cpp +++ b/src/core/AccessWindowTranspose.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/AccessWindowTranspose.h" +#include "src/core/AccessWindowTranspose.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorInfo.h" diff --git a/arm_compute/core/AccessWindowTranspose.h b/src/core/AccessWindowTranspose.h similarity index 98% rename from arm_compute/core/AccessWindowTranspose.h rename to src/core/AccessWindowTranspose.h index 85709092c3..0306076d6e 100644 --- a/arm_compute/core/AccessWindowTranspose.h +++ b/src/core/AccessWindowTranspose.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index 0b59ec8a71..ae8b879be3 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -108,7 +108,6 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map = { "convolution_separable1x9_static", "convolution9x9.cl" }, { "convolution_separable9x1_static", "convolution9x9.cl" }, { "copy_tensor", "copy_tensor.cl" }, - { "copy_pad_tensor", "copy_tensor.cl" }, { "copy_plane", "channel_extract.cl" }, { "copy_planes_3p", "channel_combine.cl" }, { "copy_to_keypoint", "fast_corners.cl" }, @@ -157,6 +156,8 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map = { "elementwise_operation_SQUARED_DIFF", "elementwise_operation.cl" }, { "elementwise_operation_POWER", "elementwise_operation.cl" }, { "elementwise_operation_PRELU", "elementwise_operation.cl" }, + { "elementwise_operation_AND", "elementwise_operation.cl" }, + { "elementwise_operation_OR", "elementwise_operation.cl" }, { "elementwise_operation_ADD_quantized", "elementwise_operation_quantized.cl" }, { "elementwise_operation_SUB_quantized", "elementwise_operation_quantized.cl" }, { "elementwise_operation_MAX_quantized", "elementwise_operation_quantized.cl" }, @@ -207,16 +208,16 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map = { "gemm_ma_f32", "gemm.cl" }, { "gemm_mv", "gemv.cl" }, { "gemm_mv_quantized", "gemv.cl" }, - { "gemm_mm_interleaved_transposed_f16", "gemm.cl" }, - { "gemm_mm_interleaved_transposed_f16_acc32", "gemm.cl" }, - { "gemm_mm_interleaved_transposed_f16_bifrost", "gemm.cl" }, - { "gemm_mm_interleaved_transposed_f32", "gemm.cl" }, - { "gemm_mm_interleaved_transposed_f32_bifrost", "gemm.cl" }, - { "gemm_mm_floating_point", "gemm.cl" }, - { "gemm_mm_floating_point_f16_bifrost", "gemm.cl" }, - { "gemm_mm_floating_point_f16_bifrost_acc32", "gemm.cl" }, - { "gemm_mm_floating_point_f32_bifrost", "gemm.cl" }, - { "gemm_mm_floating_point_f32_bifrost_1000", "gemm.cl" }, + { "gemm_mm_interleaved_transposed_f16", "gemm_v1.cl" }, + { "gemm_mm_interleaved_transposed_f16_acc32", "gemm_v1.cl" }, + { "gemm_mm_interleaved_transposed_f16_bifrost", "gemm_v1.cl" }, + { "gemm_mm_interleaved_transposed_f32", "gemm_v1.cl" }, + { "gemm_mm_interleaved_transposed_f32_bifrost", "gemm_v1.cl" }, + { "gemm_mm_floating_point", "gemm_v1.cl" }, + { "gemm_mm_floating_point_f16_bifrost", "gemm_v1.cl" }, + { "gemm_mm_floating_point_f16_bifrost_acc32", "gemm_v1.cl" }, + { "gemm_mm_floating_point_f32_bifrost", "gemm_v1.cl" }, + { "gemm_mm_floating_point_f32_bifrost_1000", "gemm_v1.cl" }, { "gemm_mm_native", "gemm.cl" }, { "gemm_mm_reshaped_lhs_nt_rhs_t", "gemm.cl" }, { "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "gemm.cl" }, @@ -326,8 +327,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map = { "pooling_layer_7", "pooling_layer.cl" }, { "pooling_layer_MxN_nchw", "pooling_layer.cl" }, { "pooling_layer_MxN_nhwc", "pooling_layer.cl" }, - { "pooling_layer_2_nhwc_indices_fp32", "pooling_layer.cl" }, - { "pooling_layer_2_nhwc_indices_fp16", "pooling_layer.cl" }, + { "pooling_layer_2x2_nhwc", "pooling_layer.cl" }, { "pooling_layer_2_nchw_indices_fp32", "pooling_layer.cl" }, { "pooling_layer_2_nchw_indices_fp16", "pooling_layer.cl" }, { "pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl" }, @@ -690,6 +690,10 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map = { "gemm.cl", #include "./cl_kernels/gemm.clembed" + }, + { + "gemm_v1.cl", +#include "./cl_kernels/gemm_v1.clembed" }, { "gemmlowp.cl", diff --git
a/src/core/CL/CLKernels.h b/src/core/CL/CLKernels.h new file mode 100644 index 0000000000..282cc96dd8 --- /dev/null +++ b/src/core/CL/CLKernels.h @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2016-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CLKERNELS_H +#define ARM_COMPUTE_CLKERNELS_H + +/* Header regrouping all the CL kernels */ +#include "src/core/CL/kernels/CLAbsoluteDifferenceKernel.h" +#include "src/core/CL/kernels/CLAccumulateKernel.h" +#include "src/core/CL/kernels/CLActivationLayerKernel.h" +#include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h" +#include "src/core/CL/kernels/CLBatchConcatenateLayerKernel.h" +#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h" +#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h" +#include "src/core/CL/kernels/CLBitwiseAndKernel.h" +#include "src/core/CL/kernels/CLBitwiseNotKernel.h" +#include "src/core/CL/kernels/CLBitwiseOrKernel.h" +#include "src/core/CL/kernels/CLBitwiseXorKernel.h" +#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" +#include "src/core/CL/kernels/CLBox3x3Kernel.h" +#include "src/core/CL/kernels/CLCannyEdgeKernel.h" +#include "src/core/CL/kernels/CLChannelCombineKernel.h" +#include "src/core/CL/kernels/CLChannelExtractKernel.h" +#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h" +#include "src/core/CL/kernels/CLCol2ImKernel.h" +#include "src/core/CL/kernels/CLColorConvertKernel.h" +#include "src/core/CL/kernels/CLComparisonKernel.h" +#include "src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h" +#include "src/core/CL/kernels/CLConvolutionKernel.h" +#include "src/core/CL/kernels/CLCopyKernel.h" +#include "src/core/CL/kernels/CLCropKernel.h" +#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" +#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h" +#include "src/core/CL/kernels/CLDepthConcatenateLayerKernel.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h" +#include "src/core/CL/kernels/CLDequantizationLayerKernel.h" +#include "src/core/CL/kernels/CLDerivativeKernel.h" +#include 
"src/core/CL/kernels/CLDilateKernel.h" +#include "src/core/CL/kernels/CLDirectConvolutionLayerKernel.h" +#include "src/core/CL/kernels/CLElementWiseUnaryLayerKernel.h" +#include "src/core/CL/kernels/CLElementwiseOperationKernel.h" +#include "src/core/CL/kernels/CLErodeKernel.h" +#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" +#include "src/core/CL/kernels/CLFFTRadixStageKernel.h" +#include "src/core/CL/kernels/CLFFTScaleKernel.h" +#include "src/core/CL/kernels/CLFastCornersKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLFlattenLayerKernel.h" +#include "src/core/CL/kernels/CLFloorKernel.h" +#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "src/core/CL/kernels/CLGatherKernel.h" +#include "src/core/CL/kernels/CLGaussian3x3Kernel.h" +#include "src/core/CL/kernels/CLGaussian5x5Kernel.h" +#include "src/core/CL/kernels/CLGaussianPyramidKernel.h" +#include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h" +#include "src/core/CL/kernels/CLHOGDescriptorKernel.h" +#include "src/core/CL/kernels/CLHOGDetectorKernel.h" +#include "src/core/CL/kernels/CLHarrisCornersKernel.h" +#include "src/core/CL/kernels/CLHeightConcatenateLayerKernel.h" +#include "src/core/CL/kernels/CLHistogramKernel.h" +#include "src/core/CL/kernels/CLIm2ColKernel.h" +#include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h" +#include "src/core/CL/kernels/CLIntegralImageKernel.h" +#include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h" +#include "src/core/CL/kernels/CLLKTrackerKernel.h" +#include "src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h" +#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h" +#include "src/core/CL/kernels/CLMeanStdDevKernel.h" +#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h" +#include "src/core/CL/kernels/CLMedian3x3Kernel.h" +#include "src/core/CL/kernels/CLMemsetKernel.h" +#include "src/core/CL/kernels/CLMinMaxLayerKernel.h" +#include "src/core/CL/kernels/CLMinMaxLocationKernel.h" +#include "src/core/CL/kernels/CLNonLinearFilterKernel.h" +#include "src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h" +#include "src/core/CL/kernels/CLNormalizationLayerKernel.h" +#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" +#include "src/core/CL/kernels/CLPadLayerKernel.h" +#include 
"src/core/CL/kernels/CLPermuteKernel.h" +#include "src/core/CL/kernels/CLPixelWiseMultiplicationKernel.h" +#include "src/core/CL/kernels/CLPoolingLayerKernel.h" +#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h" +#include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" +#include "src/core/CL/kernels/CLQuantizationLayerKernel.h" +#include "src/core/CL/kernels/CLROIAlignLayerKernel.h" +#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" +#include "src/core/CL/kernels/CLRangeKernel.h" +#include "src/core/CL/kernels/CLReductionOperationKernel.h" +#include "src/core/CL/kernels/CLRemapKernel.h" +#include "src/core/CL/kernels/CLReorgLayerKernel.h" +#include "src/core/CL/kernels/CLReshapeLayerKernel.h" +#include "src/core/CL/kernels/CLReverseKernel.h" +#include "src/core/CL/kernels/CLScaleKernel.h" +#include "src/core/CL/kernels/CLScharr3x3Kernel.h" +#include "src/core/CL/kernels/CLSelectKernel.h" +#include "src/core/CL/kernels/CLSobel3x3Kernel.h" +#include "src/core/CL/kernels/CLSobel5x5Kernel.h" +#include "src/core/CL/kernels/CLSobel7x7Kernel.h" +#include "src/core/CL/kernels/CLSoftmaxLayerKernel.h" +#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h" +#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h" +#include "src/core/CL/kernels/CLStackLayerKernel.h" +#include "src/core/CL/kernels/CLStridedSliceKernel.h" +#include "src/core/CL/kernels/CLTableLookupKernel.h" +#include "src/core/CL/kernels/CLThresholdKernel.h" +#include "src/core/CL/kernels/CLTileKernel.h" +#include "src/core/CL/kernels/CLTransposeKernel.h" +#include "src/core/CL/kernels/CLUpsampleLayerKernel.h" +#include "src/core/CL/kernels/CLWarpAffineKernel.h" +#include "src/core/CL/kernels/CLWarpPerspectiveKernel.h" +#include "src/core/CL/kernels/CLWeightsReshapeKernel.h" +#include "src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h" +#include "src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h" +#include "src/core/CL/kernels/CLWidthConcatenateLayerKernel.h" +#include "src/core/CL/kernels/CLWinogradFilterTransformKernel.h" +#include "src/core/CL/kernels/CLWinogradInputTransformKernel.h" +#include "src/core/CL/kernels/CLWinogradOutputTransformKernel.h" +#include "src/core/CL/kernels/CLYOLOLayerKernel.h" +#include "src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h" + +#endif /* ARM_COMPUTE_CLKERNELS_H */ diff --git a/src/core/CL/CLTracePoint.cpp b/src/core/CL/CLTracePoint.cpp index 631cb84878..d603f40c26 100644 --- a/src/core/CL/CLTracePoint.cpp +++ b/src/core/CL/CLTracePoint.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/TracePoint.h" +#include "arm_compute/core/CL/CLTypes.h" #include "arm_compute/core/CL/ICLArray.h" #include "arm_compute/core/CL/ICLDistribution1D.h" #include "arm_compute/core/CL/ICLHOG.h" @@ -30,7 +31,6 @@ #include "arm_compute/core/CL/ICLMultiHOG.h" #include "arm_compute/core/CL/ICLMultiImage.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h" #include "utils/TypePrinter.h" #include diff --git a/src/core/CL/CLUtils.cpp b/src/core/CL/CLUtils.cpp index 5d0cdf7f46..67af240044 100644 --- a/src/core/CL/CLUtils.cpp +++ b/src/core/CL/CLUtils.cpp @@ -26,12 +26,26 @@ #include "src/core/CL/CLUtils.h" -cl::Image2D arm_compute::create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, cl_channel_type data_type, size_t image_row_pitch) +cl::Image2D arm_compute::create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType 
data_type, size_t image_row_pitch) { + cl_channel_type cl_data_type; + + switch(data_type) + { + case DataType::F32: + cl_data_type = CL_FLOAT; + break; + case DataType::F16: + cl_data_type = CL_HALF_FLOAT; + break; + default: + ARM_COMPUTE_ERROR("Data type not supported with OpenCL image2d"); + } + cl_mem cl_image; cl_int err = CL_SUCCESS; - const cl_image_format format = { CL_RGBA, data_type }; + const cl_image_format format = { CL_RGBA, cl_data_type }; cl_image_desc desc; memset(&desc, 0, sizeof(desc)); diff --git a/src/core/CL/CLUtils.h b/src/core/CL/CLUtils.h index 8f1c58bcba..b65d547756 100644 --- a/src/core/CL/CLUtils.h +++ b/src/core/CL/CLUtils.h @@ -44,12 +44,12 @@ class TensorShape; * @param[in] ctx cl::Context object * @param[in] buffer cl::Buffer object from which the OpenCL image2d object is created * @param[in] shape2d 2D tensor shape - * @param[in] data_type cl_channel_type to use. Only supported CL_FLOAT + * @param[in] data_type DataType to use. Only supported: F32, F16 * @param[in] image_row_pitch Image row pitch (a.k.a. stride Y) to be used in the image2d object * * @return cl::Image2D object */ -cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, cl_channel_type data_type, size_t image_row_pitch); +cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch); } // arm_compute diff --git a/arm_compute/core/CL/CLValidate.h b/src/core/CL/CLValidate.h similarity index 96% rename from arm_compute/core/CL/CLValidate.h rename to src/core/CL/CLValidate.h index 3f8b76ba4c..7b5294e452 100644 --- a/arm_compute/core/CL/CLValidate.h +++ b/src/core/CL/CLValidate.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,6 +24,7 @@ #ifndef ARM_COMPUTE_CL_VALIDATE_H #define ARM_COMPUTE_CL_VALIDATE_H +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/Validate.h" namespace arm_compute diff --git a/arm_compute/core/CL/ICLGEMMKernelConfiguration.h b/src/core/CL/ICLGEMMKernelConfiguration.h similarity index 98% rename from arm_compute/core/CL/ICLGEMMKernelConfiguration.h rename to src/core/CL/ICLGEMMKernelConfiguration.h index 90600efba5..ac0e7ab7ff 100644 --- a/arm_compute/core/CL/ICLGEMMKernelConfiguration.h +++ b/src/core/CL/ICLGEMMKernelConfiguration.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp index be633746a2..2b259bf28a 100644 --- a/src/core/CL/ICLKernel.cpp +++ b/src/core/CL/ICLKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,16 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" -#include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/helpers/Utils.h" #include diff --git a/arm_compute/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h similarity index 99% rename from arm_compute/core/CL/ICLKernel.h rename to src/core/CL/ICLKernel.h index d4990a1dee..a24cd8c798 100644 --- a/arm_compute/core/CL/ICLKernel.h +++ b/src/core/CL/ICLKernel.h @@ -29,6 +29,7 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/IKernel.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/core/experimental/Types.h" #include diff --git a/src/core/CL/ICLSimple2DKernel.cpp b/src/core/CL/ICLSimple2DKernel.cpp index ce95495fff..5d8295bdfe 100644 --- a/src/core/CL/ICLSimple2DKernel.cpp +++ b/src/core/CL/ICLSimple2DKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,12 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/ICLSimple2DKernel.h" +#include "src/core/CL/ICLSimple2DKernel.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" using namespace arm_compute; diff --git a/arm_compute/core/CL/ICLSimple2DKernel.h b/src/core/CL/ICLSimple2DKernel.h similarity index 94% rename from arm_compute/core/CL/ICLSimple2DKernel.h rename to src/core/CL/ICLSimple2DKernel.h index 86561cd562..5246492401 100644 --- a/arm_compute/core/CL/ICLSimple2DKernel.h +++ b/src/core/CL/ICLSimple2DKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_ICLSIMPLE2DKERNEL_H #define ARM_COMPUTE_ICLSIMPLE2DKERNEL_H -#include "arm_compute/core/CL/ICLSimpleKernel.h" +#include "src/core/CL/ICLSimpleKernel.h" namespace arm_compute { diff --git a/src/core/CL/ICLSimple3DKernel.cpp b/src/core/CL/ICLSimple3DKernel.cpp index 3d08262b5f..fef1a86125 100644 --- a/src/core/CL/ICLSimple3DKernel.cpp +++ b/src/core/CL/ICLSimple3DKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/ICLSimple3DKernel.h" +#include "src/core/CL/ICLSimple3DKernel.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" diff --git a/arm_compute/core/CL/ICLSimple3DKernel.h b/src/core/CL/ICLSimple3DKernel.h similarity index 94% rename from arm_compute/core/CL/ICLSimple3DKernel.h rename to src/core/CL/ICLSimple3DKernel.h index 3b4eaf7350..ff0b274663 100644 --- a/arm_compute/core/CL/ICLSimple3DKernel.h +++ b/src/core/CL/ICLSimple3DKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_ICLSIMPLE3DKERNEL_H #define ARM_COMPUTE_ICLSIMPLE3DKERNEL_H -#include "arm_compute/core/CL/ICLSimple2DKernel.h" +#include "src/core/CL/ICLSimple2DKernel.h" namespace arm_compute { diff --git a/src/core/CL/ICLSimpleKernel.cpp b/src/core/CL/ICLSimpleKernel.cpp index d2f09a3478..d67fefdf71 100644 --- a/src/core/CL/ICLSimpleKernel.cpp +++ b/src/core/CL/ICLSimpleKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,12 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/ICLSimpleKernel.h" - +#include "src/core/CL/ICLSimpleKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" using namespace arm_compute; diff --git a/arm_compute/core/CL/ICLSimpleKernel.h b/src/core/CL/ICLSimpleKernel.h similarity index 97% rename from arm_compute/core/CL/ICLSimpleKernel.h rename to src/core/CL/ICLSimpleKernel.h index 805342f830..b35547a217 100644 --- a/arm_compute/core/CL/ICLSimpleKernel.h +++ b/src/core/CL/ICLSimpleKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,9 +24,9 @@ #ifndef ARM_COMPUTE_ICLSIMPLEKERNEL_H #define ARM_COMPUTE_ICLSIMPLEKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/cl_kernels/activation_float_helpers.h b/src/core/CL/cl_kernels/activation_float_helpers.h index bedde8349e..91d7197889 100644 --- a/src/core/CL/cl_kernels/activation_float_helpers.h +++ b/src/core/CL/cl_kernels/activation_float_helpers.h @@ -31,47 +31,47 @@ #endif // GPU_ARCH == GPU_ARCH_BIFROST // Hard-Swish -#define hard_swish_op(DATA_TYPE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667)) +#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667)) // Logistic Activation -#define logistic_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x))) +#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x))) // Hyperbolic Tangent Activation -#define tanh_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x)) +#define tanh_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x)) // RELU Activation -#define relu_op(DATA_TYPE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x)) +#define relu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x)) // Bounded RELU Activation -#define brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x))) +#define brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x))) // Lower Upper Bounded RELU Activation -#define lu_brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL)) +#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL)) // Leaky RELU Activation -#define lrelu_op(DATA_TYPE, x, A_VAL,
B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0)) +#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0)) // Soft RELU Activation -#define srelu_op(DATA_TYPE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x))) +#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x))) // ELU Activation -#define elu_op(DATA_TYPE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, isgreaterequal(x, (DATA_TYPE)0.0))) +#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0))) // Absolute Activation -#define abs_op(DATA_TYPE, x, A_VAL, B_VAL) (fabs(x)) +#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x)) // Square Activation -#define square_op(DATA_TYPE, x, A_VAL, B_VAL) (x * x) +#define square_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * x) // Square-root Activation -#define sqrt_op(DATA_TYPE, x, A_VAL, B_VAL) (sqrt(x)) +#define sqrt_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (sqrt(x)) // Linear Activation -#define linear_op(DATA_TYPE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x)) +#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x)) // Identity Activation -#define identity_op(DATA_TYPE, x, A_VAL, B_VAL) (x) +#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x) -#define ACT_OP(op, DATA_TYPE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, x, A_VAL, B_VAL) +#define ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) -#define ACTIVATION(op, DATA_TYPE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, x, A_VAL, B_VAL) +#define ACTIVATION(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/activation_layer.cl index f846cb2764..bc2c99b6c8 100644 --- a/src/core/CL/cl_kernels/activation_layer.cl +++ b/src/core/CL/cl_kernels/activation_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -33,6 +33,7 @@ * * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively. 
* @@ -61,23 +62,24 @@ __kernel void activation_layer( #endif /* not IN_PLACE */ ) { + uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0); + // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z; #ifdef IN_PLACE - Tensor3D output = input; + __global uchar *output_addr = input_addr; #else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z; #endif /* IN_PLACE */ // Load data - TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr); + TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr); // Perform activation - data = ACTIVATION(ACT, DATA_TYPE, data, A_VAL, B_VAL); + data0 = ACTIVATION(ACT, DATA_TYPE, VEC_SIZE, data0, A_VAL, B_VAL); // Store result - VSTORE(VEC_SIZE) - (data, 0, (__global DATA_TYPE *)output.ptr); + STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } #endif /* defined(ACT) */ diff --git a/src/core/CL/cl_kernels/activation_layer_quant.cl b/src/core/CL/cl_kernels/activation_layer_quant.cl index 0481319428..66261019ab 100644 --- a/src/core/CL/cl_kernels/activation_layer_quant.cl +++ b/src/core/CL/cl_kernels/activation_layer_quant.cl @@ -36,6 +36,7 @@ * * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively. * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively. * @note Quantization offsets of the input/output tensors are passed in only if asymmetric with -DO1_VAL= and -DO2_VAL= respectively. 
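To make the new boundary-aware indexing concrete: each work-item normally handles VEC_SIZE elements, and when the tensor width is not a multiple of VEC_SIZE the first work-item stores only VEC_SIZE_LEFTOVER elements while every later access is shifted back so the final full-vector access stays in bounds. A minimal host-side C sketch of the same arithmetic (the width and vector size below are hypothetical, and offsets are in elements rather than bytes):

#include <stdio.h>

/* Mirrors the kernel expression (element units rather than bytes):
 *   x_offs = max((int)(gid * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) */
static int x_offs(int gid, int vec_size, int leftover)
{
    const int shift = (vec_size - leftover) % vec_size;
    const int offs  = gid * vec_size - shift;
    return offs > 0 ? offs : 0;
}

int main(void)
{
    const int width = 10, vec_size = 4;        /* hypothetical sizes            */
    const int leftover = width % vec_size;     /* VEC_SIZE_LEFTOVER == 2        */
    const int n_items  = (width + vec_size - 1) / vec_size;

    for(int gid = 0; gid < n_items; ++gid)
    {
        /* STORE_VECTOR_SELECT stores only `leftover` lanes for work-item 0
         * (the VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0 condition)
         * and a full vector for every other work-item. */
        const int stored = (leftover != 0 && gid == 0) ? leftover : vec_size;
        const int offs   = x_offs(gid, vec_size, leftover);
        printf("gid=%d stores elements [%d, %d)\n", gid, offs, offs + stored);
    }
    return 0;
}

For width 10 and vector size 4 this prints [0, 2), [2, 6), [6, 10): contiguous, non-overlapping, and never out of bounds, which is why the rewritten kernels no longer rely on padded borders.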
@@ -66,34 +67,35 @@ __kernel void activation_layer_quant_f32( #endif /* not IN_PLACE */ ) { + uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0); + // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z; #ifdef IN_PLACE - Tensor3D output = input; + __global uchar *output_addr = input_addr; #else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z; #endif /* IN_PLACE */ // Load data - TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr); + TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr); - VEC_FLOAT data_flt = CONVERT(data, VEC_FLOAT); + VEC_FLOAT data_flt = CONVERT(data0, VEC_FLOAT); #if defined(O1_VAL) data_flt = round(data_flt - (float)O1_VAL) * ((float)S1_VAL); #else // defined(O1_VAL) - data_flt = round(data_flt) * ((float)S1_VAL); + data_flt = round(data_flt) * ((float)S1_VAL); #endif // defined(O1_VAL) - data_flt = ACTIVATION(ACT, float, data_flt, A_VAL, B_VAL); + data_flt = ACTIVATION(ACT, float, VEC_SIZE, data_flt, A_VAL, B_VAL); #if defined(O2_VAL) - data = CONVERT_SAT(round(data_flt / ((float)S2_VAL)) + (float)O2_VAL, TYPE); + data0 = CONVERT_SAT(round(data_flt / ((float)S2_VAL)) + (float)O2_VAL, TYPE); #else // defined(O2_VAL) - data = CONVERT_SAT(round(data_flt / ((float)S2_VAL)), TYPE); + data0 = CONVERT_SAT(round(data_flt / ((float)S2_VAL)), TYPE); #endif // defined(O2_VAL) // Store result - VSTORE(VEC_SIZE) - (data, 0, (__global DATA_TYPE *)output.ptr); + STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } #else // defined(FLOAT_DOMAIN) @@ -106,6 +108,7 @@ __kernel void activation_layer_quant_f32( * * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively. * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively. 
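A scalar model may help here: in the float domain the kernel dequantizes each lane with the input scale and offset, applies the activation in float, then requantizes with saturation, exactly as in the formulas above. The C sketch below follows that sequence for a single QASYMM8 lane with a RELU activation; the scales and offsets are hypothetical stand-ins for -DS1_VAL/-DO1_VAL/-DS2_VAL/-DO2_VAL:

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* One QASYMM8 lane of activation_layer_quant_f32 with ACT = RELU.
 * s1/o1: input scale/offset, s2/o2: output scale/offset (hypothetical values). */
static uint8_t relu_quant_lane(uint8_t q_in, float s1, int o1, float s2, int o2)
{
    float x = roundf((float)q_in - (float)o1) * s1; /* dequantize                  */
    float y = fmaxf(0.0f, x);                       /* relu_op in the float domain */
    float q = roundf(y / s2) + (float)o2;           /* requantize                  */
    q = fminf(fmaxf(q, 0.0f), 255.0f);              /* CONVERT_SAT to uchar        */
    return (uint8_t)q;
}

int main(void)
{
    /* 120 with scale 0.1 and offset 128 represents -0.8f; RELU clamps it to
     * 0.0f, which maps back to the output offset. */
    printf("%d\n", relu_quant_lane(120, 0.1f, 128, 0.1f, 128)); /* prints 128 */
    return 0;
}

The integer-domain variant in the next hunk skips this float round trip and acts directly on the quantized values through PERFORM_ACTIVATION_QUANT.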
@@ -137,22 +140,23 @@ __kernel void activation_layer_quant( #endif /* not IN_PLACE */ ) { + uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0); + // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z; #ifdef IN_PLACE - Tensor3D output = input; + __global uchar *output_addr = input_addr; #else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z; #endif /* IN_PLACE */ // Load data - TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr); + TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr); - data = PERFORM_ACTIVATION_QUANT(ACT, data); + data0 = PERFORM_ACTIVATION_QUANT(ACT, data0); // Store result - VSTORE(VEC_SIZE) - (data, 0, (__global DATA_TYPE *)output.ptr); + STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } #endif // defined(ACT) #endif // defined(FLOAT_DOMAIN) diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl index ad27aa386c..89cbe4440e 100644 --- a/src/core/CL/cl_kernels/batchnormalization_layer.cl +++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl @@ -129,7 +129,7 @@ __kernel void batchnormalization_layer_nchw(TENSOR3D_DECLARATION(input), res = MUL_OP(gamma_vec, x_bar); #else /* USE_DEFAULT_GAMMA */ // gamma is equal to 1, no need to perform multiplications - res = x_bar; + res = x_bar; #endif /* USE_DEFAULT_GAMMA */ #ifndef USE_DEFAULT_BETA @@ -139,7 +139,7 @@ __kernel void batchnormalization_layer_nchw(TENSOR3D_DECLARATION(input), res = ADD_OP(res, beta_vec); #endif /* USE_DEFAULT_BETA */ - res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, res, A_VAL, B_VAL); + res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res, A_VAL, B_VAL); VSTORE(VEC_SIZE) (res, 0, (__global DATA_TYPE *)out.ptr); @@ -198,19 +198,21 @@ __kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input), #endif /* USE_DEFAULT_GAMMA */ float epsilon) { - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); + uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0); + + __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z; #ifdef IN_PLACE - Tensor3D out = in; + __global uchar *output_addr = input_addr; #else /* IN_PLACE */ - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); + __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z; #endif /* IN_PLACE */ - Vector mean = CONVERT_TO_VECTOR_STRUCT(mean); - Vector var = CONVERT_TO_VECTOR_STRUCT(var); + __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs; + __global uchar *var_addr = var_ptr + var_offset_first_element_in_bytes + x_offs; #ifndef USE_DEFAULT_BETA - Vector beta = CONVERT_TO_VECTOR_STRUCT(beta); + __global uchar *beta_addr = beta_ptr + beta_offset_first_element_in_bytes + x_offs; #endif /* USE_DEFAULT_BETA */
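For reference, the per-lane arithmetic this NHWC kernel performs, visible in the loads and MUL_OP/ADD_OP lines that follow, is the standard batch-normalization transform. A scalar C sketch with hypothetical inputs; gamma and beta fall back to 1 and 0 when USE_DEFAULT_GAMMA / USE_DEFAULT_BETA are defined:

#include <math.h>
#include <stdio.h>

/* One lane of batchnormalization_layer_nhwc, before the optional activation:
 *   x_bar = (x - mean) * 1 / sqrt(var + epsilon)
 *   res   = gamma * x_bar + beta */
static float batchnorm_lane(float x, float mean, float var, float epsilon, float gamma, float beta)
{
    const float denominator = 1.0f / sqrtf(var + epsilon); /* INVSQRT_OP(ADD_OP(var, epsilon)) */
    const float x_bar       = (x - mean) * denominator;
    return gamma * x_bar + beta;
}

int main(void)
{
    /* Hypothetical values: x = 3 normalized against mean 1, variance 4. */
    printf("%f\n", batchnorm_lane(3.0f, 1.0f, 4.0f, 1e-5f, 1.0f, 0.0f)); /* ~1.0 */
    return 0;
}

Because NHWC lays channels along x, the mean/var/beta/gamma vectors are indexed with the same x_offs as the input, which is why the current_slice bookkeeping disappears in the lines below.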
#ifndef USE_DEFAULT_GAMMA - Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma); + __global uchar *gamma_addr = gamma_ptr + gamma_offset_first_element_in_bytes + x_offs; #endif /* USE_DEFAULT_GAMMA */ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) @@ -222,40 +224,37 @@ __kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) x_bar = 0; VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - res = 0; - - const int current_slice = get_global_id(0); + res0 = 0; - data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr); - denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(var.ptr + current_slice * VEC_SIZE * var.stride_x)); + data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr); + denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)var_addr); denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon)))); // Calculate x bar and store results - numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(mean.ptr + current_slice * VEC_SIZE * mean.stride_x)); + numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr); numerator = SUB_OP(data, numerator); x_bar = MUL_OP(numerator, denominator); #ifndef USE_DEFAULT_GAMMA VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(gamma.ptr + current_slice * VEC_SIZE * gamma.stride_x)); + gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)gamma_addr); - res = MUL_OP(gamma_vec, x_bar); + res0 = MUL_OP(gamma_vec, x_bar); #else /* USE_DEFAULT_GAMMA */ // gamma is equal to 1, no need to perform multiplications - res = x_bar; + res0 = x_bar; #endif /* USE_DEFAULT_GAMMA */ #ifndef USE_DEFAULT_BETA VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(beta.ptr + current_slice * VEC_SIZE * beta.stride_x)); + beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)beta_addr); // beta is not zero, hence we need to perform the addition - res = ADD_OP(res, beta_vec); + res0 = ADD_OP(res0, beta_vec); #endif /* USE_DEFAULT_BETA */ - res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, res, A_VAL, B_VAL); + res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res0, A_VAL, B_VAL); - VSTORE(VEC_SIZE) - (res, 0, (__global DATA_TYPE *)out.ptr); + STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } #endif /* defined(VEC_SIZE) && defined(DATA_TYPE) */ diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl index 4281e675d7..d2e65408dc 100644 --- a/src/core/CL/cl_kernels/concatenate.cl +++ b/src/core/CL/cl_kernels/concatenate.cl @@ -23,9 +23,11 @@ */ #include "helpers.h" +#if defined(VEC_SIZE) +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) + #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) #define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) -#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) #define VEC_QUANT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) #define CONVERT_RTE(x, type) (convert_##type##_rte((x))) #define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) @@ -38,38 +40,20 @@ inline VEC_QUANT requantize(VEC_QUANT input, float in_offset, float out_offset, } #endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ -#if defined(DATA_TYPE) && defined(VEC_SIZE) -#if defined(DEPTH) && defined(ELEMENT_SIZE) +#if defined(DATA_TYPE) +#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) +#if defined(DEPTH) && defined(ELEMENT_SIZE) #if
defined(INPUT1_WIDTH) -#if ELEMENT_SIZE == 1 -#define COND_DATA_TYPE char -#elif ELEMENT_SIZE == 2 -#define COND_DATA_TYPE short -#elif ELEMENT_SIZE == 4 -#define COND_DATA_TYPE int -#else // ELEMENT_SIZE -#error "Element size not supported" -#endif // ELEMENT_SIZE - -#if VEC_SIZE == 2 -#define SEQ ((int2)(0, 1)) -#elif VEC_SIZE == 4 -#define SEQ ((int4)(0, 1, 2, 3)) -#elif VEC_SIZE == 8 -#define SEQ ((int8)(0, 1, 2, 3, 4, 5, 6, 7)) -#elif VEC_SIZE == 16 -#define SEQ ((int16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)) -#else // VEC_SIZE -#error "Vector size not supported" -#endif // VEC_SIZE +#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) +#define SEQ VEC_OFFS(int, VEC_SIZE) /** This kernel concatenates two input tensors into the output tensor along the first dimension * * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16 - * @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16 * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8 * @@ -103,45 +87,43 @@ inline VEC_QUANT requantize(VEC_QUANT input, float in_offset, float out_offset, * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes) * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] src1_pad_right Right paddings of the first input tensor in unit of elements - * @param[in] src1_pad_left Left paddings of the second input tensor in unit of elements */ __kernel void concatenate_width_x2( TENSOR4D_DECLARATION(src1), TENSOR4D_DECLARATION(src2), - TENSOR4D_DECLARATION(dst), - uint src1_pad_right, - uint src2_pad_left) + TENSOR4D_DECLARATION(dst)) { - Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH); - // Calculate input indices - const int x = get_global_id(0) * (int)VEC_SIZE; + const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); const int y = get_global_id(1); const int z = get_global_id(2) % (int)DEPTH; const int w = get_global_id(2) / (int)DEPTH; - const int x1 = min(x, (int)INPUT1_WIDTH + (int)src1_pad_right - (int)VEC_SIZE); - const int x2 = max(x - (int)INPUT1_WIDTH, -(int)src2_pad_left); + const int x1 = min(x, (int)INPUT1_WIDTH - (int)VEC_SIZE); + const int x2 = max(x - (int)INPUT1_WIDTH, 0); // Calculate inputs and output addresses - const __global uchar *in1_ptr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * (int)src1_stride_x + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w; - const __global uchar *in2_ptr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * (int)src2_stride_x + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w; + const __global uchar *dst_addr = dst_ptr + (int)dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * (int)dst_stride_y + z * (int)dst_stride_z + w * (int)dst_stride_w; + const 
__global uchar *src1_addr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * sizeof(DATA_TYPE) + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w; + const __global uchar *src2_addr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * sizeof(DATA_TYPE) + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w; - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr); - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr); + VEC_TYPE src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src1_addr); + VEC_TYPE src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src2_addr); #if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT); #endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) */ - const VEC_DATA_TYPE(int, VEC_SIZE) x_coords = SEQ + (VEC_DATA_TYPE(int, VEC_SIZE))(x); - const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE)); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) values = select(src2_values, src1_values, cond); + const VEC_INT x_coords = SEQ + (VEC_INT)(x); + + // Rotate src1/2_values, if values0 is a combination of src1_values and src2_values. + SELECT_TYPE cond = CONVERT(((VEC_INT)x < (VEC_INT)INPUT1_WIDTH) && ((VEC_INT)x > (VEC_INT)(INPUT1_WIDTH - VEC_SIZE)), SELECT_TYPE); + src1_values = select(src1_values, ROTATE(src1_values, VEC_SIZE, INPUT1_ROTATE_N), cond); + src2_values = select(src2_values, ROTATE(src2_values, VEC_SIZE, INPUT1_ROTATE_N), cond); - VSTORE(VEC_SIZE) - (values, 0, (__global DATA_TYPE *)dst.ptr); + cond = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH), SELECT_TYPE); + const VEC_TYPE values0 = select(src2_values, src1_values, cond); + + STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } #if defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH) @@ -149,7 +131,7 @@ __kernel void concatenate_width_x2( * * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16 - * @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16 * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8 * @note Second input tensor width should be given as a preprocessor argument using -DINPUT2_WIDTH=width. e.g. 
-DINPUT2_WIDTH=8 @@ -205,53 +187,36 @@ __kernel void concatenate_width_x2( * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes) * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] src1_pad_right Right paddings of the first input tensor in unit of elements - * @param[in] src2_pad_left Left paddings of the second input tensor in unit of elements - * @param[in] src2_pad_right Right paddings of the second input tensor in unit of elements - * @param[in] src3_pad_left Left paddings of the third input tensor in unit of elements - * @param[in] src3_pad_right Right paddings of the third input tensor in unit of elements - * @param[in] src4_pad_left Left paddings of the fourth input tensor in unit of elements */ __kernel void concatenate_width_x4( TENSOR4D_DECLARATION(src1), TENSOR4D_DECLARATION(src2), TENSOR4D_DECLARATION(src3), TENSOR4D_DECLARATION(src4), - TENSOR4D_DECLARATION(dst), - uint src1_pad_right, - uint src2_pad_left, - uint src2_pad_right, - uint src3_pad_left, - uint src3_pad_right, - uint src4_pad_left) + TENSOR4D_DECLARATION(dst)) { - Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH); - // Calculate input indices - const int x = get_global_id(0) * (int)VEC_SIZE; + const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); const int y = get_global_id(1); const int z = get_global_id(2) % (int)DEPTH; const int w = get_global_id(2) / (int)DEPTH; - const int x1 = min(x, (int)INPUT1_WIDTH + (int)src1_pad_right - (int)VEC_SIZE); - const int x2 = min(max(x - (int)INPUT1_WIDTH, -(int)src2_pad_left), (int)INPUT2_WIDTH + (int)src2_pad_right - (int)VEC_SIZE); - const int x3 = min(max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH, -(int)src3_pad_left), (int)INPUT3_WIDTH + (int)src3_pad_right - (int)VEC_SIZE); - const int x4 = max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH - (int)INPUT3_WIDTH, -(int)src4_pad_left); + const int x1 = min(x, (int)INPUT1_WIDTH - (int)VEC_SIZE); + const int x2 = min(max(x - (int)INPUT1_WIDTH, 0), (int)INPUT2_WIDTH - (int)VEC_SIZE); + const int x3 = min(max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH, 0), (int)INPUT3_WIDTH - (int)VEC_SIZE); + const int x4 = max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH - (int)INPUT3_WIDTH, 0); // Calculate inputs and output addresses - const __global uchar *in1_ptr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * (int)src1_stride_x + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w; - const __global uchar *in2_ptr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * (int)src2_stride_x + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w; - const __global uchar *in3_ptr = src3_ptr + (int)src3_offset_first_element_in_bytes + x3 * (int)src3_stride_x + y * (int)src3_stride_y + z * (int)src3_stride_z + w * (int)src3_stride_w; - const __global uchar *in4_ptr = src4_ptr + (int)src4_offset_first_element_in_bytes + x4 * (int)src4_stride_x + y * (int)src4_stride_y + z * (int)src4_stride_z + w * (int)src4_stride_w; - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr); - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr); - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - src3_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in3_ptr); - 
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - src4_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in4_ptr); + const __global uchar *dst_addr = dst_ptr + (int)dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * (int)dst_stride_y + z * (int)dst_stride_z + w * (int)dst_stride_w; + const __global uchar *src1_addr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * sizeof(DATA_TYPE) + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w; + const __global uchar *src2_addr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * sizeof(DATA_TYPE) + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w; + const __global uchar *src3_addr = src3_ptr + (int)src3_offset_first_element_in_bytes + x3 * sizeof(DATA_TYPE) + y * (int)src3_stride_y + z * (int)src3_stride_z + w * (int)src3_stride_w; + const __global uchar *src4_addr = src4_ptr + (int)src4_offset_first_element_in_bytes + x4 * sizeof(DATA_TYPE) + y * (int)src4_stride_y + z * (int)src4_stride_z + w * (int)src4_stride_w; + + VEC_TYPE src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src1_addr); + VEC_TYPE src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src2_addr); + VEC_TYPE src3_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src3_addr); + VEC_TYPE src4_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src4_addr); #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4) src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); @@ -260,29 +225,42 @@ __kernel void concatenate_width_x4( src4_values = requantize(src4_values, OFFSET_IN4, OFFSET_OUT, SCALE_IN4, SCALE_OUT); #endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4) */ - const VEC_DATA_TYPE(int, VEC_SIZE) x_coords = SEQ + (VEC_DATA_TYPE(int, VEC_SIZE))(x); + const VEC_INT x_coords = SEQ + (VEC_INT)(x); + + SELECT_TYPE cond_in2 = CONVERT(((VEC_INT)x < (VEC_INT)INPUT1_WIDTH && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH - VEC_SIZE)), SELECT_TYPE); + SELECT_TYPE cond_in3 = CONVERT(((VEC_INT)x < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH) && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH - VEC_SIZE)), SELECT_TYPE); + SELECT_TYPE cond_in4 = CONVERT(((VEC_INT)x < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH) && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH - VEC_SIZE)), SELECT_TYPE); - const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond_in2 = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE)); - const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond_in3 = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH + INPUT2_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE)); - const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond_in4 = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE)); + // Rotate src1/2_values, if values0 is a combination of src1_values and src2_values. 
+ src1_values = select(src1_values, ROTATE(src1_values, VEC_SIZE, INPUT1_ROTATE_N), cond_in2); + src2_values = select(src2_values, ROTATE(src2_values, VEC_SIZE, INPUT1_ROTATE_N), cond_in2); + // Rotate src2/3_values, if values0 is a combination of src2_values and src3_values. + src2_values = select(src2_values, ROTATE(src2_values, VEC_SIZE, INPUT2_ROTATE_N), cond_in3); + src3_values = select(src3_values, ROTATE(src3_values, VEC_SIZE, INPUT2_ROTATE_N), cond_in3); + // Rotate src3/4_values, if values0 is a combination of src3_values and src4_values. + src3_values = select(src3_values, ROTATE(src3_values, VEC_SIZE, INPUT3_ROTATE_N), cond_in4); + src4_values = select(src4_values, ROTATE(src4_values, VEC_SIZE, INPUT3_ROTATE_N), cond_in4); - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - values = select(src2_values, src1_values, cond_in2); - values = select(src3_values, values, cond_in3); - values = select(src4_values, values, cond_in4); + cond_in2 = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH), SELECT_TYPE); + cond_in3 = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH), SELECT_TYPE); + cond_in4 = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH), SELECT_TYPE); - VSTORE(VEC_SIZE) - (values, 0, (__global DATA_TYPE *)dst.ptr); + VEC_TYPE values0 = select(src2_values, src1_values, cond_in2); + values0 = select(src3_values, values0, cond_in3); + values0 = select(src4_values, values0, cond_in4); + + STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } #endif /* defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH) */ #endif /* defined(INPUT1_WIDTH) */ #endif /* defined(DEPTH) && defined(ELEMENT_SIZE) */ -#if defined(WIDTH_OFFSET) && defined(DEPTH) +#if defined(WIDTH_OFFSET) && defined(DEPTH) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) /** This kernel concatenates the input tensor into the output tensor along the first dimension * * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128 * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. 
-DDEPTH=16 * @@ -312,23 +290,28 @@ __kernel void concatenate_width( TENSOR4D_DECLARATION(src), TENSOR4D_DECLARATION(dst)) { - Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, DEPTH); - Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH); + // Calculate input indices + const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); + const int y = get_global_id(1); + const int z = get_global_id(2) % (int)DEPTH; + const int w = get_global_id(2) / (int)DEPTH; + + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + z * src_stride_z + w * src_stride_w; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + w * dst_stride_w; - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr); + VEC_TYPE source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr); #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) - const VEC_QUANT out = requantize(source_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); - VSTORE(VEC_SIZE) - (out, 0, (__global DATA_TYPE *)(dst.ptr) + WIDTH_OFFSET); + const VEC_QUANT out0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); + STORE_VECTOR_SELECT(out, DATA_TYPE, dst_addr + WIDTH_OFFSET * sizeof(DATA_TYPE), VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) #else /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ - VSTORE(VEC_SIZE) - (source_values, 0, (__global DATA_TYPE *)(dst.ptr) + WIDTH_OFFSET); + STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + WIDTH_OFFSET * sizeof(DATA_TYPE), VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) #endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ } -#endif /* defined(WIDTH_OFFSET) && defined(DEPTH) */ +#endif /* defined(WIDTH_OFFSET) && defined(DEPTH) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)*/ + +#if defined(VEC_SIZE_LEFTOVER) #if defined(HEIGHT_OFFSET) && defined(DEPTH) && defined(VEC_SIZE) /** This kernel concatenates the input tensor into the output tensor along the second dimension @@ -338,6 +321,7 @@ __kernel void concatenate_width( * @note Vector sizes supported are 2,4,8 and 16. * @note The offset for the second spatial dimension has to be passed at compile time using -DHEIGHT_OFFSET. i.e. -DHEIGHT_OFFSET=128 * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * * @param[in] src_ptr Pointer to the source tensor.
Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -365,19 +349,20 @@ __kernel void concatenate_height( TENSOR4D_DECLARATION(src), TENSOR4D_DECLARATION(dst)) { - Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, DEPTH); - Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH); + const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE); + + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + (get_global_id(2) % DEPTH) * src_stride_z + (get_global_id( + 2) / DEPTH) * src_stride_w; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + (get_global_id(2) % DEPTH) * dst_stride_z + (get_global_id( + 2) / DEPTH) * dst_stride_w; - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr); + VEC_TYPE source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr); #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) - const VEC_QUANT out = requantize(source_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); - VSTORE(VEC_SIZE) - (out, 0, (__global DATA_TYPE *)(dst.ptr + HEIGHT_OFFSET * dst_stride_y)); + const VEC_QUANT out0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); + STORE_VECTOR_SELECT(out, DATA_TYPE, dst_addr + HEIGHT_OFFSET * dst_stride_y, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) #else /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ - VSTORE(VEC_SIZE) - (source_values, 0, (__global DATA_TYPE *)(dst.ptr + HEIGHT_OFFSET * dst_stride_y)); + STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + HEIGHT_OFFSET * dst_stride_y, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) #endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ } @@ -387,6 +372,7 @@ __kernel void concatenate_height( * * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * * @param[in] src_ptr Pointer to the source tensor.
Supported data types: All * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -411,17 +397,19 @@ __kernel void concatenate( TENSOR3D_DECLARATION(dst), int offset) { - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0); + + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr); + VEC_TYPE source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr); #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) - source_values = requantize(source_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); + source_values0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); #endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ - VSTORE(VEC_SIZE) - (source_values, 0, (__global DATA_TYPE *)(dst.ptr + offset)); + STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + offset, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } -#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) */ +#endif /* defined(VEC_SIZE_LEFTOVER) */ +#endif /* defined(DATA_TYPE) */ +#endif /* defined(VEC_SIZE) */ diff --git a/src/core/CL/cl_kernels/copy_tensor.cl b/src/core/CL/cl_kernels/copy_tensor.cl index 0592e07511..9c90969827 100644 --- a/src/core/CL/cl_kernels/copy_tensor.cl +++ b/src/core/CL/cl_kernels/copy_tensor.cl @@ -23,62 +23,13 @@ */ #include "helpers.h" -#if defined(PAD00) && defined(PAD10) && defined(PAD20) && defined(PAD21) && defined(PAD30) && defined(DATA_TYPE) && defined(VEC_SIZE) // Compile time constants - -/** Perform a padded copy of input tensor to the output tensor. Padding values are defined at compile time - * - * @attention The following variables must be passed at compile time: - * -# -DPAD{d}{0,1} = padding before{0} and after{1} dimension d (d < 4) - * -# -DDEPTH = The third dimension (depth) of the tensor (it is needed only if d == 3) - * -# -DDATA_TYPE = Input and output datatypes. - * - * @param[in] in_ptr Pointer to the source tensor. Supported data types: All - * @param[in] in_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] in_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] in_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] in_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] out_ptr Pointer to the destination tensor. 
Supported data types: same as @p in_ptr - * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] out_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] out_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] out_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void copy_pad_tensor( - TENSOR3D_DECLARATION(in), - TENSOR3D_DECLARATION(out)) - -{ - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - - const int offset_x = PAD00; - const int offset_y = PAD10; - const int offset_z = PAD20; - -#if PAD30 > 0 - const size_t in_batch = get_global_id(2) / DEPTH; - const int total_depth = DEPTH + PAD20 + PAD21; - const int offset_w = PAD30 * total_depth + in_batch * (PAD20 + PAD21); -#else // PAD30 == 0 - const int offset_w = 0; -#endif // PAD30 - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr); - - VSTORE(VEC_SIZE) - (data, 0, (__global DATA_TYPE *)tensor3D_offset(&out, offset_x, offset_y, offset_z + offset_w)); -} -#endif // Compile time constants - -#if defined(DATA_TYPE) +#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) /** Performs a copy of input tensor to the output tensor. + * + * @note The following variables must be passed at compile time: + * -# -DDATA_TYPE : Input and output datatypes. + * -# -DVEC_SIZE : The number of elements processed in X dimension + * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE * * @param[in] in_ptr Pointer to the source tensor. 
Supported data types: All * @param[in] in_stride_x Stride of the source tensor in X dimension (in bytes) @@ -104,25 +55,18 @@ __kernel void copy_tensor( Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in); Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); -#if defined(VEC_SIZE) - -#if defined(LAST_ACCESSED_X) - // Check if access on width gets out of bounds - // If it does then shift access vector to access elements within bounds - const int shift = max((int)(get_global_id(0) * VEC_SIZE) - (int)LAST_ACCESSED_X, 0); + // Boundary-aware access: + // If there's left-over in width (VEC_SIZE_LEFTOVER > 0): + // Shift all accesses other than the first to avoid accessing out of bounds + const int shift = max((int)(get_global_id(0) * VEC_SIZE) - (int)VEC_SIZE_LEFTOVER, 0) % VEC_SIZE; in.ptr -= shift * in.stride_x; out.ptr -= shift * out.stride_x; -#endif // defined(LAST_ACCESSED_X) // Load data VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr); + data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr); - // Store result - VSTORE(VEC_SIZE) - (data, 0, (__global DATA_TYPE *)out.ptr); -#else // defined(VEC_SIZE) - *((__global DATA_TYPE *)(out.ptr)) = *((__global DATA_TYPE *)(in.ptr)); -#endif // defined(VEC_SIZE) + // Boundary-aware store + STORE_VECTOR_SELECT(data, DATA_TYPE, (__global DATA_TYPE *)out.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); } -#endif // defined(DATA_TYPE) \ No newline at end of file +#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/depth_convert.cl b/src/core/CL/cl_kernels/depth_convert.cl index 75e6829cc0..046b26df01 100644 --- a/src/core/CL/cl_kernels/depth_convert.cl +++ b/src/core/CL/cl_kernels/depth_convert.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -43,6 +43,7 @@ * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT: * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * * @param[in] in_ptr Pointer to the source image.
Supported data types: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32 * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) @@ -67,24 +68,27 @@ __kernel void convert_depth_down( TENSOR3D_DECLARATION(out), const int shift) { - // Get pixels pointer - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); + + __global uchar *in_addr = in_ptr + in_offset_first_element_in_bytes + sizeof(DATA_TYPE_IN) * x_offs + get_global_id(1) * in_stride_y + get_global_id(2) * in_stride_z; + __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + sizeof(DATA_TYPE_OUT) * x_offs + get_global_id(1) * out_stride_y + get_global_id(2) * out_stride_z; // Load data VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) - in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in.ptr); + in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in_addr); #if defined(IS_DATA_TYPE_QUANTIZED) in_data ^= (VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE))0x80; #endif // defined(IS_DATA_TYPE_QUANTIZED) #if defined(IS_DATA_TYPE_FLOAT) - VSTORE(VEC_SIZE) - (CONVERT_DOWN(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)out.ptr); + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) + res0 = CONVERT_DOWN(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); + STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) #else /* defined(IS_DATA_TYPE_FLOAT) */ - VSTORE(VEC_SIZE) - (CONVERT_DOWN(in_data >> shift, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)out.ptr); + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) + res0 = CONVERT_DOWN(in_data >> shift, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); + STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) #endif /* defined(IS_DATA_TYPE_FLOAT) */ } @@ -93,6 +97,7 @@ __kernel void convert_depth_down( * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT: * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * * @param[in] in_ptr Pointer to the source image. 
Supported data types: U8/S8/U16/S16/U32/S32/F16/F32 * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) @@ -117,19 +122,22 @@ __kernel void convert_depth_up( TENSOR3D_DECLARATION(out), const int shift) { - // Get pixels pointer - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); + + __global uchar *in_addr = in_ptr + in_offset_first_element_in_bytes + sizeof(DATA_TYPE_IN) * x_offs + get_global_id(1) * in_stride_y + get_global_id(2) * in_stride_z; + __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + sizeof(DATA_TYPE_OUT) * x_offs + get_global_id(1) * out_stride_y + get_global_id(2) * out_stride_z; // Load data VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) - in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in.ptr); + in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in_addr); #if defined(IS_DATA_TYPE_FLOAT) - VSTORE(VEC_SIZE) - (CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)out.ptr); + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) + res0 = CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); + STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) #else /* defined(IS_DATA_TYPE_FLOAT) */ - VSTORE(VEC_SIZE) - (CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)) << shift, 0, (__global DATA_TYPE_OUT *)out.ptr); + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) + res0 = CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)) << shift; + STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) #endif /* defined(IS_DATA_TYPE_FLOAT) */ } diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl index e1f6505df7..81fa01ae99 100644 --- a/src/core/CL/cl_kernels/depthwise_convolution.cl +++ b/src/core/CL/cl_kernels/depthwise_convolution.cl @@ -370,7 +370,7 @@ __kernel void depthwise_convolution_3x3( pixels += (float2)bias; #endif //defined(HAS_BIAS) - vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels, A_VAL, B_VAL), 0, (__global float *)dst.ptr); + vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels, A_VAL, B_VAL), 0, (__global float *)dst.ptr); } #endif //defined(CONV_STRIDE_X) @@ -568,10 +568,10 @@ __kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32( pixels3 += (float2)bias; #endif /* defined(HAS_BIAS) */ - vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y)); - vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y)); - vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels2, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 2 * dst_stride_y)); - vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels3, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 3 * dst_stride_y)); + vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y)); + vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y)); + vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels2, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 2 * dst_stride_y)); + vstore2(ACTIVATION(ACTIVATION_TYPE, 
DATA_TYPE, VEC_SIZE, pixels3, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 3 * dst_stride_y)); } /** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both @@ -678,8 +678,8 @@ __kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32( pixels1 += (float2)bias; #endif /* defined(HAS_BIAS) */ - vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y)); - vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y)); + vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y)); + vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y)); } #endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32) @@ -1085,7 +1085,7 @@ __kernel void depthwise_convolution_3x3_f16( pixels += (half4)(*((__global half *)(biases.ptr + channel * biases_stride_x))); #endif //defined(HAS_BIAS) - vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels, A_VAL, B_VAL), 0, (__global half *)dst.ptr); + vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels, A_VAL, B_VAL), 0, (__global half *)dst.ptr); } #endif // defined(DEPTH_MULTIPLIER) #endif // defined(CONV_STRIDE_X) @@ -1207,10 +1207,10 @@ __kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16( pixels3 += (half4)bias; #endif /* defined(HAS_BIAS) */ - vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y)); - vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y)); - vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels2, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 2 * dst_stride_y)); - vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels3, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 3 * dst_stride_y)); + vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y)); + vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y)); + vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels2, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 2 * dst_stride_y)); + vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels3, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 3 * dst_stride_y)); } /** This OpenCL kernel is optimized for Bifrost architectures and computes 16bit floating point the depthwise convolution 3x3 @@ -1311,7 +1311,7 @@ __kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16( //3x3 Convolution of elements starting in 0th row pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y); //3x3 Convolution of elements starting in 2nd row - pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y); + pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y); #endif /* DILATION_X==1 && DILATION_Y==1 */ #ifdef HAS_BIAS @@ -1319,12 +1319,12 @@ __kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16( pixels1 += (half4)bias; #endif /* defined(HAS_BIAS) 
*/ - vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y)); - vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y)); + vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y)); + vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y)); } #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16) -#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) +#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(VEC_SIZE_LEFTOVER) /** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped * * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float @@ -1338,6 +1338,7 @@ __kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16( * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1) * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X) * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1) + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g.
-DACTIVATION_TYPE=relu * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively * @@ -1384,23 +1385,25 @@ __kernel void dwc_MxN_native_fp_nhwc( #endif // defined(HAS_BIAS) ) { + int x_offs = max((int)(get_global_id(0) * N0 - (N0 - VEC_SIZE_LEFTOVER) % N0), 0) * sizeof(DATA_TYPE); + int x = get_global_id(0); // channels int y = get_global_id(1); // spatial coordinate x #if defined(DST_DEPTH) int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y int b = get_global_id(2) / (int)DST_DEPTH; // batch #else // defined(DST_DEPTH) - int z = get_global_id(2); // spatial coordinate y + int z = get_global_id(2); // spatial coordinate y #endif // defined(DST_DEPTH) - __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)N0; + __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x_offs; - __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0 + y * dst_stride_y + z * dst_stride_z; + __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * (int)DEPTH_MULTIPLIER + y * dst_stride_y + z * dst_stride_z; - __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0; + __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offs * (int)DEPTH_MULTIPLIER; #if defined(HAS_BIAS) - __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0; + __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offs * (int)DEPTH_MULTIPLIER; #endif // defined(HAS_BIAS) #if defined(DST_DEPTH) @@ -1412,7 +1415,7 @@ __kernel void dwc_MxN_native_fp_nhwc( { // Each work-item computes N0x1x1 elements VEC_DATA_TYPE(DATA_TYPE, N0) - res = 0; + res0 = 0; int x_coord = y * CONV_STRIDE_X - (int)CONV_PAD_LEFT; int y_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP; @@ -1437,9 +1440,9 @@ __kernel void dwc_MxN_native_fp_nhwc( w = VLOAD(N0)(0, (__global DATA_TYPE *)(w_addr + w_offset)); #if GPU_ARCH == GPU_ARCH_MIDGARD - res += i * w; + res0 += i * w; #else // GPU_ARCH == GPU_ARCH_MIDGARD - res = fma(i, w, res); + res0 = fma(i, w, res0); #endif // GPU_ARCH == GPU_ARCH_MIDGARD } x_coord_tmp += DILATION_X; @@ -1449,13 +1452,12 @@ __kernel void dwc_MxN_native_fp_nhwc( } #if defined(HAS_BIAS) - res += VLOAD(N0)(0, (__global DATA_TYPE *)(b_addr)); + res0 += VLOAD(N0)(0, (__global DATA_TYPE *)(b_addr)); #endif // defined(HAS_BIAS) - res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, res, A_VAL, B_VAL); + res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, res0, A_VAL, B_VAL); - VSTORE(N0) - (res, 0, (__global DATA_TYPE *)(d_addr)); + STORE_VECTOR_SELECT(res, DATA_TYPE, d_addr, N0, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) w_addr += sizeof(DATA_TYPE); d_addr += sizeof(DATA_TYPE); @@ -1464,7 +1466,7 @@ __kernel void dwc_MxN_native_fp_nhwc( #endif // defined(HAS_BIAS) } } -#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) +#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DATA_TYPE) && defined(DILATION_X) &&
defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(VEC_SIZE_LEFTOVER) #if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE) @@ -1474,6 +1476,19 @@ __kernel void dwc_MxN_native_fp_nhwc( #define VEC_FLOAT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) +#define FILL_ZERO_OUT_OF_BOUND_3(data_type, vec_size, basename, cond) \ + ({ \ + basename##0 = select(basename##0, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s0)); \ + basename##1 = select(basename##1, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s1)); \ + basename##2 = select(basename##2, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s2)); \ + }) + +#define FILL_ZERO_OUT_OF_BOUND_4(data_type, vec_size, basename, cond) \ + ({ \ + FILL_ZERO_OUT_OF_BOUND_3(data_type, vec_size, basename, cond); \ + basename##3 = select(basename##3, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s3)); \ + }) + #if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) /** This function computes the depthwise convolution for NHWC data layout when the stride along the width or height is not 1. @@ -1485,9 +1500,13 @@ __kernel void dwc_MxN_native_fp_nhwc( * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1) * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X) * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1) + * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size + * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) + * @note In case of biases, -DHAS_BIAS must be passed at compile time + * @note If the output tensor has more than three dimensions, its third dimension must be passed at compile time using -DDST_DEPTH (e.g. -DDST_DEPTH=32) * * @param[in] src_ptr Pointer to the source tensor.
Supported data types: F16/F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -1526,14 +1545,15 @@ __kernel void dwc_MxN_native_fp_nhwc( __kernel void depthwise_convolution_3x3_nhwc( TENSOR4D_DECLARATION(src), TENSOR4D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights), + TENSOR3D_DECLARATION(weights) #if defined(HAS_BIAS) - VECTOR_DECLARATION(biases), + , + VECTOR_DECLARATION(biases) #endif /* defined(HAS_BIAS) */ - int max_offset) +) { - int x = get_global_id(0); // channels - int y = get_global_id(1); // spatial coordinate x + int x_offset = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - PARTIAL_STORE_N0) % VEC_SIZE), 0) * sizeof(DATA_TYPE); + int y = get_global_id(1); // spatial coordinate x #if defined(DST_DEPTH) int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y int b = get_global_id(2) / (int)DST_DEPTH; // batch @@ -1541,90 +1561,89 @@ __kernel void depthwise_convolution_3x3_nhwc( int z = get_global_id(2); // spatial coordinate y #endif // defined(DST_DEPTH) - Vector weights = CONVERT_TO_VECTOR_STRUCT(weights); + __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offset; #if defined(DST_DEPTH) - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w; + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset + b * src_stride_w; #else /* defined(DST_DEPTH) */ - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE; + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset; #endif /* defined(DST_DEPTH) */ - int z_coord = 0; - int4 offset = 0; - int4 y_offset = ((int4)(y * CONV_STRIDE_X) + (int4)(0, DILATION_X * 1, DILATION_X * 2, DILATION_X * 3) - CONV_PAD_LEFT) * (int4)src_stride_y; + int3 src_coord_y = (int3)(y * CONV_STRIDE_X - CONV_PAD_LEFT) + (int3)(0, DILATION_X, 2 * DILATION_X); + int3 src_coord_z = (int3)(z * CONV_STRIDE_Y - CONV_PAD_TOP) + (int3)(0, DILATION_Y, 2 * DILATION_Y); - // We compute 2x1x1 [C,W,H] elements - VEC_FLOAT acc = 0; + int3 src_offset_y = clamp(src_coord_y, (int3)0, (int3)(SRC_DIM_1 - 1)); + int3 src_offset_z = clamp(src_coord_z, (int3)0, (int3)(SRC_DIM_2 - 1)); + + // Use these vectors to check whether the unclamped load would have been out of bounds + src_coord_y = (src_offset_y != src_coord_y); + src_coord_z = (src_offset_z != src_coord_z); + + src_offset_y *= (int3)src_stride_y; + src_offset_z *= (int3)src_stride_z; + + // We compute VEC_SIZEx1x1 [C,W,H] elements + VEC_FLOAT acc0 = 0; // Load weights - VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z)); - VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z)); - VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z)); - VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z)); - VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z)); - VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z)); - VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z)); - VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr 
+ 1 * weights_stride_y + 2 * weights_stride_z)); - VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z)); + VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 0 * weights_stride_z)); + VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 0 * weights_stride_z)); + VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 0 * weights_stride_z)); + VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 1 * weights_stride_z)); + VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 1 * weights_stride_z)); + VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 1 * weights_stride_z)); + VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 2 * weights_stride_z)); + VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 2 * weights_stride_z)); + VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 2 * weights_stride_z)); // Load input values // z == 0 - // Clamp z_coord as for z = 0, it can be negative - // z_coord is casted to unsigned int in order to use just a min() operation - // A "-1" 32 bit signed variable converted to unsigned gives 4294967295 - z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP; - z_coord = min((uint)z_coord, (uint)SRC_DIM_2); - offset = y_offset + (int4)(z_coord * src_stride_z); - offset = min(offset, (int4)max_offset); - - VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); - VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); - VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); + VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s0)); + VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s1)); + VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s2)); + + FILL_ZERO_OUT_OF_BOUND_3(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int3)src_coord_z.s0); + + acc0 = fma(values0, w0, acc0); + acc0 = fma(values1, w1, acc0); + acc0 = fma(values2, w2, acc0); // z == 1 - // z_coord can be only negative for z = 0 so we do not need to clamp it - // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset - z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y; - offset = y_offset + (int4)(z_coord * src_stride_z); - VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); - VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); - VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); + values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s0)); + values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s1)); + values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s2)); + + FILL_ZERO_OUT_OF_BOUND_3(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int3)src_coord_z.s1); + + acc0 = fma(values0, w3, acc0); + acc0 = fma(values1, w4, acc0); + acc0 = fma(values2, w5, acc0); // z 
== 2 - // Offset can be out-of-bound so we need to check if it is greater than max_offset - z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2; - offset = y_offset + (int4)(z_coord * src_stride_z); - offset = min(offset, (int4)max_offset); - VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); - VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); - VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); - - acc = fma(values0, w0, acc); - acc = fma(values1, w1, acc); - acc = fma(values2, w2, acc); - - acc = fma(values3, w3, acc); - acc = fma(values4, w4, acc); - acc = fma(values5, w5, acc); - - acc = fma(values6, w6, acc); - acc = fma(values7, w7, acc); - acc = fma(values8, w8, acc); + values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s0)); + values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s1)); + values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s2)); + + FILL_ZERO_OUT_OF_BOUND_3(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int3)src_coord_z.s2); + + acc0 = fma(values0, w6, acc0); + acc0 = fma(values1, w7, acc0); + acc0 = fma(values2, w8, acc0); #if defined(HAS_BIAS) - Vector biases = CONVERT_TO_VECTOR_STRUCT(biases); - VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr); - acc += bias_values; + __global uchar *biases_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offset; + VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases_addr); + acc0 += bias_values; #endif // defined(HAS_BIAS) #if defined(DST_DEPTH) - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + y * dst_step_y + z * dst_step_z + b * dst_stride_w; #else /* defined(DST_DEPTH) */ - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + y * dst_step_y + z * dst_step_z; #endif /* defined(DST_DEPTH) */ - VSTORE(VEC_SIZE) - (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr)); + acc0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc0, A_VAL, B_VAL); + STORE_VECTOR_SELECT(acc, DATA_TYPE, dst_addr, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0) } #endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) @@ -1641,6 +1660,12 @@ __kernel void depthwise_convolution_3x3_nhwc( * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size + * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) + * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) + * @note The size of the output's second dimension must be passed at compile time using -DDST_DIM_1 (e.g. 
-DDST_DIM_1=64) + * @note The size of the output's third dimension must be passed at compile time using -DDST_DIM_2 (e.g. -DDST_DIM_2=32) + * @note In case of biases, -DHAS_BIAS must be passed at compile time + * @note If the output tensor has more than three dimensions, its third dimension must be passed at compile time using -DDST_DEPTH (e.g. -DDST_DEPTH=32) * * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -1679,14 +1704,15 @@ __kernel void depthwise_convolution_3x3_nhwc( __kernel void depthwise_convolution_3x3_nhwc_stride1( TENSOR4D_DECLARATION(src), TENSOR4D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights), + TENSOR3D_DECLARATION(weights) #if defined(HAS_BIAS) - VECTOR_DECLARATION(biases), + , + VECTOR_DECLARATION(biases) #endif /* defined(HAS_BIAS) */ - int max_offset) +) { - int x = get_global_id(0); // channels - int y = get_global_id(1); // spatial coordinate x + int x_offset = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - PARTIAL_STORE_N0) % VEC_SIZE), 0) * sizeof(DATA_TYPE); + int y = get_global_id(1); // spatial coordinate x #if defined(DST_DEPTH) int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y int b = get_global_id(2) / (int)DST_DEPTH; // batch @@ -1694,79 +1720,52 @@ __kernel void depthwise_convolution_3x3_nhwc_stride1( int z = get_global_id(2); // spatial coordinate y #endif // defined(DST_DEPTH) - Vector weights = CONVERT_TO_VECTOR_STRUCT(weights); + __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offset; #if defined(DST_DEPTH) - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w; + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset + b * src_stride_w; #else /* defined(DST_DEPTH) */ - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE; + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset; #endif /* defined(DST_DEPTH) */ - int z_coord = 0; - int4 offset = 0; - int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y; + int4 src_coord_y = (int4)(y * NUM_ROWS_PROCESSED - CONV_PAD_LEFT) + V_OFFS4(int); + int4 src_coord_z = (int4)(z * NUM_PLANES_PROCESSED - CONV_PAD_TOP) + V_OFFS4(int); + + int4 src_offset_y = clamp(src_coord_y, (int4)0, (int4)(SRC_DIM_1 - 1)); + int4 src_offset_z = clamp(src_coord_z, (int4)0, (int4)(SRC_DIM_2 - 1)); + + // Use these vectors to check whether the unclamped load would have been out of bounds + src_coord_y = (src_offset_y != src_coord_y); + src_coord_z = (src_offset_z != src_coord_z); - // We compute 2x2x2 [C,W,H] elements + src_offset_y *= (int4)src_stride_y; + src_offset_z *= (int4)src_stride_z; + + // We compute VEC_SIZEx2x2 [C,W,H] elements VEC_FLOAT acc0 = 0; VEC_FLOAT acc1 = 0; VEC_FLOAT acc2 = 0; VEC_FLOAT acc3 = 0; // Load weights - VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z)); - VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z)); - VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z)); - VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z)); - VEC_FLOAT w4 =
VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z)); - VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z)); - VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z)); - VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z)); - VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z)); + VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 0 * weights_stride_z)); + VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 0 * weights_stride_z)); + VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 0 * weights_stride_z)); + VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 1 * weights_stride_z)); + VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 1 * weights_stride_z)); + VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 1 * weights_stride_z)); + VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 2 * weights_stride_z)); + VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 2 * weights_stride_z)); + VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 2 * weights_stride_z)); // Load input values // z == 0 - // Clamp z_coord as for z = 0, it can be negative - // z_coord is casted to unsigned int in order to use just a min() operation - // A "-1" 32 bit signed variable converted to unsigned gives 4294967295 - z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP; - z_coord = min((uint)z_coord, (uint)SRC_DIM_2); - offset = y_offset + (int4)(z_coord * src_stride_z); - offset = min(offset, (int4)max_offset); - - VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); - VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); - VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); - VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3)); - - // z == 1 - // z_coord can be only negative for z = 0 so we do not need to clamp it - // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset - z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1; - offset = y_offset + (int4)(z_coord * src_stride_z); - VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); - VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); - VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); - VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3)); + VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s0)); + VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s1)); + VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s2)); + VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + 
src_offset_y.s3)); - // z == 2 - // After z = 1 we can simply add src_stride_z to offset without updating z_coord - // However offset can be out-of-bound so we need to check if it is greater than max_offset - offset += (int4)src_stride_z; - offset = min(offset, (int4)max_offset); - VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); - VEC_FLOAT values9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); - VEC_FLOAT values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); - VEC_FLOAT values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3)); - - // z == 3 - // After z = 1 we can simply add src_stride_z to offset without updating z_coord - // However offset can be out-of-bound so we need to check if it is greater than max_offset - offset += (int4)src_stride_z; - offset = min(offset, (int4)max_offset); - VEC_FLOAT values12 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0)); - VEC_FLOAT values13 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1)); - VEC_FLOAT values14 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2)); - VEC_FLOAT values15 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3)); + FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s0); acc0 = fma(values0, w0, acc0); acc0 = fma(values1, w1, acc0); @@ -1775,45 +1774,69 @@ __kernel void depthwise_convolution_3x3_nhwc_stride1( acc1 = fma(values2, w1, acc1); acc1 = fma(values3, w2, acc1); - acc0 = fma(values4, w3, acc0); - acc0 = fma(values5, w4, acc0); - acc0 = fma(values6, w5, acc0); - acc1 = fma(values5, w3, acc1); - acc1 = fma(values6, w4, acc1); - acc1 = fma(values7, w5, acc1); - - acc0 = fma(values8, w6, acc0); - acc0 = fma(values9, w7, acc0); - acc0 = fma(values10, w8, acc0); - acc1 = fma(values9, w6, acc1); - acc1 = fma(values10, w7, acc1); - acc1 = fma(values11, w8, acc1); - - acc2 = fma(values4, w0, acc2); - acc2 = fma(values5, w1, acc2); - acc2 = fma(values6, w2, acc2); - acc3 = fma(values5, w0, acc3); - acc3 = fma(values6, w1, acc3); - acc3 = fma(values7, w2, acc3); - - acc2 = fma(values8, w3, acc2); - acc2 = fma(values9, w4, acc2); - acc2 = fma(values10, w5, acc2); - acc3 = fma(values9, w3, acc3); - acc3 = fma(values10, w4, acc3); - acc3 = fma(values11, w5, acc3); - - acc2 = fma(values12, w6, acc2); - acc2 = fma(values13, w7, acc2); - acc2 = fma(values14, w8, acc2); - acc3 = fma(values13, w6, acc3); - acc3 = fma(values14, w7, acc3); - acc3 = fma(values15, w8, acc3); + // z == 1 + values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s0)); + values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s1)); + values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s2)); + values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s3)); + + FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s1); + + acc0 = fma(values0, w3, acc0); + acc0 = fma(values1, w4, acc0); + acc0 = fma(values2, w5, acc0); + acc1 = fma(values1, w3, acc1); + acc1 = fma(values2, w4, acc1); + acc1 = fma(values3, w5, acc1); + + acc2 = fma(values0, w0, acc2); + acc2 = fma(values1, w1, acc2); + acc2 = fma(values2, w2, acc2); + acc3 = fma(values1, w0, acc3); + acc3 = fma(values2, w1, acc3); + acc3 = fma(values3, w2, acc3); + + // z == 2 + values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + 
src_offset_z.s2 + src_offset_y.s0)); + values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s1)); + values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s2)); + values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s3)); + + FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s2); + + acc0 = fma(values0, w6, acc0); + acc0 = fma(values1, w7, acc0); + acc0 = fma(values2, w8, acc0); + acc1 = fma(values1, w6, acc1); + acc1 = fma(values2, w7, acc1); + acc1 = fma(values3, w8, acc1); + + acc2 = fma(values0, w3, acc2); + acc2 = fma(values1, w4, acc2); + acc2 = fma(values2, w5, acc2); + acc3 = fma(values1, w3, acc3); + acc3 = fma(values2, w4, acc3); + acc3 = fma(values3, w5, acc3); + + // z == 3 + values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s0)); + values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s1)); + values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s2)); + values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s3)); + + FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s3); + + acc2 = fma(values0, w6, acc2); + acc2 = fma(values1, w7, acc2); + acc2 = fma(values2, w8, acc2); + acc3 = fma(values1, w6, acc3); + acc3 = fma(values2, w7, acc3); + acc3 = fma(values3, w8, acc3); #if defined(HAS_BIAS) - Vector biases = CONVERT_TO_VECTOR_STRUCT(biases); + __global uchar *biases_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offset; - VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr); + VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases_addr); acc0 += bias_values; acc1 += bias_values; @@ -1821,25 +1844,34 @@ __kernel void depthwise_convolution_3x3_nhwc_stride1( acc3 += bias_values; #endif // defined(HAS_BIAS) + int2 dst_offset_y = min((int2)(y * NUM_ROWS_PROCESSED) + V_OFFS2(int), (int2)(DST_DIM_1 - 1)) * (int2)dst_stride_y; + int dst_coord_z = z * NUM_PLANES_PROCESSED; + #if defined(DST_DEPTH) - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w; -#else /* defined(DST_DEPTH) */ - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z; -#endif /* defined(DST_DEPTH) */ + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + dst_coord_z * dst_stride_z + b * dst_stride_w; +#else // defined(DST_DEPTH) + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + dst_coord_z * dst_stride_z; +#endif // defined(DST_DEPTH) - VSTORE(VEC_SIZE) - (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc0, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)); - VSTORE(VEC_SIZE) - (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc1, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)); + /* Store vectors in reverse order along the Y. The Y offsets are calculated so that they are forced to be in bound. + * If only the first address is in bound, the Y offset of the second address will be brought back and there will be 2 writes in the same location for the same thread. 
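+ * For example (assuming V_OFFS2(int) expands to (int2)(0, 1)): with DST_DIM_1 = 5, NUM_ROWS_PROCESSED = 2 and y = 2, dst_offset_y = min((int2)(4, 5), (int2)(4, 4)) * (int2)dst_stride_y, so both stores target the last valid row and the acc1 write is followed by the acc0 write at the same address.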
+ * Since the last vector to be written is always the valid one for that location, it overwrites the wrong values. + */ + values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc1, A_VAL, B_VAL); + STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_offset_y.s1, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0) + + values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc0, A_VAL, B_VAL); + STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_offset_y.s0, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0) #if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0) - if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2) + if((dst_coord_z + 1) < DST_DIM_2) #endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0) { - VSTORE(VEC_SIZE) - (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc2, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z)); - VSTORE(VEC_SIZE) - (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc3, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z)); + values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc3, A_VAL, B_VAL); + STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_stride_z + dst_offset_y.s1, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0) + + values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc2, A_VAL, B_VAL); + STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_stride_z + dst_offset_y.s0, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0) } } diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl index d4bea4b2e8..95cd44eb78 100644 --- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl +++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl @@ -1616,7 +1616,7 @@ __kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc( #endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER)) -#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER) +#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER) && defined(VEC_SIZE_LEFTOVER) /** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped * * @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2) @@ -1629,6 +1629,7 @@ __kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc( * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1) * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. 
-DCONV_STRIDE_Y=X) * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1) + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively * @@ -1685,8 +1686,8 @@ __kernel void dwc_MxN_native_quantized8_nhwc( #endif // defined(HAS_BIAS) ) { - int x = get_global_id(0); // channels - int y = get_global_id(1); // spatial coordinate x + int x_offs = max((int)(get_global_id(0) * N0 - (N0 - VEC_SIZE_LEFTOVER) % N0), 0); + int y = get_global_id(1); // spatial coordinate x #if defined(DST_DEPTH) int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y int b = get_global_id(2) / (int)DST_DEPTH; // batch @@ -1694,19 +1695,19 @@ __kernel void dwc_MxN_native_quantized8_nhwc( int z = get_global_id(2); // spatial coordinate y #endif // defined(DST_DEPTH) - __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)N0; + __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE); - __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0 + y * dst_stride_y + z * dst_stride_z; + __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER + y * dst_stride_y + z * dst_stride_z; - __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x * sizeof(WEIGHTS_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0; + __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offs * sizeof(WEIGHTS_TYPE) * (int)DEPTH_MULTIPLIER; #if defined(HAS_BIAS) - __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0; + __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offs * sizeof(int) * (int)DEPTH_MULTIPLIER; #endif // defined(HAS_BIAS) #if defined(PER_CHANNEL_QUANTIZATION) - __global uchar *out_mul_addr = output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0; - __global uchar *out_shift_addr = output_shifts_ptr + output_shifts_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0; + __global uchar *out_mul_addr = output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes + x_offs * sizeof(int) * (int)DEPTH_MULTIPLIER; + __global uchar *out_shift_addr = output_shifts_ptr + output_shifts_offset_first_element_in_bytes + x_offs * sizeof(int) * (int)DEPTH_MULTIPLIER; #endif // defined(PER_CHANNEL_QUANTIZATION) #if defined(DST_DEPTH) @@ -1772,10 +1773,10 @@ __kernel void dwc_MxN_native_quantized8_nhwc( res += (VEC_INT)OUTPUT_OFFSET; VEC_TYPE(VEC_SIZE) - res1 = CONVERT_SAT(res, VEC_TYPE(VEC_SIZE)); + res0 = CONVERT_SAT(res, VEC_TYPE(VEC_SIZE)); + res0 = ACTIVATION_FUNC(res0); - VSTORE(N0) - (ACTIVATION_FUNC(res1), 0, (__global DATA_TYPE *)(d_addr)); + STORE_VECTOR_SELECT(res, DATA_TYPE, d_addr, N0, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) #if DEPTH_MULTIPLIER > 1 w_addr += sizeof(WEIGHTS_TYPE); @@ -1790,5 +1791,5 @@ __kernel
void dwc_MxN_native_quantized8_nhwc( } #endif // DEPTH_MULTIPLIER > 1 } -#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER) +#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER) && defined(VEC_SIZE_LEFTOVER) #endif // defined(DATA_TYPE) && defined(WEIGHTS_TYPE) diff --git a/src/core/CL/cl_kernels/elementwise_operation.cl b/src/core/CL/cl_kernels/elementwise_operation.cl index 52a3309e96..ea25082a6c 100644 --- a/src/core/CL/cl_kernels/elementwise_operation.cl +++ b/src/core/CL/cl_kernels/elementwise_operation.cl @@ -38,12 +38,17 @@ #define SQUARED_DIFF(x, y) (x - y) * (x - y) #define DIV(x, y) (x / y) #define POWER(x, y) pow(x, y) -#define PRELU(x, y) (select(y * x, x, x > (DATA_TYPE_OUT)0)) +#define PRELU(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE_OUT)0), SELECT_VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)))) + +#if defined(VEC_SIZE_OUT) && defined(DATA_TYPE_OUT) +#define AND(x, y) (CONVERT((x && y), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)) & 1) +#define OR(x, y) (CONVERT((x || y), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)) & 1) +#endif // defined(VEC_SIZE_OUT) && defined(DATA_TYPE_OUT) #define OP_FUN_NAME_STR(op) elementwise_operation_##op #define OP_FUN_NAME(op) OP_FUN_NAME_STR(op) -#if defined(OP) && defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE) +#if defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT) #if defined(ACTIVATION_TYPE) #include "activation_float_helpers.h" @@ -51,11 +56,12 @@ /** This function executes an element-wise operation among two tensors. * - * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: + * @note Vector sizes of inputs and output have to be passed at compile time using -DVEC_SIZE_IN1, -DVEC_SIZE_IN2, -DVEC_SIZE_OUT. + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE_OUT + * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short - * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used. - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD) + * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
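+ * @note As a worked example with illustrative values: for -DVEC_SIZE_OUT=16 and a first dimension of 35, VEC_SIZE_LEFTOVER is 3 and out_x_offs = max((int)(get_global_id(0) * 16 - (16 - 3) % 16), 0), so work-item 0 starts at element 0 and stores only the first 3 elements (the STORE_VECTOR_SELECT condition), while work-items 1 and 2 start at elements 3 and 19 and store full 16-element vectors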
+ * @note The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD) * * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32 * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) @@ -87,24 +93,36 @@ __kernel void OP_FUN_NAME(OP)( TENSOR3D_DECLARATION(in2), TENSOR3D_DECLARATION(out)) { +#if VEC_SIZE_IN1 == 1 + uint in1_x_offs = 0; +#else // VEC_SIZE_IN1 == 1 + uint in1_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN1 - (VEC_SIZE_IN1 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN1), 0); +#endif // VEC_SIZE_IN1 == 1 +#if VEC_SIZE_IN2 == 1 + uint in2_x_offs = 0; +#else // VEC_SIZE_IN2 == 1 + uint in2_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN2 - (VEC_SIZE_IN2 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN2), 0); +#endif // VEC_SIZE_IN2 == 1 + uint out_x_offs = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0); + // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + in1_x_offs * sizeof(DATA_TYPE_IN1) + get_global_id(1) * in1_step_y + get_global_id(2) * in1_step_z; + __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + in2_x_offs * sizeof(DATA_TYPE_IN2) + get_global_id(1) * in2_step_y + get_global_id(2) * in2_step_z; + __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + out_x_offs * sizeof(DATA_TYPE_OUT) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z; // Load values - VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) - in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); - VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) - in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT) + in_a = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN1, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_IN1 *)in1_addr)), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)); + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT) + in_b = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN2, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_IN2 *)in2_addr)), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)); // Calculate and store result + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT) + res0 = OP(in_a, in_b); #if defined(ACTIVATION_TYPE) - VSTORE(VEC_SIZE) - (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE_OUT, CONVERT(OP(in_a, in_b), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), A_VAL, B_VAL), 0, (__global DATA_TYPE_OUT *)out.ptr); -#else // defined(ACTIVATION_TYPE) - VSTORE(VEC_SIZE) - (OP(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr); + res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE_OUT, VEC_SIZE_OUT, res0, A_VAL, B_VAL); #endif // defined(ACTIVATION_TYPE) + + STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } -#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE) */ +#endif /* defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT) */ diff --git a/src/core/CL/cl_kernels/elementwise_operation_quantized.cl b/src/core/CL/cl_kernels/elementwise_operation_quantized.cl index eb57da828d..a08c3b2d47 100644 --- 
a/src/core/CL/cl_kernels/elementwise_operation_quantized.cl +++ b/src/core/CL/cl_kernels/elementwise_operation_quantized.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,7 +28,7 @@ #define MAX(x, y) max((x), (y)) #define MIN(x, y) min((x), (y)) #define SQUARED_DIFF(x, y) (x - y) * (x - y) -#define PRELU(x, y) (select(y * x, x, x > (DATA_TYPE_OUT)0)) +#define PRELU(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE_OUT)0), SELECT_VEC_DATA_TYPE(float, VEC_SIZE_OUT)))) #define DIV(x, y) (x / y) #define CONVERT_RTE(x, type) (convert_##type##_rte((x))) @@ -37,25 +37,27 @@ #define OP_FUN_NAME_STR(op) elementwise_operation_##op##_quantized #define OP_FUN_NAME(op) OP_FUN_NAME_STR(op) -#if defined(OP) && defined(VEC_SIZE) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) +#if defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) -#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) -#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) -#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) +#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT) +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE_OUT) +#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT) /** This function executes an element-wise operation among two tensors. * - * @attention The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10 - * @attention The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10 - * @attention The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, i.e. -DOFFSET_OUT=10 - * @attention The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10 - * @attention The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10 - * @attention The quantization scale of the output must be passed at compile time using -DSCALE_OUT, i.e. -DSCALE_OUT=10 - * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used. - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD) - * @attention For QSYMM16 operations OFFSET_IN1, OFFSET_IN2 and OFFSET_OUT must be set to zero - * @attention The data type must be passed at compile time using -DDATA_TYPE_OUT, i.e. -DDATA_TYPE_OUT=uchar + * @note Vector sizes of inputs and output have to be passed at compile time using -DVEC_SIZE_IN1, -DVEC_SIZE_IN2, -DVEC_SIZE_OUT. + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE_OUT + * @note In case of broadcasting along the X dimension the proper preprocessor argument should be passed depending on the input (e.g.
-DIS_IN1_X_BROADCASTING, -DIS_IN2_X_BROADCASTING) + * @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10 + * @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10 + * @note The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, i.e. -DOFFSET_OUT=10 + * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10 + * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10 + * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, i.e. -DSCALE_OUT=10 + * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used. + * @note The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD) + * @note For QSYMM16 operations OFFSET_IN1, OFFSET_IN2 and OFFSET_OUT must be set to zero + * @note The data type must be passed at compile time using -DDATA_TYPE_OUT, i.e. -DDATA_TYPE_OUT=uchar * * @param[in] in1_ptr Pointer to the source tensor. Supported data types: QASYMM8/QSYMM16 * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) @@ -87,13 +89,25 @@ __kernel void OP_FUN_NAME(OP)( TENSOR3D_DECLARATION(in2), TENSOR3D_DECLARATION(out)) { +#if VEC_SIZE_IN1 == 1 + uint in1_x_offs = 0; +#else // VEC_SIZE_IN1 == 1 + uint in1_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN1 - (VEC_SIZE_IN1 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN1), 0); +#endif // VEC_SIZE_IN1 == 1 +#if VEC_SIZE_IN2 == 1 + uint in2_x_offs = 0; +#else // VEC_SIZE_IN2 == 1 + uint in2_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN2 - (VEC_SIZE_IN2 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN2), 0); +#endif // VEC_SIZE_IN2 == 1 + uint out_x_offs = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0); + // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + in1_x_offs * sizeof(DATA_TYPE_OUT) + get_global_id(1) * in1_step_y + get_global_id(2) * in1_step_z; + __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + in2_x_offs * sizeof(DATA_TYPE_OUT) + get_global_id(1) * in2_step_y + get_global_id(2) * in2_step_z; + __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + out_x_offs * sizeof(DATA_TYPE_OUT) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z; - VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_OUT *)in1.ptr), VEC_INT); - VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_OUT *)in2.ptr), VEC_INT); + VEC_INT in_a = CONVERT((VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_OUT *)in1_addr)), VEC_INT); + VEC_INT in_b = CONVERT((VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_OUT *)in2_addr)), VEC_INT); in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1)); in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2)); @@ -101,10 +115,9 @@ __kernel void OP_FUN_NAME(OP)( const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1); const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2); const 
VEC_FLOAT qresf32 = OP(in1f32, in2f32) / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFFSET_OUT)); - const VEC_TYPE res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_TYPE); + const VEC_TYPE res0 = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_TYPE); // Store result - VSTORE(VEC_SIZE) - (res, 0, (__global DATA_TYPE_OUT *)out.ptr); + STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } -#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) */ +#endif /* defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) */ diff --git a/src/core/CL/cl_kernels/elementwise_unary.cl b/src/core/CL/cl_kernels/elementwise_unary.cl index 3e557c0550..63594aea83 100644 --- a/src/core/CL/cl_kernels/elementwise_unary.cl +++ b/src/core/CL/cl_kernels/elementwise_unary.cl @@ -41,9 +41,11 @@ // Calculate round (Cannot use round function as it rounds halfway cases away from zero). #if defined(VEC_SIZE) #define round_op(input) CONVERT(CONVERT_SAT_ROUND(input, VEC_DATA_TYPE(int, VEC_SIZE), rte), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)) -#else // defined(VEC_SIZE +#define logical_not_op(input) CONVERT((!input) & 0x1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)) +#else // defined(VEC_SIZE) #define round_op(input) CONVERT(CONVERT_SAT_ROUND(input, int, rte), DATA_TYPE) -#endif // defined(VEC_SIZE +#define logical_not_op(input) ((!input) & 0x1) +#endif // defined(VEC_SIZE) /** Applies element wise unary operator in a tensor. * diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl index 4ad22ec830..b6afb85aa4 100644 --- a/src/core/CL/cl_kernels/gemm.cl +++ b/src/core/CL/cl_kernels/gemm.cl @@ -1121,7 +1121,7 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_INPUT_AS_3D) // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply lhs_stride_z by DEPTH_GEMM3D @@ -1227,7 +1227,7 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_OUTPUT_AS_3D) // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); // Add offset for batched GEMM. 
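The leftover handling introduced above is worth illustrating on its own. Below is a minimal C sketch (not part of the patch) of the offset formula max(gid * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE, 0) used for in1_x_offs, in2_x_offs and out_x_offs, and, by the same pattern, presumably by COMPUTE_M0_START_ROW in the gemm hunks; x_offset, vec, leftover and width are illustrative names. Every work-item is shifted back by the same amount, so only the first one needs the partial STORE_VECTOR_SELECT store and the last full-width access ends exactly at the tensor boundary.

#include <stdio.h>

/* Mirrors max((int)(gid * vec - (vec - leftover) % vec), 0) from the kernel. */
static int x_offset(int gid, int vec, int leftover)
{
    int offs = gid * vec - (vec - leftover) % vec;
    return offs > 0 ? offs : 0;
}

int main(void)
{
    const int width = 11, vec = 4;    /* two full vectors plus 3 leftover elements */
    const int leftover = width % vec; /* plays the role of VEC_SIZE_LEFTOVER */
    for (int gid = 0; gid < (width + vec - 1) / vec; ++gid)
        printf("gid %d covers [%d, %d)\n", gid,
               x_offset(gid, vec, leftover), x_offset(gid, vec, leftover) + vec);
    /* gid 0 covers [0, 4) but stores only the 3 leftover elements (the
       STORE_VECTOR_SELECT condition fires for gid 0); gid 1 covers [3, 7)
       and gid 2 covers [7, 11), ending exactly at the tensor width. */
    return 0;
}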
The batches will be in the fourth dimension and for this reason we // multiply dst_stride_z by DEPTH_GEMM3D @@ -1275,14 +1275,14 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), #endif // defined(BETA) #if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL); #endif // defined(ACTIVATION_TYPE) const bool cond_y = y == 0; const bool cond_x = ((x + 1) * N0 >= N); // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); #undef RHS_BLOCK_SIZE #undef RHS_OFFSET_X @@ -1418,7 +1418,7 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_INPUT_AS_3D) // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply lhs_stride_z by DEPTH_GEMM3D @@ -1573,7 +1573,7 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_OUTPUT_AS_3D) // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply dst_stride_z by DEPTH_GEMM3D @@ -1621,14 +1621,14 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs), #endif // defined(BETA) #if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL); #endif // defined(ACTIVATION_TYPE) const bool cond_y = y == 0; const bool cond_x = ((x + 1) * N0 >= N); // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); #undef RHS_BLOCK_SIZE #undef RHS_OFFSET_X @@ -1839,7 +1839,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_INPUT_AS_3D) // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we // multiply lhs_stride_z by DEPTH_GEMM3D @@ -1969,7 +1969,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_OUTPUT_AS_3D) // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply dst_stride_z by DEPTH_GEMM3D @@ -2017,14 +2017,14 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs), #endif // defined(BETA) #if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL); #endif // defined(ACTIVATION_TYPE) const bool cond_y = y == 0; const bool cond_x = ((x + 1) * N0 >= N); // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); #undef RHS_BLOCK_SIZE #undef RHS_OFFSET_X @@ -2157,7 +2157,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_INPUT_AS_3D) // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply lhs_stride_z by DEPTH_GEMM3D @@ -2278,7 +2278,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_OUTPUT_AS_3D) // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we // multiply dst_stride_z by DEPTH_GEMM3D @@ -2326,14 +2326,14 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs), #endif // defined(BETA) #if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL); #endif // defined(ACTIVATION_TYPE) const bool cond_y = y == 0; const bool cond_x = ((x + 1) * N0 >= N); // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); #undef RHS_BLOCK_SIZE #undef RHS_OFFSET_X @@ -2704,7 +2704,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_OUTPUT_AS_3D) // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply dst_stride_z by DEPTH_GEMM3D dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D; @@ -2763,9 +2763,9 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), #if defined(ACTIVATION_TYPE) #if defined(MIXED_PRECISION) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL); + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL); #else // defined(MIXED_PRECISION) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL); #endif // defined(MIXED_PRECISION) #endif // defined(ACTIVATION_TYPE) @@ -2775,9 +2775,9 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), // Store output block #if defined(MIXED_PRECISION) CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp); - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); #else // defined(MIXED_PRECISION) - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); #endif // defined(MIXED_PRECISION) #undef LHS_BLOCK_SIZE @@ -2974,7 +2974,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_OUTPUT_AS_3D) // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); // Add offset for batched GEMM. 
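The CALCULATE_Z_OFFSET hunks all make the same change: the macro is now given the first output row of the block, either y * M0 for the reshaped-lhs kernels or COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) where partial stores shift the block, rather than the bare block index it previously received. Below is a C sketch of the per-row cross-plane offset this feeds, following the expanded form still visible in the removed kernels further down; z_offset and its parameter names are illustrative.

#include <stdio.h>

/* Per-row cross-plane offset, following the expanded form in the removed
   kernels: plane = min(row / HEIGHT_GEMM3D, DEPTH_GEMM3D - 1), scaled by
   the padding inserted between consecutive planes. */
static unsigned z_offset(unsigned row, unsigned height_gemm3d, unsigned depth_gemm3d,
                         unsigned cross_plane_pad, unsigned stride_y)
{
    unsigned plane = row / height_gemm3d;
    if (plane > depth_gemm3d - 1)
        plane = depth_gemm3d - 1;
    return plane * cross_plane_pad * stride_y;
}

int main(void)
{
    /* An M0 = 4 row block starting at row 6 with HEIGHT_GEMM3D = 8 straddles
       two planes: rows 6-7 stay in plane 0, rows 8-9 move to plane 1 and pick
       up the cross-plane padding. */
    for (unsigned i = 0; i < 4; ++i)
        printf("row %u -> z offset %u\n", 6 + i, z_offset(6 + i, 8, 4, 2, 16));
    return 0;
}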
The batches will be in the fourth dimension and for this reason we // multiply dst_stride_z by DEPTH_GEMM3D dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D; @@ -3033,9 +3033,9 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs), #if defined(ACTIVATION_TYPE) #if defined(MIXED_PRECISION) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL); + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL); #else // defined(MIXED_PRECISION) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL); #endif // defined(MIXED_PRECISION) #endif // defined(ACTIVATION_TYPE) @@ -3045,9 +3045,9 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs), // Store output block #if defined(MIXED_PRECISION) CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp); - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); #else // defined(MIXED_PRECISION) - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); #endif // defined(MIXED_PRECISION) #undef LHS_BLOCK_SIZE @@ -3469,7 +3469,7 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_OUTPUT_AS_3D) // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we // multiply dst_stride_z by DEPTH_GEMM3D dst_addr += z * dst_stride_z * DEPTH_GEMM3D; @@ -3527,9 +3527,9 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs), #if defined(ACTIVATION_TYPE) #if defined(MIXED_PRECISION) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL); + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL); #else // defined(MIXED_PRECISION) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL); #endif // defined(MIXED_PRECISION) #endif // defined(ACTIVATION_TYPE) @@ -3539,9 +3539,9 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs), // Store output block #if defined(MIXED_PRECISION) CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp); - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); #else // defined(MIXED_PRECISION) - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); #endif // defined(MIXED_PRECISION) #undef LHS_BLOCK_SIZE @@ -3837,7 +3837,7 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_OUTPUT_AS_3D) // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we // multiply dst_stride_z by DEPTH_GEMM3D dst_addr += z * dst_stride_z * DEPTH_GEMM3D; @@ -3894,9 +3894,9 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs), #if defined(ACTIVATION_TYPE) #if defined(MIXED_PRECISION) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL); + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL); #else // defined(MIXED_PRECISION) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL); #endif // defined(MIXED_PRECISION) #endif // defined(ACTIVATION_TYPE) @@ -3906,9 +3906,9 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs), // Store output block #if defined(MIXED_PRECISION) CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp); - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); #else // defined(MIXED_PRECISION) - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); #endif // defined(MIXED_PRECISION) #undef LHS_BLOCK_SIZE @@ -4120,7 +4120,7 @@ __kernel void gemm_mm_native(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_INPUT_AS_3D) // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply lhs_stride_z by DEPTH_GEMM3D @@ -4232,7 +4232,7 @@ __kernel void gemm_mm_native(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_OUTPUT_AS_3D) // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); // Add offset for batched GEMM. 
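The STORE_BLOCK_BOUNDARY_AWARE call sites in these hunks drop the trailing N argument; the caller now derives both boundary flags itself (cond_y = y == 0 and cond_x = ((x + 1) * N0 >= N)), so the macro no longer needs N. Below is a rough C sketch of the selection the macro performs, assuming it picks partial or full extents per axis from the two flags; store_block and its parameters are illustrative, and the real expansion in gemm_helpers.h may differ.

#include <stdio.h>
#include <string.h>

/* Boundary-aware store of an m0 x n0 accumulator block: the first row block
   holds only partial_m0 valid rows and the last column block only partial_n0
   valid columns, so the extents are chosen from the two flags. */
static void store_block(const float *c, float *dst, int dst_stride,
                        int m0, int n0, int partial_m0, int partial_n0,
                        int cond_y, int cond_x)
{
    const int rows = cond_y ? partial_m0 : m0;
    const int cols = cond_x ? partial_n0 : n0;
    for (int r = 0; r < rows; ++r)
        memcpy(dst + r * dst_stride, c + r * n0, cols * sizeof(float));
}

int main(void)
{
    float c[2 * 4] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    float dst[2 * 4] = { 0 };
    /* Last column block with N0 = 4 but only PARTIAL_STORE_N0 = 3 valid
       columns: three values per row are written, the fourth stays zero. */
    store_block(c, dst, 4, 2, 4, 1, 3, 0, 1);
    printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]); /* 1 2 3 0 */
    return 0;
}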
The batches will be in the fourth dimension and for this reason we // multiply dst_stride_z by DEPTH_GEMM3D @@ -4280,14 +4280,14 @@ __kernel void gemm_mm_native(IMAGE_DECLARATION(lhs), #endif // defined(BETA) #if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL); #endif // defined(ACTIVATION_TYPE) const bool cond_y = y == 0; const bool cond_x = ((x + 1) * N0 >= N); // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); #undef RHS_BLOCK_SIZE #undef RHS_OFFSET_X @@ -4295,3211 +4295,6 @@ __kernel void gemm_mm_native(IMAGE_DECLARATION(lhs), } #endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE) -#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT) -/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1) - * - * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA - * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2) - * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2) - * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) - * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) - * - * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. - * The activation function is performed after the bias addition - * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time: - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped - * - * @param[in] src0_ptr Pointer to the source matrix. 
Supported data types: F32 - * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr - * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr - * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0), - IMAGE_DECLARATION(src1), -#if defined(BETA) - IMAGE_DECLARATION(src2), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint src0_stride_z, - uint src1_stride_z, -#if defined(BETA) - uint src2_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; - int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; - int z = get_global_id(2); - - // Offset - const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; - const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4; - - // src_addr_a = address of matrix A - // src_addr_b = 
address of matrix B - int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; - int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; -#else // defined(MATRIX_B_DEPTH) - src1_addr_in_bytes += z * src1_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes); - __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes); - - // Compute end row address for matrix B - __global float *src_end_addr_b = src_addr_b + COLS_B; - - src_addr_a += offset_row_a; - src_addr_b += offset_row_b; - - // Reset accumulators - float4 c0 = 0.0f; - float4 c1 = 0.0f; - float4 c2 = 0.0f; - float4 c3 = 0.0f; - - for(; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - float4 a0 = vload4(0, src_addr_a); - float4 b0 = vload4(0, src_addr_b); - - c0 += (float4)a0.s0 * b0; - c1 += (float4)a0.s1 * b0; - c2 += (float4)a0.s2 * b0; - c3 += (float4)a0.s3 * b0; - - // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT); - b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH); - - c0 += (float4)a0.s0 * b0; - c1 += (float4)a0.s1 * b0; - c2 += (float4)a0.s2 * b0; - c3 += (float4)a0.s3 * b0; - } - - for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - float4 a0 = vload4(0, src_addr_a); - float4 b0 = vload4(0, src_addr_b); - - c0 += (float4)a0.s0 * b0; - c1 += (float4)a0.s1 * b0; - c2 += (float4)a0.s2 * b0; - c3 += (float4)a0.s3 * b0; - } - - // Compute destination address - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Compute dst address - __global uchar *dst_addr = offset(&dst, 0, 0); - - uint4 zout = 0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D - zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; - zout = min(DEPTH_GEMM3D - 1, zout); - - // Add offset due to the cross plane paddings - zout *= (cross_plane_pad * dst_stride_y); - - // Add offset for batched GEMM. 
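The inner loop of the removed kernel above is a sequence of rank-1 updates: each step reads four interleaved A values (one per output row of the 4x4 tile) and a transposed 4-wide row of B, and accumulates their outer product into c0..c3. Below is a C sketch of one such update; rank1_update is an illustrative name, and the Bifrost variant further down unrolls the same update into scalar fma calls.

#include <stdio.h>

/* One inner-loop step of the removed kernel: accumulate the outer product
   of four interleaved A values and a transposed 4-wide row of B into the
   4x4 output tile (c0..c3 in the kernel). */
static void rank1_update(float c[4][4], const float a[4], const float b[4])
{
    for (int row = 0; row < 4; ++row)
        for (int col = 0; col < 4; ++col)
            c[row][col] += a[row] * b[col];
}

int main(void)
{
    float c[4][4] = { { 0 } };
    const float a[4] = { 1, 2, 3, 4 }; /* one interleaved column of A */
    const float b[4] = { 5, 6, 7, 8 }; /* one transposed row of B */
    rank1_update(c, a, b);
    printf("c[1][2] = %g\n", c[1][2]); /* 2 * 7 = 14 */
    return 0;
}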
The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; -#else // defined(REINTERPRET_OUTPUT_AS_3D) - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(4, float, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) - REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); - -#if defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)); - - LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, float, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(4, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id( - 2) * src2_stride_z; - - LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(4, float, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(4, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - // Store 4x4 block - vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0)); - vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1)); - vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2)); - vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3)); -} - -/** This OpenCL kernel is optimized for Bifrost and it computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1) - * - * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA - * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2) - * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2) - * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) - * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) - * - * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition - * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time: - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped - * - * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32 - * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr - * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr - * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0), - IMAGE_DECLARATION(src1), -#if defined(BETA) - IMAGE_DECLARATION(src2), -#endif // defined(BETA) - 
IMAGE_DECLARATION(dst), - uint src0_stride_z, - uint src1_stride_z, -#if defined(BETA) - uint src2_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; - int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; - int z = get_global_id(2); - - // Offset - const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; - const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4; - - // src_addr_a = address of matrix A - // src_addr_b = address of matrix B - int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; - int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; -#else // defined(MATRIX_B_DEPTH) - src1_addr_in_bytes += z * src1_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes); - __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes); - - src_addr_a += offset_row_a; - src_addr_b += offset_row_b; - - // Reset accumulators - float4 c0 = 0.0f; - float4 c1 = 0.0f; - float4 c2 = 0.0f; - float4 c3 = 0.0f; - -#define COLS_MTX_B (COLS_B / (4 * MULT_TRANSPOSE1XW_WIDTH)) - - int i = 0; - for(; i <= (int)(COLS_MTX_B - 4); i += 4) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - float4 a0 = vload4(0, src_addr_a); - float4 b0 = vload4(0, src_addr_b); - - src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; - src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH; - - c0.s0 = fma(a0.s0, b0.s0, c0.s0); - c0.s1 = fma(a0.s0, b0.s1, c0.s1); - c0.s2 = fma(a0.s0, b0.s2, c0.s2); - c0.s3 = fma(a0.s0, b0.s3, c0.s3); - - c1.s0 = fma(a0.s1, b0.s0, c1.s0); - c1.s1 = fma(a0.s1, b0.s1, c1.s1); - c1.s2 = fma(a0.s1, b0.s2, c1.s2); - c1.s3 = fma(a0.s1, b0.s3, c1.s3); - - c2.s0 = fma(a0.s2, b0.s0, c2.s0); - c2.s1 = fma(a0.s2, b0.s1, c2.s1); - c2.s2 = fma(a0.s2, b0.s2, c2.s2); - c2.s3 = fma(a0.s2, b0.s3, c2.s3); - - c3.s0 = fma(a0.s3, b0.s0, c3.s0); - c3.s1 = fma(a0.s3, b0.s1, c3.s1); - c3.s2 = fma(a0.s3, b0.s2, c3.s2); - c3.s3 = fma(a0.s3, b0.s3, c3.s3); - - // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, src_addr_a); - b0 = vload4(0, src_addr_b); - - src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; - src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH; - - c0.s0 = fma(a0.s0, b0.s0, c0.s0); - c0.s1 = fma(a0.s0, b0.s1, c0.s1); - c0.s2 = fma(a0.s0, b0.s2, c0.s2); - c0.s3 = fma(a0.s0, b0.s3, c0.s3); - - c1.s0 = fma(a0.s1, b0.s0, c1.s0); - c1.s1 = fma(a0.s1, b0.s1, c1.s1); - c1.s2 = fma(a0.s1, b0.s2, c1.s2); - c1.s3 = fma(a0.s1, b0.s3, c1.s3); - - c2.s0 = fma(a0.s2, b0.s0, c2.s0); - c2.s1 = fma(a0.s2, b0.s1, c2.s1); - c2.s2 = fma(a0.s2, b0.s2, c2.s2); - c2.s3 = fma(a0.s2, b0.s3, c2.s3); - - c3.s0 = fma(a0.s3, b0.s0, c3.s0); - c3.s1 = fma(a0.s3, b0.s1, c3.s1); - c3.s2 = fma(a0.s3, b0.s2, c3.s2); - c3.s3 = fma(a0.s3, b0.s3, c3.s3); - - // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, src_addr_a); - b0 = vload4(0, src_addr_b); - - src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; - src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH; - - c0.s0 = fma(a0.s0, b0.s0, c0.s0); - c0.s1 = fma(a0.s0, b0.s1, c0.s1); - c0.s2 = fma(a0.s0, b0.s2, c0.s2); 
- c0.s3 = fma(a0.s0, b0.s3, c0.s3); - - c1.s0 = fma(a0.s1, b0.s0, c1.s0); - c1.s1 = fma(a0.s1, b0.s1, c1.s1); - c1.s2 = fma(a0.s1, b0.s2, c1.s2); - c1.s3 = fma(a0.s1, b0.s3, c1.s3); - - c2.s0 = fma(a0.s2, b0.s0, c2.s0); - c2.s1 = fma(a0.s2, b0.s1, c2.s1); - c2.s2 = fma(a0.s2, b0.s2, c2.s2); - c2.s3 = fma(a0.s2, b0.s3, c2.s3); - - c3.s0 = fma(a0.s3, b0.s0, c3.s0); - c3.s1 = fma(a0.s3, b0.s1, c3.s1); - c3.s2 = fma(a0.s3, b0.s2, c3.s2); - c3.s3 = fma(a0.s3, b0.s3, c3.s3); - - // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, src_addr_a); - b0 = vload4(0, src_addr_b); - - src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; - src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH; - - c0.s0 = fma(a0.s0, b0.s0, c0.s0); - c0.s1 = fma(a0.s0, b0.s1, c0.s1); - c0.s2 = fma(a0.s0, b0.s2, c0.s2); - c0.s3 = fma(a0.s0, b0.s3, c0.s3); - - c1.s0 = fma(a0.s1, b0.s0, c1.s0); - c1.s1 = fma(a0.s1, b0.s1, c1.s1); - c1.s2 = fma(a0.s1, b0.s2, c1.s2); - c1.s3 = fma(a0.s1, b0.s3, c1.s3); - - c2.s0 = fma(a0.s2, b0.s0, c2.s0); - c2.s1 = fma(a0.s2, b0.s1, c2.s1); - c2.s2 = fma(a0.s2, b0.s2, c2.s2); - c2.s3 = fma(a0.s2, b0.s3, c2.s3); - - c3.s0 = fma(a0.s3, b0.s0, c3.s0); - c3.s1 = fma(a0.s3, b0.s1, c3.s1); - c3.s2 = fma(a0.s3, b0.s2, c3.s2); - c3.s3 = fma(a0.s3, b0.s3, c3.s3); - } - - for(; i < (int)(COLS_MTX_B); ++i) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - float4 a0 = vload4(0, src_addr_a); - float4 b0 = vload4(0, src_addr_b); - - src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; - src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH; - - c0.s0 = fma(a0.s0, b0.s0, c0.s0); - c0.s1 = fma(a0.s0, b0.s1, c0.s1); - c0.s2 = fma(a0.s0, b0.s2, c0.s2); - c0.s3 = fma(a0.s0, b0.s3, c0.s3); - - c1.s0 = fma(a0.s1, b0.s0, c1.s0); - c1.s1 = fma(a0.s1, b0.s1, c1.s1); - c1.s2 = fma(a0.s1, b0.s2, c1.s2); - c1.s3 = fma(a0.s1, b0.s3, c1.s3); - - c2.s0 = fma(a0.s2, b0.s0, c2.s0); - c2.s1 = fma(a0.s2, b0.s1, c2.s1); - c2.s2 = fma(a0.s2, b0.s2, c2.s2); - c2.s3 = fma(a0.s2, b0.s3, c2.s3); - - c3.s0 = fma(a0.s3, b0.s0, c3.s0); - c3.s1 = fma(a0.s3, b0.s1, c3.s1); - c3.s2 = fma(a0.s3, b0.s2, c3.s2); - c3.s3 = fma(a0.s3, b0.s3, c3.s3); - } - - // Compute destination address - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Compute dst address - __global uchar *dst_addr = offset(&dst, 0, 0); - - uint4 zout = 0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D - zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; - zout = min(DEPTH_GEMM3D - 1, zout); - - // Add offset due to the cross plane paddings - zout *= (cross_plane_pad * dst_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; -#else // defined(REINTERPRET_OUTPUT_AS_3D) - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(4, float, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) - REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); - -#if defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)); - - LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, float, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(4, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id( - 2) * src2_stride_z; - - LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(4, float, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(4, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - // Store 4x4 block - vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0)); - vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1)); - vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2)); - vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3)); -} - -// Undefine local defines -#undef COLS_MTX_B - -#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) -/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1) - * - * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA - * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2) - * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2) - * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) - * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) - * - * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. - * The activation function is performed after the bias addition - * @note In case the output has to be reinterpreted as a 3D tensor (e.g. 
output of convolution layer), the following information must be passed at compile time: - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped - * - * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16 - * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr - * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr - * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0), - IMAGE_DECLARATION(src1), -#if defined(BETA) - IMAGE_DECLARATION(src2), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint src0_stride_z, - uint src1_stride_z, -#if defined(BETA) - uint src2_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if 
defined(REINTERPRET_OUTPUT_AS_3D) - , - uint cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; - int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; - int z = get_global_id(2); - - // Offset - const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; - const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8; - - // src_addr_a = address of matrix A - // src_addr_b = address of matrix B - int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; - int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; -#else // defined(MATRIX_B_DEPTH) - src1_addr_in_bytes += z * src1_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes); - __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes); - - // Compute end row address for matrix B - __global half *src_end_addr_b = src_addr_b + COLS_B; - - src_addr_a += offset_row_a; - src_addr_b += offset_row_b; - - // Reset accumulators - half8 c0 = 0.0f; - half8 c1 = 0.0f; - half8 c2 = 0.0f; - half8 c3 = 0.0f; - - for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - half4 a0 = vload4(0, src_addr_a); - half8 b0 = vload8(0, src_addr_b); - - c0 += (half8)a0.s0 * b0; - c1 += (half8)a0.s1 * b0; - c2 += (half8)a0.s2 * b0; - c3 += (half8)a0.s3 * b0; - - // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT); - b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH); - - c0 += (half8)a0.s0 * b0; - c1 += (half8)a0.s1 * b0; - c2 += (half8)a0.s2 * b0; - c3 += (half8)a0.s3 * b0; - } - - for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - half4 a0 = vload4(0, src_addr_a); - half8 b0 = vload8(0, src_addr_b); - - c0 += (half8)a0.s0 * b0; - c1 += (half8)a0.s1 * b0; - c2 += (half8)a0.s2 * b0; - c3 += (half8)a0.s3 * b0; - } - - // Compute destination address - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Compute dst address - __global uchar *dst_addr = offset(&dst, 0, 0); - - uint4 zout = 0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D - zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; - zout = min(DEPTH_GEMM3D - 1, zout); - - // Add offset due to the cross plane paddings - zout *= (cross_plane_pad * dst_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; -#else // defined(REINTERPRET_OUTPUT_AS_3D) - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(4, half, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) - REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); - -#if defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); - - LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, half, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(4, c, bias0); - -#else // defined(BROADCAST_BIAS) - - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id( - 2) * src2_stride_z; - - LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(4, half, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(4, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - // Store 4x8 block - vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0)); - vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1)); - vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2)); - vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3)); -} - -/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1) while accumulating the result in a 32-bit floating point variable. - * - * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA - * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2) - * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2) - * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) - * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) - * - * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. - * The activation function is performed after the bias addition - * @note In case the output has to be reinterpreted as a 3D tensor (e.g.
output of convolution layer), the following information must be passed at compile time: - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped - * - * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16 - * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr - * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr - * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0), - IMAGE_DECLARATION(src1), -#if defined(BETA) - IMAGE_DECLARATION(src2), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint src0_stride_z, - uint src1_stride_z, -#if defined(BETA) - uint src2_stride_z, -#endif //defined(BETA) - uint dst_stride_z 
-#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; - int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; - int z = get_global_id(2); - - // Offset - const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; - const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8; - - // src_addr_a = address of matrix A - // src_addr_b = address of matrix B - int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; - int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; -#else // defined(MATRIX_B_DEPTH) - src1_addr_in_bytes += z * src1_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes); - __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes); - - // Compute end row address for matrix B - __global half *src_end_addr_b = src_addr_b + COLS_B; - - src_addr_a += offset_row_a; - src_addr_b += offset_row_b; - - // Reset accumulators - float8 c0 = 0.0f; - float8 c1 = 0.0f; - float8 c2 = 0.0f; - float8 c3 = 0.0f; - - for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - float4 a0 = convert_float4(vload4(0, src_addr_a)); - float8 b0 = convert_float8(vload8(0, src_addr_b)); - - c0 += (float8)a0.s0 * b0; - c1 += (float8)a0.s1 * b0; - c2 += (float8)a0.s2 * b0; - c3 += (float8)a0.s3 * b0; - - // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = convert_float4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT)); - b0 = convert_float8(vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH)); - - c0 += (float8)a0.s0 * b0; - c1 += (float8)a0.s1 * b0; - c2 += (float8)a0.s2 * b0; - c3 += (float8)a0.s3 * b0; - } - - for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - float4 a0 = convert_float4(vload4(0, src_addr_a)); - float8 b0 = convert_float8(vload8(0, src_addr_b)); - - c0 += (float8)a0.s0 * b0; - c1 += (float8)a0.s1 * b0; - c2 += (float8)a0.s2 * b0; - c3 += (float8)a0.s3 * b0; - } - - // Compute destination address - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Compute dst address - __global uchar *dst_addr = offset(&dst, 0, 0); - - uint4 zout = 0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D - zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; - zout = min(DEPTH_GEMM3D - 1, zout); - - // Add offset due to the cross plane paddings - zout *= (cross_plane_pad * 
dst_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; -#else // defined(REINTERPRET_OUTPUT_AS_3D) - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(4, float, c, ALPHA); -#endif // defined(ALPHA) - -#if defined(BETA) - REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); - -#if defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); - - LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); - - float8 bias_f0 = convert_float8(bias0); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, float, bias_f, BETA); -#endif // UNIT_BETA - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(4, c, bias_f0); - -#else // defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id( - 2) * src2_stride_z; - - LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero); - - float8 bias_f0 = convert_float8(bias0); - float8 bias_f1 = convert_float8(bias1); - float8 bias_f2 = convert_float8(bias2); - float8 bias_f3 = convert_float8(bias3); - -#ifndef UNIT_BETA - SCALE_BLOCK(4, float, bias_f, BETA); -#endif // UNIT_BETA - - // c = c + bias - ADD_BLOCK(4, c, bias_f); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - - half8 c_h0 = convert_half8(c0); - half8 c_h1 = convert_half8(c1); - half8 c_h2 = convert_half8(c2); - half8 c_h3 = convert_half8(c3); - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c_h, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - // Store 4x8 block - vstore8(c_h0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0)); - vstore8(c_h1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1)); - vstore8(c_h2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2)); - vstore8(c_h3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3)); -} - -/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1) - * - * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA - * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2) - * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2) - * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) - * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) - * - * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. 
-DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. - * The activation function is performed after the bias addition - * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time: - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped - * - * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16 - * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr - * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported data type: same as @p lhs_ptr - * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0), - IMAGE_DECLARATION(src1), -#if defined(BETA) - IMAGE_DECLARATION(src2), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint src0_stride_z, - uint src1_stride_z, -#if defined(BETA) - uint src2_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; - int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; - int z = get_global_id(2); - - // Offset - const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; - const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8; - - // src_addr_a = address of matrix A - // src_addr_b = address of matrix B - int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; - int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; -#else // defined(MATRIX_B_DEPTH) - src1_addr_in_bytes += z * src1_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes); - __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes); - - // Compute end row address for matrix B - __global half *src_end_addr_b = src_addr_b + COLS_B; - - src_addr_a += offset_row_a; - src_addr_b += offset_row_b; - - // Reset accumulators - half8 c0 = 0.0f; - half8 c1 = 0.0f; - half8 c2 = 0.0f; - half8 c3 = 0.0f; - -#define COLS_MTX_B (COLS_B / (8 * MULT_TRANSPOSE1XW_WIDTH)) - - int i = 0; - for(; i <= (int)(COLS_MTX_B - 4); i += 4) - { -#if MULT_INTERLEAVE4X4_HEIGHT == 1 - // Load values from matrix A (interleaved) and matrix B (transposed) - half8 a0 = vload8(0, src_addr_a); - half8 b0 = vload8(0, 
src_addr_b); - - src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT; - src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; - - c0 = fma((half8)a0.s0, b0, c0); - c1 = fma((half8)a0.s1, b0, c1); - c2 = fma((half8)a0.s2, b0, c2); - c3 = fma((half8)a0.s3, b0, c3); - - // Load values from matrix B (transposed) - b0 = vload8(0, src_addr_b); - - src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; - - c0 = fma((half8)a0.s4, b0, c0); - c1 = fma((half8)a0.s5, b0, c1); - c2 = fma((half8)a0.s6, b0, c2); - c3 = fma((half8)a0.s7, b0, c3); - - // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload8(0, src_addr_a); - b0 = vload8(0, src_addr_b); - - src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT; - src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; - - c0 = fma((half8)a0.s0, b0, c0); - c1 = fma((half8)a0.s1, b0, c1); - c2 = fma((half8)a0.s2, b0, c2); - c3 = fma((half8)a0.s3, b0, c3); - - // Load values from matrix B (transposed) - b0 = vload8(0, src_addr_b); - - src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; - - c0 = fma((half8)a0.s4, b0, c0); - c1 = fma((half8)a0.s5, b0, c1); - c2 = fma((half8)a0.s6, b0, c2); - c3 = fma((half8)a0.s7, b0, c3); -#else // MULT_INTERLEAVE4X4_HEIGHT == 1 - // Load values from matrix A (interleaved) and matrix B (transposed) - half4 a0 = vload4(0, src_addr_a); - half8 b0 = vload8(0, src_addr_b); - - src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; - src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; - - c0 = fma((half8)a0.s0, b0, c0); - c1 = fma((half8)a0.s1, b0, c1); - c2 = fma((half8)a0.s2, b0, c2); - c3 = fma((half8)a0.s3, b0, c3); - - // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, src_addr_a); - b0 = vload8(0, src_addr_b); - - src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; - src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; - - c0 = fma((half8)a0.s0, b0, c0); - c1 = fma((half8)a0.s1, b0, c1); - c2 = fma((half8)a0.s2, b0, c2); - c3 = fma((half8)a0.s3, b0, c3); - - // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, src_addr_a); - b0 = vload8(0, src_addr_b); - - src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; - src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; - - c0 = fma((half8)a0.s0, b0, c0); - c1 = fma((half8)a0.s1, b0, c1); - c2 = fma((half8)a0.s2, b0, c2); - c3 = fma((half8)a0.s3, b0, c3); - - // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, src_addr_a); - b0 = vload8(0, src_addr_b); - - src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; - src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; - - c0 = fma((half8)a0.s0, b0, c0); - c1 = fma((half8)a0.s1, b0, c1); - c2 = fma((half8)a0.s2, b0, c2); - c3 = fma((half8)a0.s3, b0, c3); -#endif // MULT_INTERLEAVE4X4_HEIGHT == 1 - } - - for(; i < (int)(COLS_MTX_B); ++i) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - half4 a0 = vload4(0, src_addr_a); - half8 b0 = vload8(0, src_addr_b); - - src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; - src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; - - c0 = fma((half8)a0.s0, b0, c0); - c1 = fma((half8)a0.s1, b0, c1); - c2 = fma((half8)a0.s2, b0, c2); - c3 = fma((half8)a0.s3, b0, c3); - } - - // Compute destination address - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Compute dst address - __global uchar *dst_addr = offset(&dst, 0, 0); - - uint4 zout = 0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - 
// | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D - zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; - zout = min(DEPTH_GEMM3D - 1, zout); - - // Add offset due to the cross plane paddings - zout *= (cross_plane_pad * dst_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; -#else // defined(REINTERPRET_OUTPUT_AS_3D) - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(4, half, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) - REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); - -#if defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); - - LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, half, bias, BETA); -#endif // UNIT_BETA - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(4, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id( - 2) * src2_stride_z; - - LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(4, half, bias, BETA); -#endif // UNIT_BETA - - // c = c + bias - ADD_BLOCK(4, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - // Store 4x8 block - vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0)); - vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1)); - vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2)); - vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3)); -} - -// Undefine local defines -#undef COLS_MTX_B - -#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) - -#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT) - -#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) -#if defined(DATA_TYPE) -#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X) -/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped. - * - * @note This OpenCL kernel works with floating point data types (F16/F32) - * @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. 
-DDATA_TYPE=float) - * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y - * @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA - * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) - * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) - * - * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. - * The activation function is performed after the bias addition - * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time: - * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped - * - * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32 - * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr - * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported data type: same as @p lhs_ptr - * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0), - IMAGE_DECLARATION(src1), -#if defined(BETA) - IMAGE_DECLARATION(src2), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint src0_stride_z, - uint src1_stride_z, -#if defined(BETA) - uint src2_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint src_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; - - // Compute starting address for matrix A and Matrix B - int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); - - // Update address for the matrix A - src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y; - - // Update address for the matrix B - src_addr.s1 += idx * sizeof(DATA_TYPE); - -#if defined(REINTERPRET_INPUT_AS_3D) - // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D - uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D; - zin = min(DEPTH_GEMM3D - 1, zin); - - // Add offset due to the cross plane paddings - zin *= (src_cross_plane_pad 
* src0_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply src0_stride_z by DEPTH_GEMM3D - src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - src_addr.s0 += get_global_id(2) * src0_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; -#else // defined(MATRIX_B_DEPTH) - src_addr.s1 += get_global_id(2) * src1_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE)); - - VECTOR_TYPE acc0 = 0.0f; -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - VECTOR_TYPE acc1 = 0.0f; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - VECTOR_TYPE acc2 = 0.0f; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - VECTOR_TYPE acc3 = 0.0f; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - - for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y)) - { -#if defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, DATA_TYPE, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s); -#else // defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - VEC_DATA_TYPE(DATA_TYPE, 2) - a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - VEC_DATA_TYPE(DATA_TYPE, 2) - a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - VEC_DATA_TYPE(DATA_TYPE, 2) - a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - VEC_DATA_TYPE(DATA_TYPE, 2) - a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Load values from matrix B - VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1)); - VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y)); - - // Accumulate - acc0 += b0 * (VECTOR_TYPE)a0.s0; - acc0 += b1 * (VECTOR_TYPE)a0.s1; -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - acc1 += b0 * (VECTOR_TYPE)a1.s0; - acc1 += b1 * (VECTOR_TYPE)a1.s1; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - acc2 += b0 * (VECTOR_TYPE)a2.s0; - acc2 += b1 * (VECTOR_TYPE)a2.s1; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - acc3 += b0 * (VECTOR_TYPE)a3.s0; - acc3 += b1 * (VECTOR_TYPE)a3.s1; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - } - - for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y)) - { -#if defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * 
src0_stride_y + zin.s1)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 -#else // defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Load values from matrix B - VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1)); - - // Accumulate - acc0 += b0 * (VECTOR_TYPE)a0; -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - acc1 += b0 * (VECTOR_TYPE)a1; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - acc2 += b0 * (VECTOR_TYPE)a2; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - acc3 += b0 * (VECTOR_TYPE)a3; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - } - - int z = get_global_id(2); - - // Compute destination address - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Compute dst address - __global uchar *dst_addr = offset(&dst, 0, 0); - - uint4 zout = 0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D - zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D; - zout = min(DEPTH_GEMM3D - 1, zout); - - // Add offset due to the cross plane paddings - zout *= (dst_cross_plane_pad * dst_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; -#else // defined(REINTERPRET_OUTPUT_AS_3D) - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, acc, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) - REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0); - -#if defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE)); - - LOAD_BLOCK(1, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BETA - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE)) + (get_global_id(1) * - (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z; - - LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, bias, BETA); -#endif // UNIT_BETA - - // c = c + bias - ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, DATA_TYPE, acc, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - // Store output block - STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, acc, dst_addr, dst_stride_y, zout.s); -} -#endif // defined(DATA_TYPE) - -/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped - * - * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units. - * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. - * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4. - * @note The number of matrix A columns must be passed at compile time using -DCOLS_A. - * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha - * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) - * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) - * - * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. 
-DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. - * The activation function is performed after the bias addition - * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time: - * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped - * - * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32 - * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr - * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported data type: same as @p lhs_ptr - * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0), - IMAGE_DECLARATION(src1), -#if defined(BETA) - IMAGE_DECLARATION(src2), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint src0_stride_z, - uint src1_stride_z, -#if defined(BETA) - uint src2_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint src_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; - - // Compute starting address for matrix A and matrix B - int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); - - // Update address for matrix A - src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y; - - // Update address for matrix B - src_addr.s1 += idx * sizeof(float); - -#if defined(REINTERPRET_INPUT_AS_3D) - // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D - uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D; - zin = min(DEPTH_GEMM3D - 1, zin); - - // Add offset due to the cross plane paddings - zin *= (src_cross_plane_pad * src0_stride_y); - - 
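// Worked example with hypothetical values (illustration only, not part of the kernel contract): HEIGHT_GEMM3D=4, DEPTH_GEMM3D=2, NUM_ELEMS_PROCESSED_PER_THREAD_Y=4 and get_global_id(1)=1 give rows (4, 5, 6, 7), - // so zin = (4, 5, 6, 7) / 4 = (1, 1, 1, 1): after the min() clamp, each of the four rows is shifted by one cross-plane pad region, i.e. src_cross_plane_pad * src0_stride_y bytes. - 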
// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply src0_stride_z by DEPTH_GEMM3D - src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - src_addr.s0 += get_global_id(2) * src0_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; -#else // defined(MATRIX_B_DEPTH) - src_addr.s1 += get_global_id(2) * src1_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - // Initialize accumulators - float4 acc0 = 0.0f; - -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - float4 acc1 = 0.0f; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - float4 acc2 = 0.0f; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - float4 acc3 = 0.0f; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - - // A and B src indices get incremented at the same time. - int i = 0; - for(; i <= ((int)COLS_A - 4); i += 4) - { -#if defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A and matrix B - LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s); -#else // defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A and matrix B - float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 -#endif // defined(REINTERPRET_INPUT_AS_3D) - - float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - - // Multiply and accumulate - acc0.s0 = fma(a0.s0, b0.s0, acc0.s0); - acc0.s1 = fma(a0.s0, b0.s1, acc0.s1); - acc0.s2 = fma(a0.s0, b0.s2, acc0.s2); - acc0.s3 = fma(a0.s0, b0.s3, acc0.s3); - -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - - acc1.s0 = fma(a1.s0, b0.s0, acc1.s0); - acc1.s1 = fma(a1.s0, b0.s1, acc1.s1); - acc1.s2 = fma(a1.s0, b0.s2, acc1.s2); - acc1.s3 = fma(a1.s0, b0.s3, acc1.s3); - -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - - acc2.s0 = fma(a2.s0, b0.s0, acc2.s0); - acc2.s1 = fma(a2.s0, b0.s1, acc2.s1); - acc2.s2 = fma(a2.s0, b0.s2, acc2.s2); - acc2.s3 = fma(a2.s0, b0.s3, acc2.s3); - -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - - acc3.s0 = fma(a3.s0, b0.s0, acc3.s0); - acc3.s1 = fma(a3.s0, b0.s1, acc3.s1); - acc3.s2 = fma(a3.s0, b0.s2, acc3.s2); - acc3.s3 = fma(a3.s0, b0.s3, acc3.s3); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - - // Load values from matrix A and matrix B - b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - - // Multiply and accumulate - acc0.s0 = fma(a0.s1, b0.s0, acc0.s0); - acc0.s1 = fma(a0.s1, b0.s1, acc0.s1); - acc0.s2 = fma(a0.s1, b0.s2, acc0.s2); - acc0.s3 = fma(a0.s1, b0.s3, acc0.s3); - -#if 
NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - - acc1.s0 = fma(a1.s1, b0.s0, acc1.s0); - acc1.s1 = fma(a1.s1, b0.s1, acc1.s1); - acc1.s2 = fma(a1.s1, b0.s2, acc1.s2); - acc1.s3 = fma(a1.s1, b0.s3, acc1.s3); - -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - - acc2.s0 = fma(a2.s1, b0.s0, acc2.s0); - acc2.s1 = fma(a2.s1, b0.s1, acc2.s1); - acc2.s2 = fma(a2.s1, b0.s2, acc2.s2); - acc2.s3 = fma(a2.s1, b0.s3, acc2.s3); - -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - - acc3.s0 = fma(a3.s1, b0.s0, acc3.s0); - acc3.s1 = fma(a3.s1, b0.s1, acc3.s1); - acc3.s2 = fma(a3.s1, b0.s2, acc3.s2); - acc3.s3 = fma(a3.s1, b0.s3, acc3.s3); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - - // Load values from matrix A and matrix B - b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - - // Multiply and accumulate - acc0.s0 = fma(a0.s2, b0.s0, acc0.s0); - acc0.s1 = fma(a0.s2, b0.s1, acc0.s1); - acc0.s2 = fma(a0.s2, b0.s2, acc0.s2); - acc0.s3 = fma(a0.s2, b0.s3, acc0.s3); - -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - - acc1.s0 = fma(a1.s2, b0.s0, acc1.s0); - acc1.s1 = fma(a1.s2, b0.s1, acc1.s1); - acc1.s2 = fma(a1.s2, b0.s2, acc1.s2); - acc1.s3 = fma(a1.s2, b0.s3, acc1.s3); - -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - - acc2.s0 = fma(a2.s2, b0.s0, acc2.s0); - acc2.s1 = fma(a2.s2, b0.s1, acc2.s1); - acc2.s2 = fma(a2.s2, b0.s2, acc2.s2); - acc2.s3 = fma(a2.s2, b0.s3, acc2.s3); - -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - - acc3.s0 = fma(a3.s2, b0.s0, acc3.s0); - acc3.s1 = fma(a3.s2, b0.s1, acc3.s1); - acc3.s2 = fma(a3.s2, b0.s2, acc3.s2); - acc3.s3 = fma(a3.s2, b0.s3, acc3.s3); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - - // Load values from matrix A and matrix B - b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - - // Multiply and accumulate - acc0.s0 = fma(a0.s3, b0.s0, acc0.s0); - acc0.s1 = fma(a0.s3, b0.s1, acc0.s1); - acc0.s2 = fma(a0.s3, b0.s2, acc0.s2); - acc0.s3 = fma(a0.s3, b0.s3, acc0.s3); - -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - - acc1.s0 = fma(a1.s3, b0.s0, acc1.s0); - acc1.s1 = fma(a1.s3, b0.s1, acc1.s1); - acc1.s2 = fma(a1.s3, b0.s2, acc1.s2); - acc1.s3 = fma(a1.s3, b0.s3, acc1.s3); - -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - - acc2.s0 = fma(a2.s3, b0.s0, acc2.s0); - acc2.s1 = fma(a2.s3, b0.s1, acc2.s1); - acc2.s2 = fma(a2.s3, b0.s2, acc2.s2); - acc2.s3 = fma(a2.s3, b0.s3, acc2.s3); - -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - - acc3.s0 = fma(a3.s3, b0.s0, acc3.s0); - acc3.s1 = fma(a3.s3, b0.s1, acc3.s1); - acc3.s2 = fma(a3.s3, b0.s2, acc3.s2); - acc3.s3 = fma(a3.s3, b0.s3, acc3.s3); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - - src_addr.s0 += 4 * sizeof(float); - } - - for(; i < (int)COLS_A; ++i) - { -#if defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if 
NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 -#else // defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Load values from matrix B - float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - - // Multiply and accumulate - acc0.s0 = fma(a0, b0.s0, acc0.s0); - acc0.s1 = fma(a0, b0.s1, acc0.s1); - acc0.s2 = fma(a0, b0.s2, acc0.s2); - acc0.s3 = fma(a0, b0.s3, acc0.s3); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - acc1.s0 = fma(a1, b0.s0, acc1.s0); - acc1.s1 = fma(a1, b0.s1, acc1.s1); - acc1.s2 = fma(a1, b0.s2, acc1.s2); - acc1.s3 = fma(a1, b0.s3, acc1.s3); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - acc2.s0 = fma(a2, b0.s0, acc2.s0); - acc2.s1 = fma(a2, b0.s1, acc2.s1); - acc2.s2 = fma(a2, b0.s2, acc2.s2); - acc2.s3 = fma(a2, b0.s3, acc2.s3); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - acc3.s0 = fma(a3, b0.s0, acc3.s0); - acc3.s1 = fma(a3, b0.s1, acc3.s1); - acc3.s2 = fma(a3, b0.s2, acc3.s2); - acc3.s3 = fma(a3, b0.s3, acc3.s3); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - - src_addr.s0 += sizeof(float); - } - - int z = get_global_id(2); - - // Compute destination address - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Compute dst address - __global uchar *dst_addr = offset(&dst, 0, 0); - - uint4 zout = 0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D - zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D; - zout = min(DEPTH_GEMM3D - 1, zout); - - // Add offset due to the cross plane paddings - zout *= (dst_cross_plane_pad * dst_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; -#else // defined(REINTERPRET_OUTPUT_AS_3D) - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) - REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0); - -#if defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)); - - LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, float, bias, BETA); -#endif // UNIT_BETA - - // acc = acc + bias[broadcasted] - ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * - (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z; - - LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA); -#endif // UNIT_BETA - - // acc = acc + bias - ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - // Store the output block - vstore4(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0)); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - vstore4(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - vstore4(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - vstore4(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 -} - -/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped - * - * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units. - * This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less than or equal to 1000. - * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. - * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2. - * @note The number of matrix A columns must be passed at compile time using -DCOLS_A. - * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f. - * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. 
-DMATRIX_B_DEPTH=16) - * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) - * - * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. - * The activation function is performed after the bias addition - * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time: - * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped - * - * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32 - * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr - * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported data type: same as @p lhs_ptr - * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0), - IMAGE_DECLARATION(src1), -#if defined(BETA) - IMAGE_DECLARATION(src2), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint src0_stride_z, - uint src1_stride_z, -#if defined(BETA) - uint src2_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint src_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - // Requires 2 NUM_ELEMS_PROCESSED_PER_THREAD_X, C vect2, A vect4, B (2 vload2) // to fix for NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; - - // Compute starting address for matrix A and Matrix B - int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); - - // Update address for the matrix A - src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y; - - // Update address for the matrix B - src_addr.s1 += idx * sizeof(float); - -#if defined(REINTERPRET_INPUT_AS_3D) - // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D - uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / 
(uint4)HEIGHT_GEMM3D; - zin = min(DEPTH_GEMM3D - 1, zin); - - // Add offset due to the cross plane paddings - zin *= (src_cross_plane_pad * src0_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply src0_stride_z by DEPTH_GEMM3D - src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - src_addr.s0 += get_global_id(2) * src0_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; -#else // defined(MATRIX_B_DEPTH) - src_addr.s1 += get_global_id(2) * src1_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - // Initialize accumulators - float2 acc0 = 0.0f; -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - float2 acc1 = 0.0f; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - float2 acc2 = 0.0f; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - float2 acc3 = 0.0f; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - - // A and B src indices get incremented at the same time. - int i = 0; - for(; i <= ((int)COLS_A - 8); i += 8) - { -#if defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0)); -#else // defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0)); -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Load values from matrix B - float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - - // Multiply and accumulate - acc0.s0 = fma(a0.s0, b0.s0, acc0.s0); - acc0.s0 = fma(a0.s1, b1.s0, acc0.s0); - acc0.s0 = fma(a0.s2, b2.s0, acc0.s0); - acc0.s0 = fma(a0.s3, b3.s0, acc0.s0); - acc0.s0 = fma(a0.s4, b4.s0, acc0.s0); - acc0.s0 = fma(a0.s5, b5.s0, acc0.s0); - acc0.s0 = fma(a0.s6, b6.s0, acc0.s0); - acc0.s0 = fma(a0.s7, b7.s0, acc0.s0); - - acc0.s1 = fma(a0.s0, b0.s1, acc0.s1); - acc0.s1 = fma(a0.s1, b1.s1, acc0.s1); - acc0.s1 = fma(a0.s2, b2.s1, acc0.s1); - acc0.s1 = fma(a0.s3, b3.s1, acc0.s1); - acc0.s1 = fma(a0.s4, b4.s1, acc0.s1); - acc0.s1 = fma(a0.s5, b5.s1, acc0.s1); - acc0.s1 = fma(a0.s6, b6.s1, acc0.s1); - acc0.s1 = fma(a0.s7, b7.s1, acc0.s1); - -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if defined(REINTERPRET_INPUT_AS_3D) - a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); -#else // defined(REINTERPRET_INPUT_AS_3D) - a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); -#endif // defined(REINTERPRET_INPUT_AS_3D) - 
acc1.s0 = fma(a0.s0, b0.s0, acc1.s0); - acc1.s0 = fma(a0.s1, b1.s0, acc1.s0); - acc1.s0 = fma(a0.s2, b2.s0, acc1.s0); - acc1.s0 = fma(a0.s3, b3.s0, acc1.s0); - acc1.s0 = fma(a0.s4, b4.s0, acc1.s0); - acc1.s0 = fma(a0.s5, b5.s0, acc1.s0); - acc1.s0 = fma(a0.s6, b6.s0, acc1.s0); - acc1.s0 = fma(a0.s7, b7.s0, acc1.s0); - - acc1.s1 = fma(a0.s0, b0.s1, acc1.s1); - acc1.s1 = fma(a0.s1, b1.s1, acc1.s1); - acc1.s1 = fma(a0.s2, b2.s1, acc1.s1); - acc1.s1 = fma(a0.s3, b3.s1, acc1.s1); - acc1.s1 = fma(a0.s4, b4.s1, acc1.s1); - acc1.s1 = fma(a0.s5, b5.s1, acc1.s1); - acc1.s1 = fma(a0.s6, b6.s1, acc1.s1); - acc1.s1 = fma(a0.s7, b7.s1, acc1.s1); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if defined(REINTERPRET_INPUT_AS_3D) - a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); -#else // defined(REINTERPRET_INPUT_AS_3D) - a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); -#endif // defined(REINTERPRET_INPUT_AS_3D) - acc2.s0 = fma(a0.s0, b0.s0, acc2.s0); - acc2.s0 = fma(a0.s1, b1.s0, acc2.s0); - acc2.s0 = fma(a0.s2, b2.s0, acc2.s0); - acc2.s0 = fma(a0.s3, b3.s0, acc2.s0); - acc2.s0 = fma(a0.s4, b4.s0, acc2.s0); - acc2.s0 = fma(a0.s5, b5.s0, acc2.s0); - acc2.s0 = fma(a0.s6, b6.s0, acc2.s0); - acc2.s0 = fma(a0.s7, b7.s0, acc2.s0); - - acc2.s1 = fma(a0.s0, b0.s1, acc2.s1); - acc2.s1 = fma(a0.s1, b1.s1, acc2.s1); - acc2.s1 = fma(a0.s2, b2.s1, acc2.s1); - acc2.s1 = fma(a0.s3, b3.s1, acc2.s1); - acc2.s1 = fma(a0.s4, b4.s1, acc2.s1); - acc2.s1 = fma(a0.s5, b5.s1, acc2.s1); - acc2.s1 = fma(a0.s6, b6.s1, acc2.s1); - acc2.s1 = fma(a0.s7, b7.s1, acc2.s1); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 -#if defined(REINTERPRET_INPUT_AS_3D) - a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); -#else // defined(REINTERPRET_INPUT_AS_3D) - a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); -#endif // defined(REINTERPRET_INPUT_AS_3D) - acc3.s0 = fma(a0.s0, b0.s0, acc3.s0); - acc3.s0 = fma(a0.s1, b1.s0, acc3.s0); - acc3.s0 = fma(a0.s2, b2.s0, acc3.s0); - acc3.s0 = fma(a0.s3, b3.s0, acc3.s0); - acc3.s0 = fma(a0.s4, b4.s0, acc3.s0); - acc3.s0 = fma(a0.s5, b5.s0, acc3.s0); - acc3.s0 = fma(a0.s6, b6.s0, acc3.s0); - acc3.s0 = fma(a0.s7, b7.s0, acc3.s0); - - acc3.s1 = fma(a0.s0, b0.s1, acc3.s1); - acc3.s1 = fma(a0.s1, b1.s1, acc3.s1); - acc3.s1 = fma(a0.s2, b2.s1, acc3.s1); - acc3.s1 = fma(a0.s3, b3.s1, acc3.s1); - acc3.s1 = fma(a0.s4, b4.s1, acc3.s1); - acc3.s1 = fma(a0.s5, b5.s1, acc3.s1); - acc3.s1 = fma(a0.s6, b6.s1, acc3.s1); - acc3.s1 = fma(a0.s7, b7.s1, acc3.s1); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - - src_addr.s0 += sizeof(float) * 8; - } - // float size increment - for(; i < (int)COLS_A; ++i) - { -#if defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 -#else // 
defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Load values from matrix B - float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - - // Multiply and accumulate - acc0.s0 = fma(a0, b0.s0, acc0.s0); - acc0.s1 = fma(a0, b0.s1, acc0.s1); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - acc1.s0 = fma(a1, b0.s0, acc1.s0); - acc1.s1 = fma(a1, b0.s1, acc1.s1); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - acc2.s0 = fma(a2, b0.s0, acc2.s0); - acc2.s1 = fma(a2, b0.s1, acc2.s1); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - acc3.s0 = fma(a3, b0.s0, acc3.s0); - acc3.s1 = fma(a3, b0.s1, acc3.s1); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - - src_addr.s0 += sizeof(float); - } - - int z = get_global_id(2); - - // Compute destination address - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Compute dst address - __global uchar *dst_addr = offset(&dst, 0, 0); - - uint4 zout = 0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D - zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D; - zout = min(DEPTH_GEMM3D - 1, zout); - - // Add offset due to the cross plane paddings - zout *= (dst_cross_plane_pad * dst_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; -#else // defined(REINTERPRET_OUTPUT_AS_3D) - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) - REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0); - -#if defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)); - - LOAD_BLOCK(1, 2, float, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, float, bias, BETA); -#endif // UNIT_BETA - - // acc = acc + bias[broadcasted] - ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (get_global_id(1) * - (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z; - - LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, float, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA); -#endif // UNIT_BETA - - // acc = acc + bias - ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - // Store the output block - vstore2(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0)); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - vstore2(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - vstore2(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - vstore2(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 -} - -#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) -/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped - * - * @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulates the result in a 32-bit floating point variable. - * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. - * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4. - * @note The number of matrix A columns must be passed at compile time using -DCOLS_A. - * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha - * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g.
-DMATRIX_B_DEPTH=16) - * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) - * - * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. - * The activation function is performed after the bias addition - * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time: - * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped - * - * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16 - * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr - * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported data type: same as @p src0_ptr - * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0), - IMAGE_DECLARATION(src1), -#if defined(BETA) - IMAGE_DECLARATION(src2), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint src0_stride_z, - uint src1_stride_z, -#if defined(BETA) - uint src2_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint src_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; - - // Compute starting address for matrix A and matrix B - int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); - - // Update address for the matrix A - src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y; - - // Update address for the matrix B - src_addr.s1 += idx * sizeof(half); - -#if defined(REINTERPRET_INPUT_AS_3D) - // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D - uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D; - zin = min(DEPTH_GEMM3D - 1, zin); - - // Add offset due to the cross plane paddings - zin *= (src_cross_plane_pad *
src0_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply src0_stride_z by DEPTH_GEMM3D - src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - src_addr.s0 += get_global_id(2) * src0_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; -#else // defined(MATRIX_B_DEPTH) - src_addr.s1 += get_global_id(2) * src1_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - float8 acc0 = 0.0h; -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - float8 acc1 = 0.0h; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - float8 acc2 = 0.0h; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - float8 acc3 = 0.0h; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - - int i = 0; - for(; i <= ((int)COLS_A - 4); i += 4) - { -#if defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s); -#else // defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Load values from matrix B - float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); - src_addr.s1 += src1_stride_y; - - // Accumulate - acc0 = fma(b0, (float8)a0.s0, acc0); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - acc1 = fma(b0, (float8)a1.s0, acc1); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - acc2 = fma(b0, (float8)a2.s0, acc2); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - acc3 = fma(b0, (float8)a3.s0, acc3); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - - b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); - src_addr.s1 += src1_stride_y; - acc0 = fma(b0, (float8)a0.s1, acc0); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - acc1 = fma(b0, (float8)a1.s1, acc1); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - acc2 = fma(b0, (float8)a2.s1, acc2); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - acc3 = fma(b0, (float8)a3.s1, acc3); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - - b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); - src_addr.s1 += src1_stride_y; - acc0 = fma(b0, (float8)a0.s2, acc0); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - acc1 = fma(b0, (float8)a1.s2, acc1); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - acc2 = fma(b0, (float8)a2.s2, acc2); -#endif // 
NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - acc3 = fma(b0, (float8)a3.s2, acc3); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - - b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); - src_addr.s1 += src1_stride_y; - acc0 = fma(b0, (float8)a0.s3, acc0); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - acc1 = fma(b0, (float8)a1.s3, acc1); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - acc2 = fma(b0, (float8)a2.s3, acc2); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - acc3 = fma(b0, (float8)a3.s3, acc3); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - - src_addr.s0 += 4 * sizeof(half); - } - - for(; i < (int)COLS_A; ++i) - { -#if defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 -#else // defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Load values from matrix B - float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); - - src_addr += (int2)(sizeof(half), src1_stride_y); - - // Accumulate - acc0 = fma(b0, (float8)a0, acc0); // b0 * (half8)a0; -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - acc1 = fma(b0, (float8)a1, acc1); // b0 * (half8)a1; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - acc2 = fma(b0, (float8)a2, acc2); // b0 * (half8)a2; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - acc3 = fma(b0, (float8)a3, acc3); // b0 * (half8)a3; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - } - - int z = get_global_id(2); - - // Compute destination address - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Compute dst address - __global uchar *dst_addr = offset(&dst, 0, 0); - - uint4 zout = 0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zout) is calculated dividing M 
(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D - zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D; - zout = min(DEPTH_GEMM3D - 1, zout); - - // Add offset due to the cross plane paddings - zout *= (dst_cross_plane_pad * dst_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; -#else // defined(REINTERPRET_OUTPUT_AS_3D) - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA); -#endif // defined(ALPHA) - -#if defined(BETA) - REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0); - -#if defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); - - LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); - - float8 bias_f0 = convert_float8(bias0); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, float, bias_f, BETA); -#endif // UNIT_BETA - - // acc = acc + bias[broadcasted] - ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f0); - -#else // defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * - (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z; - - LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero); - - float8 bias_f0 = convert_float8(bias0); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - float8 bias_f1 = convert_float8(bias1); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - float8 bias_f2 = convert_float8(bias2); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - float8 bias_f3 = convert_float8(bias3); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - -#ifndef UNIT_BETA - SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias_f, BETA); -#endif // UNIT_BETA - - // acc = acc + bias - ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - - half8 acc_h0 = convert_half8(acc0); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - half8 acc_h1 = convert_half8(acc1); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - half8 acc_h2 = convert_half8(acc2); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - half8 acc_h3 = convert_half8(acc3); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc_h, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - // Store the output block - STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc_h, dst_addr, dst_stride_y, zout.s); -} - -/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped - * - * @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.
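Note (editorial sketch, not from the library sources): the two fp16 kernels in this hunk differ in accumulator width. gemm_mm_floating_point_f16_bifrost_acc32 above widens every half operand to float before each fma and narrows once with convert_half8 at the store, while the gemm_mm_floating_point_f16_bifrost kernel documented from here on accumulates directly in half. A minimal standalone OpenCL C sketch of the acc32 pattern follows; the kernel and variable names (gemv_f16_acc32, a, x, y, cols) are illustrative only, not library names:

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
// One work item computes one element of y = A * x (A row-major, fp16 storage).
// The running sum stays in fp32 and is rounded to fp16 exactly once, at the end.
__kernel void gemv_f16_acc32(__global const half *a, __global const half *x, __global half *y, int cols)
{
    const int row = get_global_id(0);
    float acc = 0.0f; // fp32 accumulator, mirroring the removed kernel's float8 acc0..acc3
    for(int i = 0; i < cols; ++i)
    {
        acc = fma(convert_float(a[row * cols + i]), convert_float(x[i]), acc); // widen, then fma
    }
    y[row] = convert_half(acc); // single narrowing at the store
}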
- * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. - * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4. - * @note The number of matrix A columns must be passed at compile time using -DCOLS_A. - * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha - * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) - * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) - * - * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. - * The activation function is performed after the bias addition - * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time: - * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped - * - * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16 - * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr - * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported data type: same as @p src0_ptr - * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0), - IMAGE_DECLARATION(src1), -#if defined(BETA) - IMAGE_DECLARATION(src2), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint src0_stride_z, - uint src1_stride_z, -#if defined(BETA) - uint src2_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint src_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; - - // Compute starting address for matrix A and matrix B - int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); - - // Update address for the matrix A - src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y; - - // Update address for the matrix B - src_addr.s1 += idx * sizeof(half); - -#if defined(REINTERPRET_INPUT_AS_3D) - // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D - uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D; - zin = min(DEPTH_GEMM3D - 1, zin); - - // Add offset due to the cross plane paddings - zin *= (src_cross_plane_pad *
src0_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply src0_stride_z by DEPTH_GEMM3D - src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - src_addr.s0 += get_global_id(2) * src0_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; -#else // defined(MATRIX_B_DEPTH) - src_addr.s1 += get_global_id(2) * src1_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - half8 acc0 = 0.0h; -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - half8 acc1 = 0.0h; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - half8 acc2 = 0.0h; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - half8 acc3 = 0.0h; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - - int i = 0; - for(; i <= ((int)COLS_A - 4); i += 4) - { -#if defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s); -#else // defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Load values from matrix B - half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - - // Accumulate - acc0 = fma(b0, (half8)a0.s0, acc0); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - acc1 = fma(b0, (half8)a1.s0, acc1); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - acc2 = fma(b0, (half8)a2.s0, acc2); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - acc3 = fma(b0, (half8)a3.s0, acc3); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - - b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - acc0 = fma(b0, (half8)a0.s1, acc0); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - acc1 = fma(b0, (half8)a1.s1, acc1); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - acc2 = fma(b0, (half8)a2.s1, acc2); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - acc3 = fma(b0, (half8)a3.s1, acc3); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - - b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - acc0 = fma(b0, (half8)a0.s2, acc0); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - acc1 = fma(b0, (half8)a1.s2, acc1); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - acc2 = fma(b0, (half8)a2.s2, acc2); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 
3 - acc3 = fma(b0, (half8)a3.s2, acc3); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - - b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - acc0 = fma(b0, (half8)a0.s3, acc0); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - acc1 = fma(b0, (half8)a1.s3, acc1); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - acc2 = fma(b0, (half8)a2.s3, acc2); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - acc3 = fma(b0, (half8)a3.s3, acc3); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - - src_addr.s0 += 4 * sizeof(half); - } - - for(; i < (int)COLS_A; ++i) - { -#if defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 -#else // defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Load values from matrix B - half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); - - src_addr += (int2)(sizeof(half), src1_stride_y); - - // Accumulate - acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0; -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 - acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 - acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 -#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3; -#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 - } - - int z = get_global_id(2); - - // Compute destination address - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Compute dst address - __global uchar *dst_addr = offset(&dst, 0, 0); - - uint4 zout = 0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D - zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) 
* NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D; - zout = min(DEPTH_GEMM3D - 1, zout); - - // Add offset due to the cross plane paddings - zout *= (dst_cross_plane_pad * dst_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; -#else // defined(REINTERPRET_OUTPUT_AS_3D) - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, acc, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) - REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0); - -#if defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); - - LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, half, bias, BETA); -#endif // UNIT_BETA - - // acc = acc + bias[broadcasted] - ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * - (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z; - - LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, bias, BETA); -#endif // UNIT_BETA - - // acc = acc + bias - ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - // Store the output block - STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc, dst_addr, dst_stride_y, zout.s); -} -#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) - -#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) - #if defined(BETA) /** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta: * diff --git a/src/core/CL/cl_kernels/gemm_helpers.h b/src/core/CL/cl_kernels/gemm_helpers.h index 6f6edc1bcf..54d38655a4 100644 --- a/src/core/CL/cl_kernels/gemm_helpers.h +++ b/src/core/CL/cl_kernels/gemm_helpers.h @@ -624,49 +624,49 @@ * @{ */ #define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ - Z##0 = (0 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##0 = (0 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0); \ Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y); #define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ - Z##1 = (1 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##1 = (1 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1); \ Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y); #define
CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ - Z##2 = (2 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##2 = (2 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2); \ Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y); #define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ - Z##3 = (3 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##3 = (3 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3); \ Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y); #define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ - Z##4 = (4 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##4 = (4 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4); \ Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y); #define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ - Z##5 = (5 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##5 = (5 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5); \ Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y); #define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ - Z##6 = (6 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##6 = (6 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6); \ Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y); #define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ - Z##7 = (7 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##7 = (7 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7); \ Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y); @@ -708,449 +708,6 @@ #define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) /** @} */ // end of group CALCULATE_Z_OFFSET -/** Store the 0 to (n-1)th rows of the given variables - * @name STORE_ROW_n - * - * @param[in] N0 The width of the passed in vector. 
Supported: 1, 2, 3, 4, 8, 16 - * @param[in] DATA_TYPE The data type of the vectors - * @param[in] BASENAME The basename of the variables - * @param[in] PTR The base pointer - * @param[in] STRIDE_Y The stride value in y-axis direction - * @param[in] Z The offset in z-axis direction - * @{ - */ -#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); - -#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); - -#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); - -#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); - -#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); - -#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); - -#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); - -#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); - -#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); - -#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); - -#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); - -#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); - -#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); - -#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); - -#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); - -#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_15(N0, DATA_TYPE, BASENAME, 
PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); -/** @} */ // end of group STORE_ROW_n - -/** Partially store the 0 to (n-1)th rows of the given variables - * @name STORE_ROW_PARTIAL_n - * Within each row, store the lower @p STORE_N0 elements of vectors of width @p N0 - * - * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. - * - * @param[in] N0 The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16 - * @param[in] STORE_N0 The **lower** size of the vectors to store. Supported: 1-16 and <= @p N0 - * @param[in] DATA_TYPE The data type of the vectors - * @param[in] BASENAME The basename of the variables - * @param[in] PTR The base pointer - * @param[in] STRIDE_Y The stride value in y-axis direction - * @param[in] Z The offset in z-axis direction - * @{ - */ -#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE_PARTIAL(N0, STORE_N0) \ - (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); - -#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE_PARTIAL(N0, STORE_N0) \ - (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); - -#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE_PARTIAL(N0, STORE_N0) \ - (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); - -#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE_PARTIAL(N0, STORE_N0) \ - (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); - -#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE_PARTIAL(N0, STORE_N0) \ - (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); - -#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE_PARTIAL(N0, STORE_N0) \ - (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); - -#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE_PARTIAL(N0, STORE_N0) \ - (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); - -#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE_PARTIAL(N0, STORE_N0) \ - (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); - -#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE_PARTIAL(N0, STORE_N0) \ - (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); - -#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE_PARTIAL(N0, STORE_N0) \ - (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); - -#define STORE_ROW_PARTIAL_11(N0, STORE_N0,
DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE_PARTIAL(N0, STORE_N0) \ - (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); - -#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE_PARTIAL(N0, STORE_N0) \ - (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); - -#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE_PARTIAL(N0, STORE_N0) \ - (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); - -#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE_PARTIAL(N0, STORE_N0) \ - (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); - -#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE_PARTIAL(N0, STORE_N0) \ - (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); - -#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE_PARTIAL(N0, STORE_N0) \ - (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); -/** @} */ // end of group STORE_ROW_PARTIAL_n - -/** Convert and store the 0th to (n-1)th rows of the given variables - * @name CONVERT_STORE_ROW_n - * - * @param[in] N0 The size of the vectors - * @param[in] DATA_TYPE The data type of the vectors - * @param[in] BASENAME The basename of the variables - * @param[in] PTR The base pointer - * @param[in] STRIDE_Y The stride value in y-axis direction - * @param[in] Z The offset in z-axis direction - * @{ - */ -#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); - -#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); - -#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); - -#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); - -#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); - -#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)),
0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); - -#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); - -#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); - -#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); - -#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \ - CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); - -#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); - -#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); - -#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); - -#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); - -#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); - -#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ - VSTORE(N0) \ - (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); - -/** @} */ // end of groupd CONVERT_STORE_ROW_n - -/** Store a block of the given size M0xN0 - * @name STORE_BLOCK - * - * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16. - * The data to store is expected to have consecutive names for each row. - * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. - * The Z offset is expected to have consecutive names. - * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 
- * - * @param[in] M0 The number of rows to store - * @param[in] N0 The size of each vector - * @param[in] DATA_TYPE The data type of the vectors - * @param[in] BASENAME The basename of the variables - * @param[in] PTR The base pointer - * @param[in] STRIDE_Y The stride value in y-axis direction - * @param[in] Z The offset in z-axis direction - * @{ - */ -#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) -#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) -/** @} */ // end of group STORE_BLOCK - -/** Partially store a block of the given size STORE_M0xSTORE_N0 - * @name STORE_BLOCK_PARTIAL - * - * @note The vector width @p N0 is also required for correct partial storing behaviour. - * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. - * - * The data to store is expected to have consecutive names for each row. - * E.g., for STORE_M0=3 and basename=c, the expected names are c0, c1 and c2. - * The Z offset is expected to have consecutive names. - * E.g., for STORE_M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. - * - * @param[in] STORE_M0 The number of rows to store. Supported: 1-16 - * @param[in] STORE_N0 The lower number of elements of vectors to store. Supported: 1-16 and <= @p N0 - * @param[in] N0 The size of each vector. Supported: 1, 2, 3, 4, 8, 16 - * @param[in] DATA_TYPE The data type of the vectors - * @param[in] BASENAME The basename of the variables - * @param[in] PTR The base pointer - * @param[in] STRIDE_Y The stride value in y-axis direction - * @param[in] Z The offset in z-axis direction - * @{ - */ -#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) -#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) -/** Store a block that can be partial in both x and y dimensions - * - * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. - * - * The data to store is expected to have consecutive names for each row. - * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. - * The Z offset is expected to have consecutive names. - * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. - * - * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16 - * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16 - * @param[in] DATA_TYPE The data type of the vectors - * @param[in] BASENAME The basename of the variables - * @param[in] PTR The base pointer - * @param[in] STRIDE_Y The stride value in y-axis direction - * @param[in] Z The offset in z-axis direction - * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0) - * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0) - * @param[in] N Total number of columns. Used to detect if current block is at the boundary in x. - * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. 
- * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0. - */ -#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) \ - if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ - { \ - STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ - } \ - else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ - { \ - STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ - } \ - else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ - { \ - STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ - } \ - else \ - { \ - STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ - } -/** Store a block that can only be partial in x but not y. - * - * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. - * - * The data to store is expected to have consecutive names for each row. - * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. - * The Z offset is expected to have consecutive names. - * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. - * - * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16 - * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16 - * @param[in] DATA_TYPE The data type of the vectors - * @param[in] BASENAME The basename of the variables - * @param[in] PTR The base pointer - * @param[in] STRIDE_Y The stride value in y-axis direction - * @param[in] Z The offset in z-axis direction - * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0) - * @param[in] N Total number of columns. Used to detect if current block is at the boundary in x. - * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0. - */ -#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, N, PARTIAL_COND_X) \ - if(!(PARTIAL_COND_X)) \ - { \ - STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ - } \ - else \ - { \ - STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ - } -/** Store a block that can only be partial in y but not x. - * - * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. - * - * The data to store is expected to have consecutive names for each row. - * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. - * The Z offset is expected to have consecutive names. - * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. - * - * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16 - * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16 - * @param[in] DATA_TYPE The data type of the vectors - * @param[in] BASENAME The basename of the variables - * @param[in] PTR The base pointer - * @param[in] STRIDE_Y The stride value in y-axis direction - * @param[in] Z The offset in z-axis direction - * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. 
Supported range: [1, @p M0) - * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. - */ -#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ - if(!(PARTIAL_COND_Y)) \ - { \ - STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ - } \ - else \ - { \ - STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ - } -/** @} */ // end of group STORE_BLOCK_PARTIAL - -/** Convert and store a block of the given size M0xN0 - * @name CONVERT_STORE_BLOCK - * - * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16. - * The data to store is expected to have consecutive names for each row. - * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. - * The Z offset is expected to have consecutive names. - * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. - * - * @param[in] M0 The number of rows to store - * @param[in] N0 The size of each vector - * @param[in] DATA_TYPE The data type of the vectors - * @param[in] BASENAME The basename of the variables - * @param[in] PTR The base pointer - * @param[in] STRIDE_Y The stride value in y-axis direction - * @param[in] Z The offset in z-axis direction - * @{ - */ -#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) -#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) -/** @} */ // end of group CONVERT_STORE_BLOCK - /** Scale the rows in the given variables (BASENAME0 to BASENAMEn-1) * @name SCALE_ROW_n * @@ -1550,68 +1107,68 @@ * @param[in] B_VAL Additional value required by the activation * @{ */ -#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ - BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##0, A_VAL, B_VAL); +#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##0, A_VAL, B_VAL); -#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ - ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ - BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##1, A_VAL, B_VAL); +#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##1, A_VAL, B_VAL); -#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ - ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ - BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##2, A_VAL, B_VAL); +#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##2, A_VAL, B_VAL); -#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ - ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ - BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##3, A_VAL, B_VAL); +#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + 
ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##3, A_VAL, B_VAL); -#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ - ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ - BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##4, A_VAL, B_VAL); +#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##4, A_VAL, B_VAL); -#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ - ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ - BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##5, A_VAL, B_VAL); +#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##5, A_VAL, B_VAL); -#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ - ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ - BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##6, A_VAL, B_VAL); +#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##6, A_VAL, B_VAL); -#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ - ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ - BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##7, A_VAL, B_VAL); +#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##7, A_VAL, B_VAL); -#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ - ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ - BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##8, A_VAL, B_VAL); +#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##8, A_VAL, B_VAL); -#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ - ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ - BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##9, A_VAL, B_VAL); +#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##9, A_VAL, B_VAL); -#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ - ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ - BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##A, A_VAL, B_VAL); +#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_10(ACTIVATION_TYPE, 
DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
+    BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##A, A_VAL, B_VAL);
-#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
-    ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
-    BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##B, A_VAL, B_VAL);
+#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
+    BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##B, A_VAL, B_VAL);
-#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
-    ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
-    BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##C, A_VAL, B_VAL);
+#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
+    BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##C, A_VAL, B_VAL);
-#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
-    ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
-    BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##D, A_VAL, B_VAL);
+#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
+    BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##D, A_VAL, B_VAL);
-#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
-    ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
-    BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##E, A_VAL, B_VAL);
+#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
+    BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##E, A_VAL, B_VAL);
-#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
-    ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
-    BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##F, A_VAL, B_VAL);
+#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
+    BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##F, A_VAL, B_VAL);
 /** @} */ // end of group ACTIVATION_ROW_n
 /** Apply activation to a block (BASENAME)
@@ -1627,8 +1184,8 @@
  * @param[in] B_VAL           Additional value required by the activation
  * @{
  */
-#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)
-#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)
+#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
+#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
 /** @} */ // end of group ACTIVATION_BLOCK
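An illustrative sketch of the new signature (editor's note; the RELU/float4 values are assumed for the example, not taken from the patch): the hunk above threads VEC_SIZE through ACTIVATION_ROW_n and ACTIVATION_BLOCK, so callers now state the vector width explicitly instead of relying on a fixed-width ACTIVATION:

    // Apply RELU to a 2-row block of float4 accumulators named c0, c1 (VEC_SIZE = 4)
    ACTIVATION_BLOCK(2, RELU, float, 4, c, A_VAL, B_VAL);
    // expands via ACTIVATION_ROW_2 to:
    //   c0 = ACTIVATION(RELU, float, 4, c0, A_VAL, B_VAL);
    //   c1 = ACTIVATION(RELU, float, 4, c1, A_VAL, B_VAL);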
 /** Apply convert_ to the given variables
@@ -1732,113 +1289,4 @@
  */
 #define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
 #define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
-/** @} */ // end of group CONVERT_BLOCK
-
-#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
-
-/** Boundary-aware GEMM block store
- * @name STORE_BLOCK_BOUNDARY_AWARE
- * This macro assumes the following schemes to achieve boundary-awareness:
- * - Overlapping load in Y axis from lhs tensor. This implies lhs has no padding along y dim.
- * - Non-Overlapping(normal) load from rhs tensor. This imples rhs can have paddings.
- * - Overlapping load in Y axis from bias tensor. This implies rhs has no padding along y dim.
- * The macro then ensures that the dst tensor can be stored without any paddings in both x and y dim.
- *
- * In the y dimension, we place the partial blocks **at the beginning** while in the x dimension, we place the partial
- * blocks **at the end**.
- * Say, the dst tensor is of shape MxN and we have M0 and N0 as the block size, this is how we define "partial blocks"/
- * "boundary block" (we use the 2 terms "partial blocks" and "boundary blocks" interchangeably) and its various parameters:
- *
- *  *--x-->                        x == 0                         x == 1
- *  |      |<------------------------------N-------------------------->|
- *  y      |<--------------N0------------->|<----PARTIAL_STORE_N0----->|
- *  |      -------------#############################################################
- *  *      |          | |...............................|...........................|
- * y == 0  | PAR_..._M0 |......Boundary block in y......|.Boundary block in x and y.|
- *         |          | |...............................|...........................|
- *  M      --#############################################################
- *         |          | |                               |...........................|
- * y == 1  |         M0 |      Non-boundary block       |....Boundary block in x....|
- *         |          | |                               |...........................|
- *         |------------#############################################################
- *
- * Then @p PARTIAL_STORE_M0 = M % M0 and @p PARTIAL_STORE_N0 = N % N0
- *
- * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
- *
- * It automatically detects if a giving M,N,M0,N0 combination can yield partial blocks in either X and Y dimension,
- * and select corresponding store methods such that the boundary detection logic is only added when needed.
- *
- * The data to store is expected to have consecutive names for each row.
- * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
- * The Z offset is expected to have consecutive names.
- * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
- *
- * @param[in] M0                The number of rows to store, for non-partial blocks. Supported: 1-16
- * @param[in] N0                The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
- * @param[in] DATA_TYPE         The data type of the vectors
- * @param[in] BASENAME          The basename of the variables
- * @param[in] PTR               The base pointer
- * @param[in] STRIDE_Y          The stride value in y-axis direction
- * @param[in] Z                 The offset in z-axis direction
- * @param[in] PARTIAL_STORE_M0  The partial size in y, for partial blocks. Supported: [0, @p M0)
- * @param[in] PARTIAL_STORE_N0  The partial size in x, for partial blocks.
Supported: [0, @p N0) - * @param[in] N Total number of columns. Used to detect if current block is at the boundary in x. - * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. - * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0. - * @{ - */ -#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 -// Case1: No partial blocks in either x or y -#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) \ - STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) - -#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 -// Case2: Partial blocks in y -#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) \ - STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) - -#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 -// Case3: Partial blocks in x -#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) \ - STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, N, PARTIAL_COND_X) - -#else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 -// Case4: Partial blocks in both x and y -#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) \ - STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) - -#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 - -#else // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) - -#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) \ - STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) - -#endif // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) -/** @} */ // end of group STORE_BLOCK_BOUNDARY_AWARE - -#if defined(PARTIAL_STORE_M0) -/** Compute the start m0 row (LHS, BIAS and DST) in a boundary-aware way so as to avoid padding - * @name COMPUTE_M0_START_ROW - * If there're any partial blocks in y dimension, they are placed at the beginning of the rows. - * This shift amount is added to all rows such that the partial block (at the beginning) overlaps with the subsequent - * blocks in the y dimension to avoid any padding. - * EG: M0=4, PARTIAL_STORE_M0=1: - * | Non-overlapping | +M0_ROW_SHIFT (Overlapping) - * block 0 (partial)| start row = 0 | start row = 0 - * block 1 (full) | start row = 4 | start row = 1 - * block 2 (full) | start row = 8 | start row = 5 - * - * @param[in] y Global id of current block in y. - * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16 - * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. 
Supported: [0, @p M0)
- * @{
- */
-#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
-    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
-#else // defined(PARTIAL_STORE_M0)
-#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
-    ((uint)(y * M0))
-#endif // defined(PARTIAL_STORE_M0)
-/** @} */ // end of group COMPUTE_M0_START_ROW
+/** @} */ // end of group CONVERT_BLOCK
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/gemm_v1.cl b/src/core/CL/cl_kernels/gemm_v1.cl
new file mode 100644
index 0000000000..5f8b4f694e
--- /dev/null
+++ b/src/core/CL/cl_kernels/gemm_v1.cl
@@ -0,0 +1,3238 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "gemm_helpers.h"
+#include "repeat.h"
+
+#if defined(M) && defined(N) && defined(K) && defined(H0) && defined(V0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
+/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
+ *
+ * @note The number of rows of destination matrix must be passed at compile time using -DM
+ * @note The number of columns of the destination matrix must be passed at compile time using -DN
+ * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note The optional alpha's value needs to be passed at compile time using -DALPHA
+ * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g.
a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. + * The activation function is performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported data type: same as @p lhs_ptr
+ * @param[in] src2_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in] src2_step_x                        (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in] src2_step_y                        (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in] dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in] src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src2_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] cross_plane_pad                    (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
+                                                 IMAGE_DECLARATION(src1),
+#if defined(BETA)
+                                                 IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+                                                 IMAGE_DECLARATION(dst),
+                                                 uint src0_stride_z,
+                                                 uint src1_stride_z,
+#if defined(BETA)
+                                                 uint src2_stride_z,
+#endif //defined(BETA)
+                                                 uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                 ,
+                                                 uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                )
+{
+    int x = get_global_id(0) / H0;
+    int y = get_global_id(1) / V0;
+    int z = get_global_id(2);
+
+    // Offset
+    const int offset_row_a = (get_global_id(1) % V0) * 4;
+    const int offset_row_b = (get_global_id(0) % H0) * 4;
+
+    // src_addr_a = address of matrix A
+    // src_addr_b = address of matrix B
+    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
+    __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
+
+    // Compute end row address for matrix B
+    __global float *src_end_addr_b = src_addr_b + (src1_stride_y / sizeof(float));
+
+    src_addr_a += offset_row_a;
+    src_addr_b += offset_row_b;
+
+    // Reset accumulators
+    float4 c0 = 0.0f;
+    float4 c1 = 0.0f;
+    float4 c2 = 0.0f;
+    float4 c3 = 0.0f;
+
+    for(; src_addr_b <= (src_end_addr_b - (int)(8 * H0)); src_addr_a += 8 * V0, src_addr_b += 8 * H0)
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        float4 a0 = vload4(0, src_addr_a);
+        float4 b0 = vload4(0, src_addr_b);
+
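+        // 4x4 rank-1 update: each scalar of the interleaved A column scales the
+        // whole transposed B row, so c0..c3 accumulate one 4x4 output tile per k step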
+        c0 += (float4)a0.s0 * b0;
+        c1 += (float4)a0.s1 * b0;
+        c2 += (float4)a0.s2 * b0;
+        c3 += (float4)a0.s3 * b0;
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = vload4(0, src_addr_a + 4 * V0);
+        b0 = vload4(0, src_addr_b + 4 * H0);
+
+        c0 += (float4)a0.s0 * b0;
+        c1 += (float4)a0.s1 * b0;
+        c2 += (float4)a0.s2 * b0;
+        c3 += (float4)a0.s3 * b0;
+    }
+
+    for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * V0, src_addr_b += 4 * H0)
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        float4 a0 = vload4(0, src_addr_a);
+        float4 b0 = vload4(0, src_addr_b);
+
+        c0 += (float4)a0.s0 * b0;
+        c1 += (float4)a0.s1 * b0;
+        c2 += (float4)a0.s2 * b0;
+        c3 += (float4)a0.s3 * b0;
+    }
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+    zout = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else  // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(4, float, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+    REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
+
+    LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, float, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(4, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
+                                    2) * src2_stride_z;
+
+    LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(4, float, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias
+    ADD_BLOCK(4, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, VEC_SIZE, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store 4x4 block
+    const bool cond_y = ((get_global_id(1) + 1) * 4 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * 4 >= N);
+    STORE_BLOCK_BOUNDARY_AWARE(4, 4, float, c, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+}
+
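An illustrative host-side configuration for the 4x4 F32 kernel above (editor's sketch; the concrete sizes are assumptions chosen to satisfy the kernel's @note requirements, not values from the source). For M = 63, N = 60, K = 20 with H0 = V0 = 4, the partial-store sizes follow the PARTIAL_STORE_M0 = M % M0 and PARTIAL_STORE_N0 = N % N0 convention of the boundary-aware store helpers:

    -DM=63 -DN=60 -DK=20 -DH0=4 -DV0=4
    -DPARTIAL_STORE_M0=3    // 63 % 4: the first row-block stores only 3 rows
    -DPARTIAL_STORE_N0=0    // 60 % 4: no partial block in x
    -DALPHA=0.5f            // optional: scales the A*B product
    -DACTIVATION_TYPE=RELU  // optional: fused after the (optional) bias addition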
+/** This OpenCL kernel is optimized for Bifrost and it computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
+ *
+ * @note The number of rows of destination matrix must be passed at compile time using -DM
+ * @note The number of columns of the destination matrix must be passed at compile time using -DN
+ * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note The optional alpha's value needs to be passed at compile time using -DALPHA
+ * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in] src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr                           (Optional) Pointer to the bias matrix.
Supported data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), + uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif //defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + int x = get_global_id(0) / H0; + int y = get_global_id(1) / V0; + int z = get_global_id(2); + + // Offset + const int offset_row_a = (get_global_id(1) % V0) * 4; + const int offset_row_b = (get_global_id(0) % H0) * 4; + + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes); + __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes); + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; + + // Reset accumulators + float4 c0 = 0.0f; + float4 c1 = 0.0f; + float4 c2 = 0.0f; + float4 c3 = 0.0f; + + int i = 0; + for(; i <= (int)(K - 4); i += 4) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = vload4(0, src_addr_a); + float4 b0 = vload4(0, src_addr_b); + + src_addr_a += 4 * V0; + src_addr_b += 4 * H0; + + c0.s0 = fma(a0.s0, b0.s0, c0.s0); + c0.s1 = fma(a0.s0, b0.s1, c0.s1); + c0.s2 = fma(a0.s0, b0.s2, c0.s2); + c0.s3 = fma(a0.s0, 
b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, b0.s3, c3.s3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload4(0, src_addr_b); + + src_addr_a += 4 * V0; + src_addr_b += 4 * H0; + + c0.s0 = fma(a0.s0, b0.s0, c0.s0); + c0.s1 = fma(a0.s0, b0.s1, c0.s1); + c0.s2 = fma(a0.s0, b0.s2, c0.s2); + c0.s3 = fma(a0.s0, b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, b0.s3, c3.s3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload4(0, src_addr_b); + + src_addr_a += 4 * V0; + src_addr_b += 4 * H0; + + c0.s0 = fma(a0.s0, b0.s0, c0.s0); + c0.s1 = fma(a0.s0, b0.s1, c0.s1); + c0.s2 = fma(a0.s0, b0.s2, c0.s2); + c0.s3 = fma(a0.s0, b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, b0.s3, c3.s3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload4(0, src_addr_b); + + src_addr_a += 4 * V0; + src_addr_b += 4 * H0; + + c0.s0 = fma(a0.s0, b0.s0, c0.s0); + c0.s1 = fma(a0.s0, b0.s1, c0.s1); + c0.s2 = fma(a0.s0, b0.s2, c0.s2); + c0.s3 = fma(a0.s0, b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, b0.s3, c3.s3); + } + + for(; i < (int)K; ++i) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = vload4(0, src_addr_a); + float4 b0 = vload4(0, src_addr_b); + + src_addr_a += 4 * V0; + src_addr_b += 4 * H0; + + c0.s0 = fma(a0.s0, b0.s0, c0.s0); + c0.s1 = fma(a0.s0, b0.s1, c0.s1); + c0.s2 = fma(a0.s0, b0.s2, c0.s2); + c0.s3 = fma(a0.s0, b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, 
b0.s3, c3.s3);
+    }
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+    zout = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else  // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(4, float, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+    REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
+
+    LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, float, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(4, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
+                                    2) * src2_stride_z;
+
+    LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(4, float, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias
+    ADD_BLOCK(4, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, VEC_SIZE, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store 4x4 block
+    const bool cond_y = ((get_global_id(1) + 1) * 4 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * 4 >= N);
+    STORE_BLOCK_BOUNDARY_AWARE(4, 4, float, c, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+}
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
+ *
+ * @note The number of rows of destination matrix must be passed at compile time using -DM
+ * @note The number of columns of the destination matrix must be passed at compile time using -DN
+ * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g.
-DPARTIAL_STORE_N0=1)
+ * @note The optional alpha's value needs to be passed at compile time using -DALPHA
+ * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in] src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr                           (Optional) Pointer to the bias matrix.
Supported data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), + uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif //defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + int x = get_global_id(0) / H0; + int y = get_global_id(1) / V0; + int z = get_global_id(2); + + // Offset + const int offset_row_a = (get_global_id(1) % V0) * 4; + const int offset_row_b = (get_global_id(0) % H0) * 8; + + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes); + __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes); + + // Compute end row address for matrix B + __global half *src_end_addr_b = src_addr_b + (src1_stride_y / sizeof(half)); + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; + + // Reset accumulators + half8 c0 = 0.0f; + half8 c1 = 0.0f; + half8 c2 = 0.0f; + half8 c3 = 0.0f; + + for(; src_addr_b <= (src_end_addr_b - (int)(16 * H0)); src_addr_a += 8 * V0, src_addr_b += 16 * H0) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + half4 a0 = vload4(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); + + c0 += (half8)a0.s0 * 
b0; + c1 += (half8)a0.s1 * b0; + c2 += (half8)a0.s2 * b0; + c3 += (half8)a0.s3 * b0; + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a + 4 * V0); + b0 = vload8(0, src_addr_b + 8 * H0); + + c0 += (half8)a0.s0 * b0; + c1 += (half8)a0.s1 * b0; + c2 += (half8)a0.s2 * b0; + c3 += (half8)a0.s3 * b0; + } + + for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * V0, src_addr_b += 8 * H0) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + half4 a0 = vload4(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); + + c0 += (half8)a0.s0 * b0; + c1 += (half8)a0.s1 * b0; + c2 += (half8)a0.s2 * b0; + c3 += (half8)a0.s3 * b0; + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension + // in order to take into account the presence of possible cross plane paddings + // + // |                  | + // |      plane0      | + // |                  | + // |__________________| + // |******************| + // |  cross_plane_pad | + // |******************| + // |                  | + // |      plane1      | + // |                  | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else  // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(4, half, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); + + LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, half, bias, BETA); +#endif // UNIT_BETA + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(4, c, bias0); + +#else // defined(BROADCAST_BIAS) + + __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id( + 2) * src2_stride_z; + + LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(4, half, bias, BETA); +#endif // UNIT_BETA + + // c = c + bias + ADD_BLOCK(4, c, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, VEC_SIZE, c, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store 4x8 block + const bool cond_y = ((get_global_id(1) + 1) * 4 >= M); + const bool cond_x = ((get_global_id(0) + 1) * 8 >= N); + STORE_BLOCK_BOUNDARY_AWARE(4, 8, half, c, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); +}
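Everything these reshaped kernels need is supplied through the -D options listed in their documentation. As a rough illustration only, here is a minimal host-side sketch of building this file with those options using the standard OpenCL C API; the tile sizes and variable names below are invented for the example and would in practice have to match the reshape actually performed on A and B.

    /* Hypothetical sketch: compiling gemm.cl so that, e.g.,
     * gemm_mm_interleaved_transposed_f16 can be created from it.
     * M/N/K/H0/V0 and the PARTIAL_STORE_* values are illustrative. */
    #include <CL/cl.h>

    cl_program build_reshaped_f16_gemm(cl_context ctx, cl_device_id dev, const char *gemm_cl_source)
    {
        cl_int err = CL_SUCCESS;
        cl_program prog = clCreateProgramWithSource(ctx, 1, &gemm_cl_source, NULL, &err);
        if (err != CL_SUCCESS)
        {
            return NULL;
        }
        /* M/N/K describe the un-reshaped problem; H0/V0 must match the values
         * used when reshaping B and A; PARTIAL_STORE_* guard the boundary stores. */
        const char *opts = "-DM=64 -DN=128 -DK=256 -DH0=4 -DV0=2 "
                           "-DPARTIAL_STORE_M0=1 -DPARTIAL_STORE_N0=1 "
                           "-DARM_COMPUTE_OPENCL_FP16_ENABLED";
        if (clBuildProgram(prog, 1, &dev, opts, NULL, NULL) != CL_SUCCESS)
        {
            clReleaseProgram(prog);
            return NULL;
        }
        return prog;
    }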
+ +/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1) while accumulating the result in a 32-bit floating point variable. + * + * @note The number of rows of destination matrix must be passed at compile time using -DM + * @note The number of columns of the destination matrix must be passed at compile time using -DN + * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK + * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) + * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) + * @note The optional alpha's value need to be passed at compile time using -DALPHA + * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2) + * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2) + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) + *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. + *       The activation function is performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time: + *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. + *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16 + * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes) + * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes) + * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr + * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes) + * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes) + * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix.
Supported data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), + uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif //defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + int x = get_global_id(0) / H0; + int y = get_global_id(1) / V0; + int z = get_global_id(2); + + // Offset + const int offset_row_a = (get_global_id(1) % V0) * 4; + const int offset_row_b = (get_global_id(0) % H0) * 8; + + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes); + __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes); + + // Compute end row address for matrix B + __global half *src_end_addr_b = src_addr_b + (src1_stride_y / sizeof(half)); + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; + + // Reset accumulators + float8 c0 = 0.0f; + float8 c1 = 0.0f; + float8 c2 = 0.0f; + float8 c3 = 0.0f; + + for(; src_addr_b <= (src_end_addr_b - (int)(16 * H0)); src_addr_a += 8 * V0, src_addr_b += 16 * H0) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = convert_float4(vload4(0, src_addr_a)); + float8 b0 = 
convert_float8(vload8(0, src_addr_b)); + + c0 += (float8)a0.s0 * b0; + c1 += (float8)a0.s1 * b0; + c2 += (float8)a0.s2 * b0; + c3 += (float8)a0.s3 * b0; + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = convert_float4(vload4(0, src_addr_a + 4 * V0)); + b0 = convert_float8(vload8(0, src_addr_b + 8 * H0)); + + c0 += (float8)a0.s0 * b0; + c1 += (float8)a0.s1 * b0; + c2 += (float8)a0.s2 * b0; + c3 += (float8)a0.s3 * b0; + } + + for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * V0, src_addr_b += 8 * H0) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = convert_float4(vload4(0, src_addr_a)); + float8 b0 = convert_float8(vload8(0, src_addr_b)); + + c0 += (float8)a0.s0 * b0; + c1 += (float8)a0.s1 * b0; + c2 += (float8)a0.s2 * b0; + c3 += (float8)a0.s3 * b0; + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension + // in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else  // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(4, float, c, ALPHA); +#endif // defined(ALPHA) + +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); + + LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + + float8 bias_f0 = convert_float8(bias0); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, float, bias_f, BETA); +#endif // UNIT_BETA + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(4, c, bias_f0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id( + 2) * src2_stride_z; + + LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + + float8 bias_f0 = convert_float8(bias0); + float8 bias_f1 = convert_float8(bias1); + float8 bias_f2 = convert_float8(bias2); + float8 bias_f3 = convert_float8(bias3); + +#ifndef UNIT_BETA + SCALE_BLOCK(4, float, bias_f, BETA); +#endif // UNIT_BETA + + // c = c + bias + ADD_BLOCK(4, c, bias_f); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + + half8 c_h0 = convert_half8(c0); + half8 c_h1 = convert_half8(c1); + half8 c_h2 = convert_half8(c2); + half8 c_h3 = convert_half8(c3); + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, VEC_SIZE, c_h, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store 4x8 block + const bool cond_y = ((get_global_id(1) + 1) * 4 >= M); + const bool cond_x = ((get_global_id(0) + 1) * 8 >= N); + STORE_BLOCK_BOUNDARY_AWARE(4, 8, half, c_h, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); +} + +/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1) + * + * @note The number of rows of destination matrix must be passed at compile time using -DM + * @note The number of columns of the destination matrix must be passed at compile time using -DN + * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK + * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) + * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) + * @note The optional alpha's value need to be passed at compile time using -DALPHA + * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2) + * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2) + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g.
-DMATRIX_B_DEPTH=16) + * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. + * The activation function is performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported data type: same as @p lhs_ptr + * @param[in]  src2_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes) + * @param[in]  src2_step_x                        (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in]  src2_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes) + * @param[in]  src2_step_y                        (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix + * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr + * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes) + * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes) + * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix + * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes) + * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes) + * @param[in]  src2_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes) + * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes) + * @param[in]  cross_plane_pad                    (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), + uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif //defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + int x = get_global_id(0) / H0; + int y = get_global_id(1) / V0; + int z = get_global_id(2); + + // Offset + const int offset_row_a = (get_global_id(1) % V0) * 4; + const int offset_row_b = (get_global_id(0) % H0) * 8; + + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else  // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes); + __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes); + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; + + // Reset accumulators + half8 c0 = 0.0f; + half8 c1 = 0.0f; + half8 c2 = 0.0f; + half8 c3 = 0.0f; + + int i = 0; + for(; i <= (int)(K - 4); i += 4) + { +#if V0 == 1 + // Load values from matrix A (interleaved) and matrix B (transposed) + half8 a0 = vload8(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); + + src_addr_a += 8 * V0; + src_addr_b += 8 * H0; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix B (transposed) + b0 = vload8(0, src_addr_b); + +
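+ // b0 now holds the B values for the next K step, while the upper half of a0
+ // (.s4-.s7) holds the matching A values for the same four interleaved rows:
+ // with V0 == 1 a single 8-wide load from A feeds two 8-wide loads from B.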
src_addr_b += 8 * H0; + + c0 = fma((half8)a0.s4, b0, c0); + c1 = fma((half8)a0.s5, b0, c1); + c2 = fma((half8)a0.s6, b0, c2); + c3 = fma((half8)a0.s7, b0, c3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload8(0, src_addr_a); + b0 = vload8(0, src_addr_b); + + src_addr_a += 8 * V0; + src_addr_b += 8 * H0; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix B (transposed) + b0 = vload8(0, src_addr_b); + + src_addr_b += 8 * H0; + + c0 = fma((half8)a0.s4, b0, c0); + c1 = fma((half8)a0.s5, b0, c1); + c2 = fma((half8)a0.s6, b0, c2); + c3 = fma((half8)a0.s7, b0, c3); +#else // V0 == 1 + // Load values from matrix A (interleaved) and matrix B (transposed) + half4 a0 = vload4(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * V0; + src_addr_b += 8 * H0; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * V0; + src_addr_b += 8 * H0; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * V0; + src_addr_b += 8 * H0; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * V0; + src_addr_b += 8 * H0; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); +#endif // V0 == 1 + } + + for(; i < (int)K; ++i) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + half4 a0 = vload4(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * V0; + src_addr_b += 8 * H0; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension + // in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else  // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(4, half, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); + + LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, half, bias, BETA); +#endif // UNIT_BETA + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(4, c, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id( + 2) * src2_stride_z; + + LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(4, half, bias, BETA); +#endif // UNIT_BETA + + // c = c + bias + ADD_BLOCK(4, c, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, VEC_SIZE, c, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store 4x8 block + const bool cond_y = ((get_global_id(1) + 1) * 4 >= M); + const bool cond_x = ((get_global_id(0) + 1) * 8 >= N); + STORE_BLOCK_BOUNDARY_AWARE(4, 8, half, c, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); +} + +#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) + +#endif // defined(M) && defined(N) && defined(K) && defined(H0) && defined(V0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) + +#if defined(N) && defined(K) && defined(M0) && defined(N0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) +#if defined(DATA_TYPE) +#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, N0) +/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped. + * + * @note This OpenCL kernel works with floating point data types (F16/F32) + * @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) + * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0 + * @note The number of columns of matrix A and the number of columns of the matrix B need to be passed at compile time using -DK and -DN + * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) + * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) + * @note The optional alpha's value need to be passed at compile time using -DALPHA + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g.
-DMATRIX_B_DEPTH=16) + * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. + * The activation function is performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), + uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif //defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint src_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + int idx = get_global_id(0) * N0; + + // Compute starting address for matrix A and Matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Update address for the matrix A + src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y; + + // Update address for the matrix B + src_addr.s1 += idx * sizeof(DATA_TYPE); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension + // in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zin) is calculated dividing row by HEIGHT_GEMM3D + uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D; + zin = min(DEPTH_GEMM3D - 1, zin); + + // Add offset due to the cross plane paddings + zin *= (src_cross_plane_pad * src0_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + int end_row_vec_a = src_addr.s0 + (K * sizeof(DATA_TYPE)); + + VECTOR_TYPE acc0 = 0.0f; +#if M0 > 1 + VECTOR_TYPE acc1 = 0.0f; +#endif // M0 > 1 +#if M0 > 2 + VECTOR_TYPE acc2 = 0.0f; +#endif // M0 > 2 +#if M0 > 3 + VECTOR_TYPE acc3 = 0.0f; +#endif // M0 > 3 + + for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y)) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + LOAD_BLOCK(M0, 2, DATA_TYPE, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s); +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + VEC_DATA_TYPE(DATA_TYPE, 2) + a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if M0 > 1 + VEC_DATA_TYPE(DATA_TYPE, 2) + a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // M0 > 1 +#if M0 > 2 + VEC_DATA_TYPE(DATA_TYPE, 2) + a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // M0 > 2 +#if M0 > 3 + VEC_DATA_TYPE(DATA_TYPE, 2) + a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // M0 > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + VECTOR_TYPE b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1)); + VECTOR_TYPE b1 = VLOAD(N0)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y)); + + // Accumulate + acc0 += b0 * (VECTOR_TYPE)a0.s0; + acc0 += b1 * (VECTOR_TYPE)a0.s1; +#if M0 > 1 + acc1 += b0 * (VECTOR_TYPE)a1.s0; + acc1 += b1 * (VECTOR_TYPE)a1.s1; +#endif // M0 > 1 +#if M0 > 2 + acc2 += b0 * (VECTOR_TYPE)a2.s0; + acc2 += b1 * (VECTOR_TYPE)a2.s1; +#endif // M0 > 2 +#if M0 > 3 + acc3 += b0 * (VECTOR_TYPE)a3.s0; + acc3 += b1 * (VECTOR_TYPE)a3.s1; +#endif // M0 > 3 + } + + for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y)) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); +#if M0 > 1 + DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#endif // M0 > 1 +#if M0 > 2 + DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#endif // M0 > 2 +#if M0 > 3 + DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); +#endif // M0 > 3 +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if M0 > 1 + DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // M0 > 1 +#if M0 > 2 + DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // M0 > 2 +#if M0 > 3 + DATA_TYPE a3 = 
*((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // M0 > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + VECTOR_TYPE b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1)); + + // Accumulate + acc0 += b0 * (VECTOR_TYPE)a0; +#if M0 > 1 + acc1 += b0 * (VECTOR_TYPE)a1; +#endif // M0 > 1 +#if M0 > 2 + acc2 += b0 * (VECTOR_TYPE)a2; +#endif // M0 > 2 +#if M0 > 3 + acc3 += b0 * (VECTOR_TYPE)a3; +#endif // M0 > 3 + } + + int z = get_global_id(2); + + // Compute dst address + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, + PARTIAL_STORE_M0) + * dst_stride_y); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension + // in order to take into account the presence of possible cross plane paddings + // + // |                  | + // |      plane0      | + // |                  | + // |__________________| + // |******************| + // |  cross_plane_pad | + // |******************| + // |                  | + // |      plane1      | + // |                  | + // |__________________| + + // The plane (zout) is calculated dividing row by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else  // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, DATA_TYPE, acc, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, N0, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BETA + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(M0, acc, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, + PARTIAL_STORE_M0) + * src2_stride_y) + + z * src2_stride_z; + + LOAD_BLOCK(M0, N0, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); +#endif // UNIT_BETA + + // c = c + bias + ADD_BLOCK(M0, acc, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store output block + const bool cond_y = get_global_id(1) == 0; + const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, acc, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); +} +#endif // defined(DATA_TYPE)
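The generic kernel above writes one M0 x N0 output tile per work-item, and PARTIAL_STORE_M0 / PARTIAL_STORE_N0 describe the leftover tile at the bottom and right edges. A small standalone C sketch of that arithmetic follows; the helper and the sizes are invented for illustration and are not taken from the library.

    /* Hypothetical sketch: how an M0 x N0 tiling maps onto a 2D NDRange and
     * what the PARTIAL_STORE_* compile-time constants would correspond to. */
    #include <stdio.h>

    static unsigned int ceil_div(unsigned int a, unsigned int b)
    {
        return (a + b - 1) / b;
    }

    int main(void)
    {
        const unsigned int M = 102, N = 30, M0 = 4, N0 = 4; /* illustrative sizes */
        /* One work-item per output tile. */
        const unsigned int gws_x = ceil_div(N, N0); /* 8  */
        const unsigned int gws_y = ceil_div(M, M0); /* 26 */
        /* Leftover (partial) tile sizes at the right and bottom edges; these
         * are what -DPARTIAL_STORE_N0 / -DPARTIAL_STORE_M0 would encode. */
        const unsigned int partial_n0 = N % N0; /* 2 */
        const unsigned int partial_m0 = M % M0; /* 2 */
        printf("gws = {%u, %u}, partial = {%u, %u}\n", gws_x, gws_y, partial_n0, partial_m0);
        return 0;
    }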
+ +/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped + * + * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units. + * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0. + * @note This kernel processes a fixed number of elements along x: -DN0=4. + * @note The number of columns of matrix A and the number of columns of the matrix B need to be passed at compile time using -DK and -DN + * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) + * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) + * @note The optional alpha's value need to be passed at compile time using -DALPHA + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) + *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. + *       The activation function is performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time: + *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. + *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32 + * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes) + * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes) + * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr + * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes) + * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes) + * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix.
Supported data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), + uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif //defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint src_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + int idx = get_global_id(0) * N0; + + // Compute starting address for matrix A and matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Update address for matrix A + src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y; + + // Update address for matrix B + src_addr.s1 += idx * sizeof(float); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension + // in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zin) is calculated dividing row by HEIGHT_GEMM3D + uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D; + zin = min(DEPTH_GEMM3D - 1, zin); + + // Add offset due to the cross plane paddings + zin *= (src_cross_plane_pad * src0_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + // Initialize accumulators + float4 acc0 = 0.0f; + +#if M0 > 1 + float4 acc1 = 0.0f; +#endif // M0 > 1 + +#if M0 > 2 + float4 acc2 = 0.0f; +#endif // M0 > 2 + +#if M0 > 3 + float4 acc3 = 0.0f; +#endif // M0 > 3 + + // A and B src indices get incremented at the same time. + int i = 0; + for(; i <= ((int)K - 4); i += 4) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A and matrix B + LOAD_BLOCK(M0, 4, float, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s); +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A and matrix B + float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if M0 > 1 + float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // M0 > 1 +#if M0 > 2 + float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // M0 > 2 +#if M0 > 3 + float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // M0 > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s0, b0.s0, acc0.s0); + acc0.s1 = fma(a0.s0, b0.s1, acc0.s1); + acc0.s2 = fma(a0.s0, b0.s2, acc0.s2); + acc0.s3 = fma(a0.s0, b0.s3, acc0.s3); + +#if M0 > 1 + + acc1.s0 = fma(a1.s0, b0.s0, acc1.s0); + acc1.s1 = fma(a1.s0, b0.s1, acc1.s1); + acc1.s2 = fma(a1.s0, b0.s2, acc1.s2); + acc1.s3 = fma(a1.s0, b0.s3, acc1.s3); + +#endif // M0 > 1 +#if M0 > 2 + + acc2.s0 = fma(a2.s0, b0.s0, acc2.s0); + acc2.s1 = fma(a2.s0, b0.s1, acc2.s1); + acc2.s2 = fma(a2.s0, b0.s2, acc2.s2); + acc2.s3 = fma(a2.s0, b0.s3, acc2.s3); + +#endif // M0 > 2 +#if M0 > 3 + + acc3.s0 = fma(a3.s0, b0.s0, acc3.s0); + acc3.s1 = fma(a3.s0, b0.s1, acc3.s1); + acc3.s2 = fma(a3.s0, b0.s2, acc3.s2); + acc3.s3 = fma(a3.s0, b0.s3, acc3.s3); +#endif // M0 > 3 + + // Load values from matrix A and matrix B + b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s1, b0.s0, acc0.s0); + acc0.s1 = fma(a0.s1, b0.s1, acc0.s1); + acc0.s2 = fma(a0.s1, b0.s2, acc0.s2); + acc0.s3 = fma(a0.s1, b0.s3, acc0.s3); + +#if M0 > 1 + + acc1.s0 = fma(a1.s1, b0.s0, acc1.s0); + acc1.s1 = fma(a1.s1, b0.s1, acc1.s1); + acc1.s2 = fma(a1.s1, b0.s2, acc1.s2); + acc1.s3 = fma(a1.s1, b0.s3, acc1.s3); + +#endif // M0 > 1 +#if M0 > 2 + + acc2.s0 = fma(a2.s1, b0.s0, acc2.s0); + acc2.s1 = fma(a2.s1, b0.s1, acc2.s1); + acc2.s2 = fma(a2.s1, b0.s2, acc2.s2); + acc2.s3 = fma(a2.s1, b0.s3, acc2.s3); + +#endif // M0 > 2 +#if M0 > 3 + + acc3.s0 = fma(a3.s1, b0.s0, acc3.s0); + acc3.s1 = fma(a3.s1, b0.s1, acc3.s1); + acc3.s2 = fma(a3.s1, b0.s2, acc3.s2); + acc3.s3 = fma(a3.s1, b0.s3, acc3.s3); +#endif // M0 > 3 + + // Load values from matrix A and 
matrix B + b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s2, b0.s0, acc0.s0); + acc0.s1 = fma(a0.s2, b0.s1, acc0.s1); + acc0.s2 = fma(a0.s2, b0.s2, acc0.s2); + acc0.s3 = fma(a0.s2, b0.s3, acc0.s3); + +#if M0 > 1 + + acc1.s0 = fma(a1.s2, b0.s0, acc1.s0); + acc1.s1 = fma(a1.s2, b0.s1, acc1.s1); + acc1.s2 = fma(a1.s2, b0.s2, acc1.s2); + acc1.s3 = fma(a1.s2, b0.s3, acc1.s3); + +#endif // M0 > 1 +#if M0 > 2 + + acc2.s0 = fma(a2.s2, b0.s0, acc2.s0); + acc2.s1 = fma(a2.s2, b0.s1, acc2.s1); + acc2.s2 = fma(a2.s2, b0.s2, acc2.s2); + acc2.s3 = fma(a2.s2, b0.s3, acc2.s3); + +#endif // M0 > 2 +#if M0 > 3 + + acc3.s0 = fma(a3.s2, b0.s0, acc3.s0); + acc3.s1 = fma(a3.s2, b0.s1, acc3.s1); + acc3.s2 = fma(a3.s2, b0.s2, acc3.s2); + acc3.s3 = fma(a3.s2, b0.s3, acc3.s3); +#endif // M0 > 3 + + // Load values from matrix A and matrix B + b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s3, b0.s0, acc0.s0); + acc0.s1 = fma(a0.s3, b0.s1, acc0.s1); + acc0.s2 = fma(a0.s3, b0.s2, acc0.s2); + acc0.s3 = fma(a0.s3, b0.s3, acc0.s3); + +#if M0 > 1 + + acc1.s0 = fma(a1.s3, b0.s0, acc1.s0); + acc1.s1 = fma(a1.s3, b0.s1, acc1.s1); + acc1.s2 = fma(a1.s3, b0.s2, acc1.s2); + acc1.s3 = fma(a1.s3, b0.s3, acc1.s3); + +#endif // M0 > 1 +#if M0 > 2 + + acc2.s0 = fma(a2.s3, b0.s0, acc2.s0); + acc2.s1 = fma(a2.s3, b0.s1, acc2.s1); + acc2.s2 = fma(a2.s3, b0.s2, acc2.s2); + acc2.s3 = fma(a2.s3, b0.s3, acc2.s3); + +#endif // M0 > 2 +#if M0 > 3 + + acc3.s0 = fma(a3.s3, b0.s0, acc3.s0); + acc3.s1 = fma(a3.s3, b0.s1, acc3.s1); + acc3.s2 = fma(a3.s3, b0.s2, acc3.s2); + acc3.s3 = fma(a3.s3, b0.s3, acc3.s3); +#endif // M0 > 3 + + src_addr.s0 += 4 * sizeof(float); + } + + for(; i < (int)K; ++i) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); +#if M0 > 1 + float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#endif // M0 > 1 +#if M0 > 2 + float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#endif // M0 > 2 +#if M0 > 3 + float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); +#endif // M0 > 3 +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if M0 > 1 + float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // M0 > 1 +#if M0 > 2 + float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // M0 > 2 +#if M0 > 3 + float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // M0 > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0, b0.s0, acc0.s0); + acc0.s1 = fma(a0, b0.s1, acc0.s1); + acc0.s2 = fma(a0, b0.s2, acc0.s2); + acc0.s3 = fma(a0, b0.s3, acc0.s3); +#if M0 > 1 + acc1.s0 = fma(a1, b0.s0, acc1.s0); + acc1.s1 = fma(a1, b0.s1, acc1.s1); + acc1.s2 = fma(a1, b0.s2, acc1.s2); + acc1.s3 = fma(a1, b0.s3, acc1.s3); +#endif // M0 > 1 +#if M0 > 2 + acc2.s0 = fma(a2, b0.s0, acc2.s0); + acc2.s1 = fma(a2, b0.s1, acc2.s1); + acc2.s2 = fma(a2, b0.s2, acc2.s2); + acc2.s3 = fma(a2, b0.s3, 
acc2.s3); +#endif // M0 > 2 +#if M0 > 3 + acc3.s0 = fma(a3, b0.s0, acc3.s0); + acc3.s1 = fma(a3, b0.s1, acc3.s1); + acc3.s2 = fma(a3, b0.s2, acc3.s2); + acc3.s3 = fma(a3, b0.s3, acc3.s3); +#endif // M0 > 3 + + src_addr.s0 += sizeof(float); + } + + int z = get_global_id(2); + + // Compute dst address + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, + PARTIAL_STORE_M0) * dst_stride_y); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension + // in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing row by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, float, acc, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)); + + LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, float, bias, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias[broadcasted] + ADD_BLOCK_BROADCAST(M0, acc, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, + PARTIAL_STORE_M0) + * src2_stride_y) + + z * src2_stride_z; + + LOAD_BLOCK(M0, 4, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, float, bias, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias + ADD_BLOCK(M0, acc, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, float, VEC_SIZE, acc, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store the output block + const bool cond_y = get_global_id(1) == 0; + const bool cond_x = ((get_global_id(0) + 1) * 4 >= N); + STORE_BLOCK_BOUNDARY_AWARE(M0, 4, float, acc, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); +} + +/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped + * + * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units. 
+ *       This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less than or equal to 1000.
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0.
+ * @note This kernel processes a fixed number of elements along x: -DN0=2.
+ * @note The number of columns of matrix A and the number of columns of the matrix B need to be passed at compile time using -DK and -DN
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note The optional alpha value needs to be passed at compile time using -DALPHA
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the input or output has to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
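+ *
+ * As an illustrative sketch only (the concrete values below are assumptions chosen for the example, not requirements stated by this file), a host program might build this kernel for an M = 10, N = 999, K = 256 problem with options such as:
+ *
+ *     "-DM0=4 -DN0=2 -DK=256 -DN=999 -DPARTIAL_STORE_M0=2 -DPARTIAL_STORE_N0=1 -DALPHA=0.5f"
+ *
+ * where PARTIAL_STORE_M0 and PARTIAL_STORE_N0 describe the leftover tile stored at the bottom/right edges when M and N are not multiples of M0 and N0.
+ *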
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
+                                                      IMAGE_DECLARATION(src1),
+#if defined(BETA)
+                                                      IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+                                                      IMAGE_DECLARATION(dst),
+                                                      uint src0_stride_z,
+                                                      uint src1_stride_z,
+#if defined(BETA)
+                                                      uint src2_stride_z,
+#endif //defined(BETA)
+                                                      uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                                      ,
+                                                      uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                      ,
+                                                      uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                      )
+{
+    // Requires 2 N0, C vect2, A vect4, B (2 vload2) // to fix for M0 > 1
+    int idx = get_global_id(0) * N0;
+
+    // Compute starting address for matrix A and matrix B
+    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+    // Update address for the matrix A
+    src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y;
+
+    // Update address for the matrix B
+    src_addr.s1 += idx * sizeof(float);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zin) is calculated by dividing the row by HEIGHT_GEMM3D
+    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
+    zin = min(DEPTH_GEMM3D - 1, zin);
+
+    // Add offset due to the cross plane paddings
+    zin *= (src_cross_plane_pad *
src0_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + // Initialize accumulators + float2 acc0 = 0.0f; +#if M0 > 1 + float2 acc1 = 0.0f; +#endif // M0 > 1 +#if M0 > 2 + float2 acc2 = 0.0f; +#endif // M0 > 2 +#if M0 > 3 + float2 acc3 = 0.0f; +#endif // M0 > 3 + + // A and B src indices get incremented at the same time. + int i = 0; + for(; i <= ((int)K - 8); i += 8) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0)); +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0)); +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s0, b0.s0, acc0.s0); + acc0.s0 = fma(a0.s1, b1.s0, acc0.s0); + acc0.s0 = fma(a0.s2, b2.s0, acc0.s0); + acc0.s0 = fma(a0.s3, b3.s0, acc0.s0); + acc0.s0 = fma(a0.s4, b4.s0, acc0.s0); + acc0.s0 = fma(a0.s5, b5.s0, acc0.s0); + acc0.s0 = fma(a0.s6, b6.s0, acc0.s0); + acc0.s0 = fma(a0.s7, b7.s0, acc0.s0); + + acc0.s1 = fma(a0.s0, b0.s1, acc0.s1); + acc0.s1 = fma(a0.s1, b1.s1, acc0.s1); + acc0.s1 = fma(a0.s2, b2.s1, acc0.s1); + acc0.s1 = fma(a0.s3, b3.s1, acc0.s1); + acc0.s1 = fma(a0.s4, b4.s1, acc0.s1); + acc0.s1 = fma(a0.s5, b5.s1, acc0.s1); + acc0.s1 = fma(a0.s6, b6.s1, acc0.s1); + acc0.s1 = fma(a0.s7, b7.s1, acc0.s1); + +#if M0 > 1 +#if defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#else // defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // defined(REINTERPRET_INPUT_AS_3D) + acc1.s0 = fma(a0.s0, b0.s0, acc1.s0); + acc1.s0 = fma(a0.s1, b1.s0, acc1.s0); + acc1.s0 = fma(a0.s2, b2.s0, acc1.s0); + acc1.s0 = fma(a0.s3, b3.s0, acc1.s0); + acc1.s0 = fma(a0.s4, b4.s0, acc1.s0); + acc1.s0 = fma(a0.s5, b5.s0, acc1.s0); + acc1.s0 = fma(a0.s6, b6.s0, acc1.s0); + acc1.s0 = fma(a0.s7, b7.s0, acc1.s0); + + acc1.s1 = fma(a0.s0, b0.s1, 
acc1.s1); + acc1.s1 = fma(a0.s1, b1.s1, acc1.s1); + acc1.s1 = fma(a0.s2, b2.s1, acc1.s1); + acc1.s1 = fma(a0.s3, b3.s1, acc1.s1); + acc1.s1 = fma(a0.s4, b4.s1, acc1.s1); + acc1.s1 = fma(a0.s5, b5.s1, acc1.s1); + acc1.s1 = fma(a0.s6, b6.s1, acc1.s1); + acc1.s1 = fma(a0.s7, b7.s1, acc1.s1); +#endif // M0 > 1 +#if M0 > 2 +#if defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#else // defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // defined(REINTERPRET_INPUT_AS_3D) + acc2.s0 = fma(a0.s0, b0.s0, acc2.s0); + acc2.s0 = fma(a0.s1, b1.s0, acc2.s0); + acc2.s0 = fma(a0.s2, b2.s0, acc2.s0); + acc2.s0 = fma(a0.s3, b3.s0, acc2.s0); + acc2.s0 = fma(a0.s4, b4.s0, acc2.s0); + acc2.s0 = fma(a0.s5, b5.s0, acc2.s0); + acc2.s0 = fma(a0.s6, b6.s0, acc2.s0); + acc2.s0 = fma(a0.s7, b7.s0, acc2.s0); + + acc2.s1 = fma(a0.s0, b0.s1, acc2.s1); + acc2.s1 = fma(a0.s1, b1.s1, acc2.s1); + acc2.s1 = fma(a0.s2, b2.s1, acc2.s1); + acc2.s1 = fma(a0.s3, b3.s1, acc2.s1); + acc2.s1 = fma(a0.s4, b4.s1, acc2.s1); + acc2.s1 = fma(a0.s5, b5.s1, acc2.s1); + acc2.s1 = fma(a0.s6, b6.s1, acc2.s1); + acc2.s1 = fma(a0.s7, b7.s1, acc2.s1); +#endif // M0 > 2 +#if M0 > 3 +#if defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); +#else // defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // defined(REINTERPRET_INPUT_AS_3D) + acc3.s0 = fma(a0.s0, b0.s0, acc3.s0); + acc3.s0 = fma(a0.s1, b1.s0, acc3.s0); + acc3.s0 = fma(a0.s2, b2.s0, acc3.s0); + acc3.s0 = fma(a0.s3, b3.s0, acc3.s0); + acc3.s0 = fma(a0.s4, b4.s0, acc3.s0); + acc3.s0 = fma(a0.s5, b5.s0, acc3.s0); + acc3.s0 = fma(a0.s6, b6.s0, acc3.s0); + acc3.s0 = fma(a0.s7, b7.s0, acc3.s0); + + acc3.s1 = fma(a0.s0, b0.s1, acc3.s1); + acc3.s1 = fma(a0.s1, b1.s1, acc3.s1); + acc3.s1 = fma(a0.s2, b2.s1, acc3.s1); + acc3.s1 = fma(a0.s3, b3.s1, acc3.s1); + acc3.s1 = fma(a0.s4, b4.s1, acc3.s1); + acc3.s1 = fma(a0.s5, b5.s1, acc3.s1); + acc3.s1 = fma(a0.s6, b6.s1, acc3.s1); + acc3.s1 = fma(a0.s7, b7.s1, acc3.s1); +#endif // M0 > 3 + + src_addr.s0 += sizeof(float) * 8; + } + // float size increment + for(; i < (int)K; ++i) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); +#if M0 > 1 + float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#endif // M0 > 1 +#if M0 > 2 + float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#endif // M0 > 2 +#if M0 > 3 + float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); +#endif // M0 > 3 +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if M0 > 1 + float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // M0 > 1 +#if M0 > 2 + float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // M0 > 2 +#if M0 > 3 + float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // M0 > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + 
acc0.s0 = fma(a0, b0.s0, acc0.s0); + acc0.s1 = fma(a0, b0.s1, acc0.s1); +#if M0 > 1 + acc1.s0 = fma(a1, b0.s0, acc1.s0); + acc1.s1 = fma(a1, b0.s1, acc1.s1); +#endif // M0 > 1 +#if M0 > 2 + acc2.s0 = fma(a2, b0.s0, acc2.s0); + acc2.s1 = fma(a2, b0.s1, acc2.s1); +#endif // M0 > 2 +#if M0 > 3 + acc3.s0 = fma(a3, b0.s0, acc3.s0); + acc3.s1 = fma(a3, b0.s1, acc3.s1); +#endif // M0 > 3 + + src_addr.s0 += sizeof(float); + } + + int z = get_global_id(2); + + // Compute dst address + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, + PARTIAL_STORE_M0) * dst_stride_y); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension + // in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing row by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, float, acc, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)); + + LOAD_BLOCK(1, 2, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, float, bias, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias[broadcasted] + ADD_BLOCK_BROADCAST(M0, acc, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, + PARTIAL_STORE_M0) + * src2_stride_y) + + z * src2_stride_z; + + LOAD_BLOCK(M0, 2, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, float, bias, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias + ADD_BLOCK(M0, acc, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, float, VEC_SIZE, acc, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store the output block + const bool cond_y = get_global_id(1) == 0; + const bool cond_x = ((get_global_id(0) + 1) * 2 >= N); + STORE_BLOCK_BOUNDARY_AWARE(M0, 2, float, acc, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); +} + +#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) +/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B 
(src1) in case both matrices have not beed reshaped + * + * @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulating the result in a 32 floating point variable. + * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0. + * @note This kernel processed a fixed number of elements along x: -DN0=8. + * @note The number of columns of matrix A and the number of columns of the matrix B need to be passed at compile time using -DK and -DN + * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) + * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) + * @note The optional alpha's value need to be passed at compile time using -DALPHA + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) + * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. + * The activation function is performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
+                                                       IMAGE_DECLARATION(src1),
+#if defined(BETA)
+                                                       IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+                                                       IMAGE_DECLARATION(dst),
+                                                       uint src0_stride_z,
+                                                       uint src1_stride_z,
+#if defined(BETA)
+                                                       uint src2_stride_z,
+#endif //defined(BETA)
+                                                       uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                                       ,
+                                                       uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                       ,
+                                                       uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                       )
+{
+    int idx = get_global_id(0) * N0;
+
+    // Compute starting address for matrix A and matrix B
+    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+    // Update address for the matrix A
+    src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y;
+
+    // Update address for the matrix B
+    src_addr.s1 += idx * sizeof(half);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zin) is calculated by dividing the row by HEIGHT_GEMM3D
+    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
+    zin = min(DEPTH_GEMM3D - 1, zin);
+
+    // Add offset due to the cross plane paddings
+    zin *= (src_cross_plane_pad * src0_stride_y);
+
+    // Add offset for batched GEMM.
The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + float8 acc0 = 0.0h; +#if M0 > 1 + float8 acc1 = 0.0h; +#endif // M0 > 1 +#if M0 > 2 + float8 acc2 = 0.0h; +#endif // M0 > 2 +#if M0 > 3 + float8 acc3 = 0.0h; +#endif // M0 > 3 + + int i = 0; + for(; i <= ((int)K - 4); i += 4) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + LOAD_BLOCK(M0, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s); +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if M0 > 1 + half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // M0 > 1 +#if M0 > 2 + half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // M0 > 2 +#if M0 > 3 + half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // M0 > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); + src_addr.s1 += src1_stride_y; + + // Accumulate + acc0 = fma(b0, (float8)a0.s0, acc0); +#if M0 > 1 + acc1 = fma(b0, (float8)a1.s0, acc1); +#endif // M0 > 1 +#if M0 > 2 + acc2 = fma(b0, (float8)a2.s0, acc2); +#endif // M0 > 2 +#if M0 > 3 + acc3 = fma(b0, (float8)a3.s0, acc3); +#endif // M0 > 3 + + b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (float8)a0.s1, acc0); +#if M0 > 1 + acc1 = fma(b0, (float8)a1.s1, acc1); +#endif // M0 > 1 +#if M0 > 2 + acc2 = fma(b0, (float8)a2.s1, acc2); +#endif // M0 > 2 +#if M0 > 3 + acc3 = fma(b0, (float8)a3.s1, acc3); +#endif // M0 > 3 + + b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (float8)a0.s2, acc0); +#if M0 > 1 + acc1 = fma(b0, (float8)a1.s2, acc1); +#endif // M0 > 1 +#if M0 > 2 + acc2 = fma(b0, (float8)a2.s2, acc2); +#endif // M0 > 2 +#if M0 > 3 + acc3 = fma(b0, (float8)a3.s2, acc3); +#endif // M0 > 3 + + b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (float8)a0.s3, acc0); +#if M0 > 1 + acc1 = fma(b0, (float8)a1.s3, acc1); +#endif // M0 > 1 +#if M0 > 2 + acc2 = fma(b0, (float8)a2.s3, acc2); +#endif // M0 > 2 +#if M0 > 3 + acc3 = fma(b0, (float8)a3.s3, acc3); +#endif // M0 > 3 + + src_addr.s0 += 4 * sizeof(half); + } + + for(; i < (int)K; ++i) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); +#if M0 > 1 + half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#endif // M0 > 1 +#if M0 > 2 + half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#endif // M0 > 2 +#if M0 > 3 + 
half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); +#endif // M0 > 3 +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if M0 > 1 + half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // M0 > 1 +#if M0 > 2 + half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // M0 > 2 +#if M0 > 3 + half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // M0 > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); + + src_addr += (int2)(sizeof(half), src1_stride_y); + + // Accumulate + acc0 = fma(b0, (float8)a0, acc0); // b0 * (half8)a0; +#if M0 > 1 + acc1 = fma(b0, (float8)a1, acc1); // b0 * (half8)a1; +#endif // M0 > 1 +#if M0 > 2 + acc2 = fma(b0, (float8)a2, acc2); // b0 * (half8)a2; +#endif // M0 > 2 +#if M0 > 3 + acc3 = fma(b0, (float8)a3, acc3); // b0 * (half8)a3; +#endif // M0 > 3 + } + + int z = get_global_id(2); + + // Compute dst address + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * dst_stride_y); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension + // in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing row by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, float, acc, ALPHA);
+#endif // defined(ALPHA)
+
+#if defined(BETA)
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
+
+    LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+    float8 bias_f0 = convert_float8(bias0);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, float, bias_f, BETA);
+#endif // UNIT_BETA
+
+    // acc = acc + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(M0, acc, bias_f0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src2_stride_y) + z * src2_stride_z;
+
+    LOAD_BLOCK(M0, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+    float8 bias_f0 = convert_float8(bias0);
+#if M0 > 1
+    float8 bias_f1 = convert_float8(bias1);
+#endif // M0 > 1
+#if M0 > 2
+    float8 bias_f2 = convert_float8(bias2);
+#endif // M0 > 2
+#if M0 > 3
+    float8 bias_f3 = convert_float8(bias3);
+#endif // M0 > 3
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, float, bias_f, BETA);
+#endif // UNIT_BETA
+
+    // acc = acc + bias
+    ADD_BLOCK(M0, acc, bias_f);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+    half8 acc_h0 = convert_half8(acc0);
+#if M0 > 1
+    half8 acc_h1 = convert_half8(acc1);
+#endif // M0 > 1
+#if M0 > 2
+    half8 acc_h2 = convert_half8(acc2);
+#endif // M0 > 2
+#if M0 > 3
+    half8 acc_h3 = convert_half8(acc3);
+#endif // M0 > 3
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, half, VEC_SIZE, acc_h, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store the output block
+    const bool cond_y = get_global_id(1) == 0;
+    const bool cond_x = ((get_global_id(0) + 1) * 8 >= N);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, 8, half, acc_h, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+}
+
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0.
+ * @note This kernel processes a fixed number of elements along x: -DN0=8.
+ * @note The number of columns of matrix A and the number of columns of the matrix B need to be passed at compile time using -DK and -DN
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note The optional alpha value needs to be passed at compile time using -DALPHA
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the input or output has to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
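+ *
+ * Two illustrative sketches (the example values are assumptions added here for clarity, not mandated by the original documentation):
+ *
+ *     // Partial stores: with M = 10 and M0 = 4, PARTIAL_STORE_M0 = 10 % 4 = 2. The row
+ *     // block with get_global_id(1) == 0 stores only the 2 leftover rows, while the
+ *     // following blocks shift their start row through COMPUTE_M0_START_ROW() so that
+ *     // every other store is a full M0-row block and nothing is written out of bounds.
+ *
+ *     // Activation: for an activation function that clamps between B_VAL and A_VAL,
+ *     // one might pass -DACTIVATION_TYPE=LU_BOUNDED_RELU -DA_VAL=6.0f -DB_VAL=0.0f
+ *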
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
+                                                 IMAGE_DECLARATION(src1),
+#if defined(BETA)
+                                                 IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+                                                 IMAGE_DECLARATION(dst),
+                                                 uint src0_stride_z,
+                                                 uint src1_stride_z,
+#if defined(BETA)
+                                                 uint src2_stride_z,
+#endif //defined(BETA)
+                                                 uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                                 ,
+                                                 uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                 ,
+                                                 uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                 )
+{
+    int idx = get_global_id(0) * N0;
+
+    // Compute starting address for matrix A and matrix B
+    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+    // Update address for the matrix A
+    src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y;
+
+    // Update address for the matrix B
+    src_addr.s1 += idx * sizeof(half);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zin) is calculated by dividing the row by HEIGHT_GEMM3D
+    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
+    zin = min(DEPTH_GEMM3D - 1, zin);
+
+    // Add offset due to the cross plane paddings
+    zin *= (src_cross_plane_pad * src0_stride_y);
+
+    // Add offset for batched GEMM.
The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + half8 acc0 = 0.0h; +#if M0 > 1 + half8 acc1 = 0.0h; +#endif // M0 > 1 +#if M0 > 2 + half8 acc2 = 0.0h; +#endif // M0 > 2 +#if M0 > 3 + half8 acc3 = 0.0h; +#endif // M0 > 3 + + int i = 0; + for(; i <= ((int)K - 4); i += 4) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + LOAD_BLOCK(M0, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s); +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if M0 > 1 + half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // M0 > 1 +#if M0 > 2 + half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // M0 > 2 +#if M0 > 3 + half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // M0 > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Accumulate + acc0 = fma(b0, (half8)a0.s0, acc0); +#if M0 > 1 + acc1 = fma(b0, (half8)a1.s0, acc1); +#endif // M0 > 1 +#if M0 > 2 + acc2 = fma(b0, (half8)a2.s0, acc2); +#endif // M0 > 2 +#if M0 > 3 + acc3 = fma(b0, (half8)a3.s0, acc3); +#endif // M0 > 3 + + b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (half8)a0.s1, acc0); +#if M0 > 1 + acc1 = fma(b0, (half8)a1.s1, acc1); +#endif // M0 > 1 +#if M0 > 2 + acc2 = fma(b0, (half8)a2.s1, acc2); +#endif // M0 > 2 +#if M0 > 3 + acc3 = fma(b0, (half8)a3.s1, acc3); +#endif // M0 > 3 + + b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (half8)a0.s2, acc0); +#if M0 > 1 + acc1 = fma(b0, (half8)a1.s2, acc1); +#endif // M0 > 1 +#if M0 > 2 + acc2 = fma(b0, (half8)a2.s2, acc2); +#endif // M0 > 2 +#if M0 > 3 + acc3 = fma(b0, (half8)a3.s2, acc3); +#endif // M0 > 3 + + b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (half8)a0.s3, acc0); +#if M0 > 1 + acc1 = fma(b0, (half8)a1.s3, acc1); +#endif // M0 > 1 +#if M0 > 2 + acc2 = fma(b0, (half8)a2.s3, acc2); +#endif // M0 > 2 +#if M0 > 3 + acc3 = fma(b0, (half8)a3.s3, acc3); +#endif // M0 > 3 + + src_addr.s0 += 4 * sizeof(half); + } + + for(; i < (int)K; ++i) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); +#if M0 > 1 + half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#endif // M0 > 1 +#if M0 > 2 + half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#endif // M0 > 2 +#if M0 > 3 + half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); 
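+        // Illustrative note (the values are assumed for the example): zin.sN holds the extra
+        // byte offset that row N accumulates from cross-plane padding. E.g. with
+        // HEIGHT_GEMM3D = 4, DEPTH_GEMM3D = 2 and a start row of 2, rows 2..5 map to planes
+        // (2,3,4,5)/4 = (0,0,1,1), so zin = (0,0,1,1) * src_cross_plane_pad * src0_stride_y
+        // skips one pad region for the rows that fall in the second plane.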
+#endif // M0 > 3 +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if M0 > 1 + half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // M0 > 1 +#if M0 > 2 + half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // M0 > 2 +#if M0 > 3 + half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // M0 > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + + src_addr += (int2)(sizeof(half), src1_stride_y); + + // Accumulate + acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0; +#if M0 > 1 + acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1; +#endif // M0 > 1 +#if M0 > 2 + acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2; +#endif // M0 > 2 +#if M0 > 3 + acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3; +#endif // M0 > 3 + } + + int z = get_global_id(2); + + // Compute dst address + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * dst_stride_y); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension + // in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing row by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, half, acc, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); + + LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, half, bias, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias[broadcasted] + ADD_BLOCK_BROADCAST(M0, acc, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, + PARTIAL_STORE_M0) + * src2_stride_y) + + z * src2_stride_z; + + LOAD_BLOCK(M0, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, half, bias, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias + ADD_BLOCK(M0, acc, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, half, VEC_SIZE, acc, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store the output block + const bool cond_y = get_global_id(1) == 0; + const bool cond_x = ((get_global_id(0) + 1) * 8 >= N); + STORE_BLOCK_BOUNDARY_AWARE(M0, 8, half, acc, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); +} +#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) + +#endif // defined(N) && defined(K) && defined(M0) && defined(N0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl index b4ac00535e..50dda7ef3c 100644 --- a/src/core/CL/cl_kernels/gemmlowp.cl +++ b/src/core/CL/cl_kernels/gemmlowp.cl @@ -290,7 +290,7 @@ (VECTOR_ACC_TYPE, k0, a, b, c); \ }) -#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N) +#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) /** This OpenCL kernel computes the matrix multiplication between 2 matrices with QASYMM/QASYMM_SIGNED data type. * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed @@ -433,7 +433,7 @@ __kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_OUTPUT_AS_3D) // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zout, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we // multiply dst_stride_z by DEPTH_GEMM3D @@ -447,7 +447,12 @@ __kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), #endif // defined(REINTERPRET_OUTPUT_AS_3D) // Convert and store output block - CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout); + const bool cond_y = ((get_global_id(1) + 1) * M0 >= M); + const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); + + // Store output block + REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(int, N0), c, c_lp); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); #undef LHS_BLOCK_SIZE #undef LHS_OFFSET_X @@ -456,9 +461,9 @@ __kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), #undef RHS_OFFSET_X #undef RHS_STEP_X } -#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) +#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) -#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K) +#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) /** This OpenCL kernel computes the matrix multiplication between 2 matrices. * The LHS matrix is NOT reshaped @@ -550,7 +555,7 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), #endif // defined(DUMMY_WORK_ITEMS) // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y; + uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; // Compute RHS matrix address uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y; @@ -567,7 +572,7 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_INPUT_AS_3D) // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply lhs_stride_z by DEPTH_GEMM3D @@ -583,7 +588,8 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), // Initialize the accumulators REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... 
c(N0-1)=0; - for(int i = 0; i < K; i += K0) + int i = 0; + for(; i <= (K - K0); i += K0) { // Load values from LHS matrix LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); @@ -597,14 +603,26 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), lhs_offset += K0; rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP; } + // Left-over accumulations + for(; i < K; ++i) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS reshaped matrix + LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs); - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y); + ARM_MM_K0XN0XM0(M0, N0, 1, a, b, c); + lhs_offset += 1; + rhs_offset += 1; + } + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; #if defined(REINTERPRET_OUTPUT_AS_3D) // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply dst_stride_z by DEPTH_GEMM3D @@ -618,7 +636,12 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), #endif // defined(REINTERPRET_OUTPUT_AS_3D) // Convert and store output block - CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout); + const bool cond_y = y == 0; + const bool cond_x = ((x + 1) * N0 >= N); + + // Store output block + REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(int, N0), c, c_lp); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); #undef RHS_BLOCK_SIZE #undef RHS_OFFSET_X @@ -764,7 +787,7 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint(IMAG #endif // defined(DUMMY_WORK_ITEMS) // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y; + uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; // Compute RHS matrix address uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y; @@ -781,7 +804,7 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint(IMAG #if defined(REINTERPRET_INPUT_AS_3D) // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); // Add offset for batched GEMM. 
@@ -764,7 +787,7 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint(IMAG #endif // defined(DUMMY_WORK_ITEMS) // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y; + uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; // Compute RHS matrix address uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y; @@ -781,7 +804,7 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint(IMAG #if defined(REINTERPRET_INPUT_AS_3D) // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply lhs_stride_z by DEPTH_GEMM3D @@ -797,7 +820,8 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint(IMAG // Initialize the accumulators REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(N0-1)=0; - for(int i = 0; i < K; i += K0) + int i = 0; + for(; i <= (K - K0); i += K0) { // Load values from LHS matrix LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); @@ -811,15 +835,27 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint(IMAG lhs_offset += K0; rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP; } + // Left-over accumulations + for(; i < K; ++i) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS reshaped matrix + LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs); + ARM_MM_K0XN0XM0(M0, N0, 1, a, b, c); + lhs_offset += 1; + rhs_offset += 1; + } // Result of MM is of type DATA_TYPE - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(DATA_TYPE) + (y * (uint)M0 * dst_stride_y); + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; #if defined(REINTERPRET_OUTPUT_AS_3D) // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply dst_stride_z by DEPTH_GEMM3D @@ -857,7 +893,7 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint(IMAG // Note: The sum_row tensor is generated through CLGEMMLowpMatrixAReductionKernel which // does not introduce paddings.
For this reason is safe to access the tensor in this manner // without considering that the coordinate "y" could come from an input 3D tensor - __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + (y * (uint)M0) * sizeof(int) + z * sum_row_stride_y; + __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + (COMPUTE_M0_START_ROW(y, (uint)M0, PARTIAL_STORE_M0)) * sizeof(int) + z * sum_row_stride_y; LOAD_SCALAR_AS_VECTOR(M0, N0, int, b_offset_s32_, sum_row_addr, 0, sum_row_stride_x); @@ -906,17 +942,22 @@ __kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint(IMAG REPEAT_MIN_CONST_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, MAX_BOUND); #endif // defined(MAX_BOUND) - // Convert and store output block (does convert saturate) - CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c_int, dst_addr, dst_stride_y, zout); + // Convert and store output block + const bool cond_y = y == 0; + const bool cond_x = ((x + 1) * N0 >= N); + + // Store output block + REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c_int, c_lp); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); #undef RHS_BLOCK_SIZE #undef RHS_OFFSET_X #undef RHS_STEP_X } #endif // defined(RESULT_OFFSET) && defined(RESULT_SHIFT) && defined(RESULT_MULTIPLIER) -#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(K) +#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) -#if defined(M0) && defined(N0) && defined(K0) && defined(K) +#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) /** This OpenCL kernel computes the matrix multiplication between 2 matrices. * The LHS matrix is NOT reshaped @@ -992,10 +1033,10 @@ __kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs), #endif // defined(DUMMY_WORK_ITEMS) // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y; + uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; // Compute RHS matrix address - uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0; + uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE); #if defined(MATRIX_B_DEPTH) // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 @@ -1009,7 +1050,7 @@ __kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs), #if defined(REINTERPRET_INPUT_AS_3D) // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we // multiply lhs_stride_z by DEPTH_GEMM3D @@ -1074,13 +1115,13 @@ __kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs), rhs_offset += rhs_stride_y; } - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y); + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; #if defined(REINTERPRET_OUTPUT_AS_3D) // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we // multiply dst_stride_z by DEPTH_GEMM3D @@ -1092,11 +1133,14 @@ __kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs), dst_addr += z * dst_stride_z; #endif // defined(REINTERPRET_OUTPUT_AS_3D) + const bool cond_y = y == 0; + const bool cond_x = ((x + 1) * N0 >= N); // Convert and store output block - CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout); + REPEAT_VAR_INIT_CONVERT(M0, VEC_DATA_TYPE(int, N0), c, res); // resN = CONVERT(cN, VEC_DATA_TYPE(int, N0)); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, res, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); } -#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) +#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) #if defined(COLS_A) /** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. @@ -1236,7 +1280,7 @@ __kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src), #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) #endif // defined(COLS_A) -#if defined(COLS_B) && defined(ROWS_B) +#if defined(COLS_B) && defined(ROWS_B) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) /** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B. * It is also possible to multiply each reduced column by a scalar value, if SCALAR is passed at compile time. * @@ -1247,6 +1291,8 @@ __kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src), * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar) * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint) * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (i.e. -DSCALAR=3) + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * * @param[in] src_ptr Pointer to the source tensor. 
Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -1267,29 +1313,30 @@ __kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst)) { // Compute source and destination addresses - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); + const uint y = get_global_id(1); - VEC_DATA_TYPE(ACC_DATA_TYPE, 16) - sum_col_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 16))0; + __global const DATA_TYPE *matrix_b = (__global const DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + y * src_step_y + y * src_stride_z); + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(int) + y * dst_stride_y; - __global const DATA_TYPE *matrix_b = (__global const DATA_TYPE *)(src.ptr + get_global_id(1) * src_stride_z); + VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) + sum_col_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))0; int i = 0; // This for loop performs 4 accumulations for(; i <= ((int)ROWS_B - 4); i += 4) { - const VEC_DATA_TYPE(DATA_TYPE, 16) - b0 = vload16(0, matrix_b + 0 * src_stride_y); - const VEC_DATA_TYPE(DATA_TYPE, 16) - b1 = vload16(0, matrix_b + 1 * src_stride_y); - const VEC_DATA_TYPE(DATA_TYPE, 16) - b2 = vload16(0, matrix_b + 2 * src_stride_y); - const VEC_DATA_TYPE(DATA_TYPE, 16) - b3 = vload16(0, matrix_b + 3 * src_stride_y); - - sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)) + CONVERT(b1, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)) + CONVERT(b2, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)) + CONVERT(b3, VEC_DATA_TYPE(ACC_DATA_TYPE, - 16)); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + b0 = VLOAD(VEC_SIZE)(0, matrix_b + 0 * src_stride_y); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + b1 = VLOAD(VEC_SIZE)(0, matrix_b + 1 * src_stride_y); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + b2 = VLOAD(VEC_SIZE)(0, matrix_b + 2 * src_stride_y); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + b3 = VLOAD(VEC_SIZE)(0, matrix_b + 3 * src_stride_y); + + sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b1, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b2, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b3, + VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); matrix_b += 4 * src_stride_y; } @@ -1297,25 +1344,29 @@ __kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src), // This for loop performs the leftover accumulations for(; i < (int)ROWS_B; ++i) { - const VEC_DATA_TYPE(DATA_TYPE, 16) - b0 = vload16(0, matrix_b); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + b0 = VLOAD(VEC_SIZE)(0, matrix_b); - sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)); + sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); matrix_b += src_stride_y; } #if defined(SCALAR) - sum_col_32 *= (VEC_DATA_TYPE(ACC_DATA_TYPE, 16))SCALAR; + sum_col_32 *= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))SCALAR; #endif // defined(SCALAR) - VSTORE(16) - (convert_int16(sum_col_32), 0, (__global int *)dst.ptr); + VEC_DATA_TYPE(int, VEC_SIZE) + res0 = CONVERT(sum_col_32, VEC_DATA_TYPE(int, VEC_SIZE)); + + STORE_VECTOR_SELECT(res, int, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } -#endif // defined(COLS_B) && defined(ROWS_B) +#endif // defined(COLS_B) && defined(ROWS_B) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) #endif // defined(DATA_TYPE) && defined(ACC_DATA_TYPE)
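The x_offs expression above is the leftover-handling scheme this patch applies across all of these kernels: when the output width is not a multiple of VEC_SIZE, work-item 0 stores only VEC_SIZE_LEFTOVER elements (as used here, STORE_VECTOR_SELECT falls back to a partial store when its condition is true), and every later work-item is shifted left by (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE so that its full-width store stays in bounds. A worked example with illustrative numbers, width = 20 and VEC_SIZE = 16, so VEC_SIZE_LEFTOVER = 20 % 16 = 4:

    // shift = (16 - 4) % 16 = 12
    // gid 0: x_offs = max(0 * 16 - 12, 0) = 0 -> partial store of 4 elements, covers [0, 3]
    // gid 1: x_offs = max(1 * 16 - 12, 0) = 4 -> full 16-element store,      covers [4, 19]

Every element is written exactly once and no store runs past the tensor's right edge, which is what lets these kernels drop their padding requirement.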
-#if defined(K_OFFSET) +#if defined(K_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) + +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) /* Helper function used to calculate the offset contribution after matrix multiplication. * @@ -1326,8 +1377,10 @@ __kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src), * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1) * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6) * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * - * @param[in] x get_global_id(0) * 4 + * @param[in] x max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * @param[in] y get_global_id(1) * @param[in] z get_global_id(2) * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr @@ -1347,7 +1400,7 @@ __kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src), * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor */ -inline int4 offset_contribution( +inline VEC_INT offset_contribution( int x, int y, int z @@ -1365,8 +1418,8 @@ inline int4 offset_contribution( #endif // defined(ADD_BIAS) ) { - int4 a_offset_s32 = (int4)0; - int4 b_offset_s32 = (int4)0; + VEC_INT a_offset_s32 = (VEC_INT)0; + VEC_INT b_offset_s32 = (VEC_INT)0; int batch_id = z; #if defined(DEPTH_INPUT3D) @@ -1379,12 +1432,12 @@ inline int4 offset_contribution( // Compute the offset contribution due to A_OFFSET #if defined(SUM_COL_HAS_BATCHES) - a_offset_s32 = vload4(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y)); + a_offset_s32 = VLOAD(VEC_SIZE)(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y)); #else // defined(SUM_COL_HAS_BATCHES) - a_offset_s32 = vload4(0, (__global int *)sum_col_addr); + a_offset_s32 = VLOAD(VEC_SIZE)(0, (__global int *)sum_col_addr); #endif // defined(SUM_COL_HAS_BATCHES) - a_offset_s32 *= (int4)A_OFFSET; + a_offset_s32 *= (VEC_INT)A_OFFSET; #endif // defined(A_OFFSET) #if defined(B_OFFSET) @@ -1393,22 +1446,22 @@ inline int4 offset_contribution( // Compute the offset contribution due to B_OFFSET #if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D) - b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D); + b_offset_s32 = (VEC_INT) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D); #else // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D) - b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y))); + b_offset_s32 = (VEC_INT) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y))); #endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
- b_offset_s32 *= (int4)B_OFFSET; + b_offset_s32 *= (VEC_INT)B_OFFSET; #endif // defined(B_OFFSET) #if defined(ADD_BIAS) // Add bias __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); - int4 biases_values = vload4(0, (__global int *)bias_addr); - b_offset_s32 += (int4)biases_values; + VEC_INT biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr); + b_offset_s32 += (VEC_INT)biases_values; #endif // defined(ADD_BIAS) - return (int4)K_OFFSET + a_offset_s32 + b_offset_s32; + return (VEC_INT)K_OFFSET + a_offset_s32 + b_offset_s32; } /* OpenCL kernel used to add the offset contribution after matrix multiplication. The computation is performed in-place * @@ -1420,6 +1473,8 @@ inline int4 offset_contribution( * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1) * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6) * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * * The final result is: * @@ -1468,49 +1523,49 @@ __kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result) #endif // defined(ADD_BIAS)) ) { - const int x = get_global_id(0) * 4; + const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); const int y = get_global_id(1); const int z = get_global_id(2); // Compute offset contribution - int4 offset_term_s32 = offset_contribution( - x, y, z + VEC_INT offset_term_s32 = offset_contribution( + x, y, z #if defined(A_OFFSET) - , - sum_col_ptr, - sum_col_stride_x, - sum_col_step_x, - sum_col_stride_y, - sum_col_step_y, - sum_col_offset_first_element_in_bytes + , + sum_col_ptr, + sum_col_stride_x, + sum_col_step_x, + sum_col_stride_y, + sum_col_step_y, + sum_col_offset_first_element_in_bytes #endif // defined(A_OFFSET) #if defined(B_OFFSET) - , - sum_row_ptr, - sum_row_stride_x, - sum_row_step_x, - sum_row_stride_y, - sum_row_step_y, - sum_row_offset_first_element_in_bytes + , + sum_row_ptr, + sum_row_stride_x, + sum_row_step_x, + sum_row_stride_y, + sum_row_step_y, + sum_row_offset_first_element_in_bytes #endif // defined(B_OFFSET) #if defined(ADD_BIAS) - , - biases_ptr, - biases_stride_x, - biases_step_x, - biases_offset_first_element_in_bytes + , + biases_ptr, + biases_stride_x, + biases_step_x, + biases_offset_first_element_in_bytes #endif // defined(ADD_BIAS) - ); + ); __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z; - int4 in_s32 = vload4(0, (__global int *)mm_result_addr); + VEC_INT in_s32_0 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr); // Add the offset terms to GEMM's result - in_s32 += offset_term_s32; + in_s32_0 += offset_term_s32; // Store the result with the offset contribution - vstore4(in_s32, 0, (__global int *)mm_result_addr); + STORE_VECTOR_SELECT(in_s32_, int, mm_result_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) }
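For reference, the value offset_contribution() folds in follows from expanding the quantized product. Writing OA and OB for the (already sign-adjusted) offsets the host passes as A_OFFSET and B_OFFSET, a sketch of the derivation:

    // sum_k (a_ik + OA) * (b_kj + OB)
    //     = sum_k a_ik * b_kj   -> the raw mm_result
    //     + OA * sum_k b_kj     -> A_OFFSET * sum_col[j] (column sums of B)
    //     + OB * sum_k a_ik     -> B_OFFSET * sum_row[i] (row sums of A)
    //     + K * OA * OB         -> K_OFFSET

which is exactly the (VEC_INT)K_OFFSET + a_offset_s32 + b_offset_s32 returned above, now computed VEC_SIZE elements at a time instead of four.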
#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) && defined(OUTPUT_DATA_TYPE) @@ -1548,6 +1603,8 @@ __kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result) * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND. * These values can be used to implement "rectified linear unit" activation functions + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32 * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes) @@ -1611,45 +1668,45 @@ __kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm #endif // defined(PER_CHANNEL_QUANTIZATION) ) { - const int x = get_global_id(0) * 4; + const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); const int y = get_global_id(1); const int z = get_global_id(2); __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; // Compute offset contribution - int4 offset_term_s32 = offset_contribution( - x, y, z + VEC_INT offset_term_s32 = offset_contribution( + x, y, z #if defined(A_OFFSET) - , - sum_col_ptr, - sum_col_stride_x, - sum_col_step_x, - sum_col_stride_y, - sum_col_step_y, - sum_col_offset_first_element_in_bytes + , + sum_col_ptr, + sum_col_stride_x, + sum_col_step_x, + sum_col_stride_y, + sum_col_step_y, + sum_col_offset_first_element_in_bytes #endif // defined(A_OFFSET) #if defined(B_OFFSET) - , - sum_row_ptr, - sum_row_stride_x, - sum_row_step_x, - sum_row_stride_y, - sum_row_step_y, - sum_row_offset_first_element_in_bytes + , + sum_row_ptr, + sum_row_stride_x, + sum_row_step_x, + sum_row_stride_y, + sum_row_step_y, + sum_row_offset_first_element_in_bytes #endif // defined(B_OFFSET) #if defined(ADD_BIAS) - , - biases_ptr, - biases_stride_x, - biases_step_x, - biases_offset_first_element_in_bytes + , + biases_ptr, + biases_stride_x, + biases_step_x, + biases_offset_first_element_in_bytes #endif // defined(ADD_BIAS) - ); + ); __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z; - int4 in_s32 = vload4(0, (__global int *)mm_result_addr); + VEC_INT in_s32 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr); // Add the offset terms to GEMM's result in_s32 += offset_term_s32; @@ -1657,14 +1714,14 @@ __kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm // -------------- OUTPUT STAGE // Add the offset terms to GEMM's result - in_s32 += (int4)RESULT_OFFSET; + in_s32 += (VEC_INT)RESULT_OFFSET; // Multiply by result_mult_int and shift #if defined(PER_CHANNEL_QUANTIZATION) __global uchar *result_multipliers_addr = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int); __global uchar *result_shifts_addr = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int); - int4 result_multipliers_values = vload4(0, (__global int
*)result_multipliers_addr); - int4 result_shifts_values = vload4(0, (__global int *)result_shifts_addr); + VEC_INT result_multipliers_values = VLOAD(VEC_SIZE)(0, (__global int *)result_multipliers_addr); + VEC_INT result_shifts_values = VLOAD(VEC_SIZE)(0, (__global int *)result_shifts_addr); in_s32 *= result_multipliers_values; in_s32 >>= result_shifts_values; @@ -1674,18 +1731,18 @@ __kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm in_s32 >>= RESULT_SHIFT; #endif // defined(PER_CHANNEL_QUANTIZATION) - VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) - res = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)); + VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE) + res0 = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)); #if defined(MIN_BOUND) - res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); + res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND); #endif // defined(MIN_BOUND) #if defined(MAX_BOUND) - res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); + res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND); #endif // defined(MAX_BOUND) // Store the result - vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr); + STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } /* OpenCL kernel used to add the offset contribution after matrix multiplication and it quantizes down to uint8. @@ -1722,6 +1779,8 @@ __kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND. * These values can be used to implement "rectified linear unit" activation functions + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * * @param[in] mm_result_ptr Pointer to the source tensor. 
Supported data type: S32 * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes) @@ -1747,7 +1806,7 @@ __kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes) * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor - * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8 + * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8/QASYMM8_SIGNED * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) @@ -1785,45 +1844,45 @@ __kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DEC #endif // defined(PER_CHANNEL_QUANTIZATION) ) { - const int x = get_global_id(0) * 4; + const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); const int y = get_global_id(1); const int z = get_global_id(2); // Compute offset contribution - int4 offset_term_s32 = offset_contribution( - x, y, z + VEC_INT offset_term_s32 = offset_contribution( + x, y, z #if defined(A_OFFSET) - , - sum_col_ptr, - sum_col_stride_x, - sum_col_step_x, - sum_col_stride_y, - sum_col_step_y, - sum_col_offset_first_element_in_bytes + , + sum_col_ptr, + sum_col_stride_x, + sum_col_step_x, + sum_col_stride_y, + sum_col_step_y, + sum_col_offset_first_element_in_bytes #endif // defined(A_OFFSET) #if defined(B_OFFSET) - , - sum_row_ptr, - sum_row_stride_x, - sum_row_step_x, - sum_row_stride_y, - sum_row_step_y, - sum_row_offset_first_element_in_bytes + , + sum_row_ptr, + sum_row_stride_x, + sum_row_step_x, + sum_row_stride_y, + sum_row_step_y, + sum_row_offset_first_element_in_bytes #endif // defined(B_OFFSET) #if defined(ADD_BIAS) - , - biases_ptr, - biases_stride_x, - biases_step_x, - biases_offset_first_element_in_bytes + , + biases_ptr, + biases_stride_x, + biases_step_x, + biases_offset_first_element_in_bytes #endif // defined(ADD_BIAS) - ); + ); __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z; __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; - int4 in_s32 = vload4(0, (__global int *)mm_result_addr); + VEC_INT in_s32 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr); // Add the offset terms to GEMM's result in_s32 += offset_term_s32; @@ -1834,41 +1893,43 @@ __kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DEC #if defined(PER_CHANNEL_QUANTIZATION) __global uchar *result_multipliers_addr = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int); __global uchar *result_shifts_addr = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int); - int4 result_multipliers_values = vload4(0, (__global int *)result_multipliers_addr); - int4 result_shifts_values = vload4(0, (__global int *)result_shifts_addr); + VEC_INT result_multipliers_values = VLOAD(VEC_SIZE)(0, (__global int *)result_multipliers_addr); + VEC_INT result_shifts_values = 
VLOAD(VEC_SIZE)(0, (__global int *)result_shifts_addr); - int4 in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, 4); - int4 in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, 4); - in_s32 = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0); + VEC_INT in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, VEC_SIZE); + VEC_INT in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, VEC_SIZE); + in_s32 = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0); #else // defined(PER_CHANNEL_QUANTIZATION) #if RESULT_SHIFT < 0 - in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4); + in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE); #else // RESULT_SHIFT >= 0 - in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4); + in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE); #endif // RESULT_SHIFT < 0 #endif // defined(PER_CHANNEL_QUANTIZATION) // Add the offset terms to GEMM's result - in_s32 += (int4)RESULT_OFFSET; + in_s32 += (VEC_INT)RESULT_OFFSET; - VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) - res = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)); + VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE) + res0 = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)); #if defined(MIN_BOUND) - res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); + res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND); #endif // defined(MIN_BOUND) #if defined(MAX_BOUND) - res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); + res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND); #endif // defined(MAX_BOUND) // Store the result - vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr); + STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } #endif // defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) && defined(OUTPUT_DATA_TYPE) -#endif // defined(K_OFFSET) +#undef VEC_INT + +#endif // defined(K_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) #if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) /** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED @@ -1891,6 +1952,7 @@ __kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DEC * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND. * These values can be used to implement "rectified linear unit" activation functions + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * * @param[in] src_ptr Pointer to the source tensor. 
Supported data type: S32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -1920,7 +1982,7 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)) { // Compute source and destination addresses - int x = get_global_id(0) * 4; + int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); int y = get_global_id(1); int z = get_global_id(2); @@ -1928,18 +1990,20 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src), __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; - int4 input_values = vload4(0, (__global int *)src_addr); + VEC_DATA_TYPE(int, VEC_SIZE) + input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr); #if defined(ADD_BIAS) // Add bias __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); - int4 biases_values = vload4(0, (__global int *)bias_addr); - input_values += (int4)biases_values; + VEC_DATA_TYPE(int, VEC_SIZE) + biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr); + input_values += biases_values; #endif // defined(ADD_BIAS) // Add the offset terms to GEMM's result - input_values += (int4)RESULT_OFFSET; + input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET; // Multiply by result_mult_int and shift input_values *= RESULT_MULT_INT; @@ -1950,18 +2014,18 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src), input_values >>= RESULT_SHIFT; #endif // RESULT_SHIFT < 0 - VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) - res = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)); + VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE) + res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)); #if defined(MIN_BOUND) - res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); + res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND); #endif // defined(MIN_BOUND) #if defined(MAX_BOUND) - res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); + res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND); #endif // defined(MAX_BOUND) // Store the result - vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr); + STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } #endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) @@ -1986,6 +2050,8 @@ __kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src), * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND. * These values can be used to implement "rectified linear unit" activation functions + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * * @param[in] src_ptr Pointer to the source tensor. 
Supported data type: S32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -2015,7 +2081,7 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATIO TENSOR3D_DECLARATION(dst)) { // Compute source and destination addresses - int x = get_global_id(0) * 4; + int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); int y = get_global_id(1); int z = get_global_id(2); @@ -2023,38 +2089,40 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATIO __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; - int4 input_values = vload4(0, (__global int *)src_addr); + VEC_DATA_TYPE(int, VEC_SIZE) + input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr); #if defined(ADD_BIAS) // Add bias __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); - int4 biases_values = vload4(0, (__global int *)bias_addr); - input_values += (int4)biases_values; + VEC_DATA_TYPE(int, VEC_SIZE) + biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr); + input_values += biases_values; #endif // defined(ADD_BIAS) // Multiply by result_mult_int and shift #if RESULT_SHIFT < 0 - input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); + input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE); #else // RESULT_SHIFT >= 0 - input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); + input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE); #endif // RESULT_SHIFT < 0 // Add the offset terms to GEMM's result - input_values += (int4)RESULT_OFFSET_AFTER_SHIFT; + input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET_AFTER_SHIFT; - VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) - res = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)); + VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE) + res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)); #if defined(MIN_BOUND) - res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); + res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND); #endif // defined(MIN_BOUND) #if defined(MAX_BOUND) - res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); + res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND); #endif // defined(MAX_BOUND) // Store the result - vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr); + STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } #endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT) @@ -2077,6 +2145,8 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATIO * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND. * These values can be used to implement "rectified linear unit" activation functions + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. 
-DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -2106,42 +2176,45 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DE TENSOR3D_DECLARATION(dst)) { // Compute source and destination addresses - int x = get_global_id(0) * 4; + int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); int y = get_global_id(1); int z = get_global_id(2); __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z; - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * 2 + y * dst_stride_y + z * dst_stride_z; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(short) + y * dst_stride_y + z * dst_stride_z; - int4 input_values = vload4(0, (__global int *)src_addr); + VEC_DATA_TYPE(int, VEC_SIZE) + input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr); #if defined(ADD_BIAS) // Add bias __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); - int4 biases_values = vload4(0, (__global int *)bias_addr); - input_values += (int4)biases_values; + VEC_DATA_TYPE(int, VEC_SIZE) + biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr); + input_values += biases_values; #endif // defined(ADD_BIAS) // Multiply by result_mult_int and shift #if RESULT_SHIFT < 0 - input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); + input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE); #else // RESULT_SHIFT >= 0 - input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); + input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE); #endif // RESULT_SHIFT < 0 - short4 res = convert_short4_sat(input_values); + VEC_DATA_TYPE(short, VEC_SIZE) + res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(short, VEC_SIZE)); #if defined(MIN_BOUND) - res = max(res, (short4)MIN_BOUND); + res0 = max(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MIN_BOUND); #endif // defined(MIN_BOUND) #if defined(MAX_BOUND) - res = min(res, (short4)MAX_BOUND); + res0 = min(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MAX_BOUND); #endif // defined(MAX_BOUND) // Store the result - vstore4(res, 0, (__global short *)dst_addr); + STORE_VECTOR_SELECT(res, short, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } #endif // defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT) @@ -2166,6 +2239,8 @@ __kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DE * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND. * These values can be used to implement "rectified linear unit" activation functions + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. 
-DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -2201,7 +2276,7 @@ __kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src #endif // defined(DST_HEIGHT) { // Compute source and destination addresses - int x = get_global_id(0) * 4; + int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); int y = get_global_id(1); int z = get_global_id(2); @@ -2209,13 +2284,15 @@ __kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; - int4 input_values = vload4(0, (__global int *)src_addr); + VEC_DATA_TYPE(int, VEC_SIZE) + input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr); #if defined(ADD_BIAS) // Add bias __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); - int4 biases_values = vload4(0, (__global int *)bias_addr); - input_values += (int4)biases_values; + VEC_DATA_TYPE(int, VEC_SIZE) + biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr); + input_values += biases_values; #endif // defined(ADD_BIAS) @@ -2223,17 +2300,17 @@ __kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src - float4 input_values_f = convert_float4(input_values); + VEC_DATA_TYPE(float, VEC_SIZE) + input_values_f = CONVERT(input_values, VEC_DATA_TYPE(float, VEC_SIZE)); input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET); - VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) - res = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)); + VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE) + res0 = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)); #if defined(MIN_BOUND) - res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); + res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND); #endif // defined(MIN_BOUND) #if defined(MAX_BOUND) - res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); + res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND); #endif // defined(MAX_BOUND) // Store the result - vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr); + STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } #endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
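Of the output stages touched above, the float path is the easiest to sanity-check by hand: scale the int32 accumulator by REAL_MULTIPLIER, add OUTPUT_OFFSET, round, then saturate to the output type. With illustrative values REAL_MULTIPLIER = 0.25, OUTPUT_OFFSET = 10 and OUTPUT_DATA_TYPE assumed to be uchar (QASYMM8):

    // accumulator   52: round(52 * 0.25f + 10.0f)   = 23   -> stored as 23
    // accumulator 4000: round(4000 * 0.25f + 10.0f) = 1010 -> CONVERT_SAT clamps to 255

The fixed-point stages reach the same kind of result without floats, by multiplying with a Q0.31 multiplier (ASYMM_MULT) and applying a rounding power-of-two shift (ASYMM_ROUNDING_DIVIDE_BY_POW2).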
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h index 7b08233029..372ccd91fb 100644 --- a/src/core/CL/cl_kernels/helpers.h +++ b/src/core/CL/cl_kernels/helpers.h @@ -24,6 +24,8 @@ #ifndef ARM_COMPUTE_HELPER_H #define ARM_COMPUTE_HELPER_H +#include "load_store_utility.h" + #if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) #pragma OPENCL EXTENSION cl_khr_fp16 : enable #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) @@ -170,12 +172,12 @@ * @return The vector filled with offset values * @{ */ -#define V_OFFS1(dt) (dt)(0) -#define V_OFFS2(dt) (dt)(0, 1) -#define V_OFFS3(dt) (dt)(0, 1, 3) -#define V_OFFS4(dt) (dt)(0, 1, 2, 3) -#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7) -#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) +#define V_OFFS1(dt) (dt##1)(0) +#define V_OFFS2(dt) (dt##2)(0, 1) +#define V_OFFS3(dt) (dt##3)(0, 1, 2) +#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) +#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) +#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) /** @} */ // end of group V_OFFSn /** Create a vector filled with offset values corresponding to the location of each element. @@ -273,21 +275,84 @@ #define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size #define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) +#define NO_STORE(data, offs, ptr) \ + { \ + } + // Size == 1 (scalar) +#define vstore_partial_1_0 NO_STORE #define vstore_partial_1_1 vstore1 +#define vstore_partial_1_2 NO_STORE +#define vstore_partial_1_3 NO_STORE +#define vstore_partial_1_4 NO_STORE +#define vstore_partial_1_5 NO_STORE +#define vstore_partial_1_6 NO_STORE +#define vstore_partial_1_7 NO_STORE +#define vstore_partial_1_8 NO_STORE +#define vstore_partial_1_9 NO_STORE +#define vstore_partial_1_10 NO_STORE +#define vstore_partial_1_11 NO_STORE +#define vstore_partial_1_12 NO_STORE +#define vstore_partial_1_13 NO_STORE +#define vstore_partial_1_14 NO_STORE +#define vstore_partial_1_15 NO_STORE +#define vstore_partial_1_16 NO_STORE // Size == 2 +#define vstore_partial_2_0 NO_STORE #define vstore_partial_2_1 vstore_partial_1 #define vstore_partial_2_2 vstore_partial_2 +#define vstore_partial_2_3 NO_STORE +#define vstore_partial_2_4 NO_STORE +#define vstore_partial_2_5 NO_STORE +#define vstore_partial_2_6 NO_STORE +#define vstore_partial_2_7 NO_STORE +#define vstore_partial_2_8 NO_STORE +#define vstore_partial_2_9 NO_STORE +#define vstore_partial_2_10 NO_STORE +#define vstore_partial_2_11 NO_STORE +#define vstore_partial_2_12 NO_STORE +#define vstore_partial_2_13 NO_STORE +#define vstore_partial_2_14 NO_STORE +#define vstore_partial_2_15 NO_STORE +#define vstore_partial_2_16 NO_STORE // Size == 3 +#define vstore_partial_3_0 NO_STORE #define vstore_partial_3_1 vstore_partial_1 #define vstore_partial_3_2 vstore_partial_2 #define vstore_partial_3_3 vstore_partial_3 +#define vstore_partial_3_4 NO_STORE +#define vstore_partial_3_5 NO_STORE +#define vstore_partial_3_6 NO_STORE +#define vstore_partial_3_7 NO_STORE +#define vstore_partial_3_8 NO_STORE +#define vstore_partial_3_9 NO_STORE +#define vstore_partial_3_10 NO_STORE +#define vstore_partial_3_11 NO_STORE +#define vstore_partial_3_12 NO_STORE +#define vstore_partial_3_13 NO_STORE +#define vstore_partial_3_14 NO_STORE +#define vstore_partial_3_15 NO_STORE +#define vstore_partial_3_16 NO_STORE // Size == 4 +#define vstore_partial_4_0 NO_STORE #define vstore_partial_4_1 vstore_partial_1 #define vstore_partial_4_2 vstore_partial_2 #define vstore_partial_4_3 vstore_partial_3 #define vstore_partial_4_4 vstore_partial_4 +#define vstore_partial_4_5 NO_STORE +#define vstore_partial_4_6 NO_STORE +#define vstore_partial_4_7 NO_STORE +#define vstore_partial_4_8 NO_STORE +#define vstore_partial_4_9 NO_STORE +#define vstore_partial_4_10 NO_STORE +#define vstore_partial_4_11 NO_STORE +#define vstore_partial_4_12 NO_STORE +#define vstore_partial_4_13 NO_STORE +#define vstore_partial_4_14 NO_STORE +#define vstore_partial_4_15 NO_STORE +#define vstore_partial_4_16 NO_STORE // Size == 8 +#define vstore_partial_8_0 NO_STORE #define vstore_partial_8_1 vstore_partial_1 #define vstore_partial_8_2 vstore_partial_2 #define vstore_partial_8_3 vstore_partial_3 @@ -296,7 +361,16 @@ #define vstore_partial_8_6 vstore_partial_6 #define vstore_partial_8_7 vstore_partial_7 #define vstore_partial_8_8 vstore_partial_8 +#define
vstore_partial_8_9 NO_STORE +#define vstore_partial_8_10 NO_STORE +#define vstore_partial_8_11 NO_STORE +#define vstore_partial_8_12 NO_STORE +#define vstore_partial_8_13 NO_STORE +#define vstore_partial_8_14 NO_STORE +#define vstore_partial_8_15 NO_STORE +#define vstore_partial_8_16 NO_STORE // Size == 16 +#define vstore_partial_16_0 NO_STORE #define vstore_partial_16_1 vstore_partial_1 #define vstore_partial_16_2 vstore_partial_2 #define vstore_partial_16_3 vstore_partial_3 @@ -376,15 +450,15 @@ #define vstore_partial_13(DATA, OFFSET, PTR) \ vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ - vstore_partial_5(DATA.s89abc, OFFSET, PTR + 8); + vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8); #define vstore_partial_14(DATA, OFFSET, PTR) \ vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ - vstore_partial_6(DATA.s89abcd, OFFSET, PTR + 8); + vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8); #define vstore_partial_15(DATA, OFFSET, PTR) \ vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ - vstore_partial_7(DATA.s89abcde, OFFSET, PTR + 8); + vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8); #define vstore_partial_16(DATA, OFFSET, PTR) \ vstore16(DATA, OFFSET, PTR); @@ -433,9 +507,6 @@ #define VEC_DATA_TYPE_STR(type, size) type##size #define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) -#define CL_VEC_DATA_TYPE_STR(type, size) type##size -#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size) - #define CONVERT_STR(x, type) (convert_##type((x))) #define CONVERT(x, type) CONVERT_STR(x, type) @@ -445,6 +516,41 @@ #define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) #define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) +#define select_vec_dt_uchar(size) uchar##size +#define select_vec_dt_char(size) char##size +#define select_vec_dt_ushort(size) ushort##size +#define select_vec_dt_short(size) short##size +#define select_vec_dt_half(size) short##size +#define select_vec_dt_uint(size) uint##size +#define select_vec_dt_int(size) int##size +#define select_vec_dt_float(size) int##size +#define select_vec_dt_ulong(size) ulong##size +#define select_vec_dt_long(size) long##size + +#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) +#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) +#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) + +#define sum_reduce_1(x) (x) +#define sum_reduce_2(x) ((x).s0) + ((x).s1) +#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) +#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) +#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) +#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) + +#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) +#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) + +#define max_reduce_1(x) (x) +#define max_reduce_2(x) max(((x).s0), ((x).s1)) +#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) +#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) +#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) +#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) + +#define MAX_REDUCE_STR(x, size) max_reduce_##size(x) +#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) + #define VECTOR_DECLARATION(name) \ __global uchar *name##_ptr, \ uint name##_stride_x, \
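Two details of the helpers.h additions above are worth spelling out. SELECT_VEC_DATA_TYPE exists because OpenCL's vector select() keys off the MSB of a mask whose element width must match the operands, and vector comparisons return signed integer vectors of that width, so half maps to short and float maps to int; the helpers_asymm.h hunk below uses it to make every select() mask type explicit. The new reduction helpers are plain tree reductions over swizzled halves; for example:

    // int4 v = (int4)(1, 2, 3, 4);
    // SUM_REDUCE(v, 4) effectively expands to (v.s0 + v.s1) + (v.s2 + v.s3) = 10
    // MAX_REDUCE(v, 4) effectively expands to max(max(v.s0, v.s1), max(v.s2, v.s3)) = 4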
diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h index 70134af6ee..59c8fa606d 100644 --- a/src/core/CL/cl_kernels/helpers_asymm.h +++ b/src/core/CL/cl_kernels/helpers_asymm.h @@ -123,8 +123,8 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) VEC_DATA_TYPE(int, size) \ mask = (one << exponent) - one; \ VEC_DATA_TYPE(int, size) \ - threshold = (mask >> 1) + select(zero, one, x < 0); \ - return (x >> exponent) + select(zero, one, (x & mask) > threshold); \ + threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \ + return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \ } /** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1), @@ -153,12 +153,12 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) VEC_DATA_TYPE(long, size) \ is_positive_or_zero = ab_64 >= 0; \ VEC_DATA_TYPE(long, size) \ - nudge = select(mask2, mask1, is_positive_or_zero); \ + nudge = select(mask2, mask1, (SELECT_VEC_DATA_TYPE(long, size))(is_positive_or_zero)); \ VEC_DATA_TYPE(long, size) \ mask = 1ll << 31; \ VEC_DATA_TYPE(int, size) \ ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \ - return select(ab_x2_high32, INT_MAX, overflow); \ + return select(ab_x2_high32, INT_MAX, (SELECT_VEC_DATA_TYPE(int, size))(overflow)); \ } /** Calculates \f$ exp(x) \f$ for x in [-1/4, 0). @@ -216,7 +216,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) { \ const VEC_DATA_TYPE(int, size) all_zeros = 0; \ const VEC_DATA_TYPE(int, size) all_ones = ~0; \ - return select(all_zeros, all_ones, a == 0); \ + return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a == 0)); \ } /** For each element of input vector, the corresponding bits of the result item are set @@ -231,7 +231,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) { \ const VEC_DATA_TYPE(int, size) all_zeros = 0; \ const VEC_DATA_TYPE(int, size) all_ones = ~0; \ - return select(all_zeros, all_ones, a != 0); \ + return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a != 0)); \ } #define EXP_BARREL_SHIFTER_IMPL(size) \ @@ -338,7 +338,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) const VEC_DATA_TYPE(long, size) one = 1; \ const VEC_DATA_TYPE(long, size) minus_one = -1; \ VEC_DATA_TYPE(long, size) \ - sign = select(minus_one, one, sum >= 0); \ + sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0)); \ return convert_int##size((sum + sign) / 2); \ } @@ -390,8 +390,10 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) #define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale) #define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size) -#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent) -#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b) +#define ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent) +#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) +#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b) +#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size) #define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \ ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)),
quantized_multiplier, size) #define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \ @@ -401,11 +403,14 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) #define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a) #define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a) #define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder) -#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits) -#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a) +#define ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits) +#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) +#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a) +#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) #define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent) #define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b) -#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits) +#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits) +#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) #define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ inline VEC_DATA_TYPE(int, size) multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ @@ -446,73 +451,91 @@ DEQUANTIZE_IMPL(int, 16) ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1) ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2) +ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(3) ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4) ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8) ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16) ASYMM_MULT_IMPL(1) ASYMM_MULT_IMPL(2) +ASYMM_MULT_IMPL(3) ASYMM_MULT_IMPL(4) ASYMM_MULT_IMPL(8) ASYMM_MULT_IMPL(16) +ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(1) ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2) +ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(3) ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4) ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8) ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16) ASYMM_SELECT_USING_MASK_IMPL(1) ASYMM_SELECT_USING_MASK_IMPL(2) +ASYMM_SELECT_USING_MASK_IMPL(3) ASYMM_SELECT_USING_MASK_IMPL(4) ASYMM_SELECT_USING_MASK_IMPL(8) ASYMM_SELECT_USING_MASK_IMPL(16) ASYMM_MASK_IF_ZERO_IMPL(1) ASYMM_MASK_IF_ZERO_IMPL(2) +ASYMM_MASK_IF_ZERO_IMPL(3) ASYMM_MASK_IF_ZERO_IMPL(4) ASYMM_MASK_IF_ZERO_IMPL(8) ASYMM_MASK_IF_ZERO_IMPL(16) ASYMM_MASK_IF_NON_ZERO_IMPL(1) ASYMM_MASK_IF_NON_ZERO_IMPL(2) +ASYMM_MASK_IF_NON_ZERO_IMPL(3) ASYMM_MASK_IF_NON_ZERO_IMPL(4) ASYMM_MASK_IF_NON_ZERO_IMPL(8) ASYMM_MASK_IF_NON_ZERO_IMPL(16) +EXP_BARREL_SHIFTER_IMPL(1) EXP_BARREL_SHIFTER_IMPL(2) +EXP_BARREL_SHIFTER_IMPL(3) EXP_BARREL_SHIFTER_IMPL(4) 
EXP_BARREL_SHIFTER_IMPL(8) EXP_BARREL_SHIFTER_IMPL(16) +ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(1) ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2) +ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(3) ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4) ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8) ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16) ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1) ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2) +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(3) ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4) ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8) ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16) +ASYMM_ROUNDING_HALF_SUM_IMPL(1) ASYMM_ROUNDING_HALF_SUM_IMPL(2) +ASYMM_ROUNDING_HALF_SUM_IMPL(3) ASYMM_ROUNDING_HALF_SUM_IMPL(4) ASYMM_ROUNDING_HALF_SUM_IMPL(8) ASYMM_ROUNDING_HALF_SUM_IMPL(16) +ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(1) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2) +ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(3) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16) ASYMM_RESCALE_IMPL(1) ASYMM_RESCALE_IMPL(2) +ASYMM_RESCALE_IMPL(3) ASYMM_RESCALE_IMPL(4) ASYMM_RESCALE_IMPL(8) ASYMM_RESCALE_IMPL(16) MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1) MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(3) MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4) MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8) MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16) diff --git a/src/core/CL/cl_kernels/load_store_utility.h b/src/core/CL/cl_kernels/load_store_utility.h new file mode 100644 index 0000000000..56b1538c6f --- /dev/null +++ b/src/core/CL/cl_kernels/load_store_utility.h @@ -0,0 +1,586 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** Store the 0 to (n-1)th rows of the given variables + * @name STORE_ROW_n + * + * @param[in] N0 The width of the passed in vector. 
Supported: 1, 2, 3, 4, 8, 16 + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME The basename of the variables + * @param[in] PTR The base pointer + * @param[in] STRIDE_Y The stride value in y-axis direction + * @param[in] Z The offset in z-axis direction + * @{ + */ +#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); + +#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); + +#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); + +#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); + +#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); + +#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); + +#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); + +#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); + +#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); + +#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); + +#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); + +#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); + +#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); + +#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); + +#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); + +#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_15(N0, DATA_TYPE, BASENAME, 
PTR, STRIDE_Y, Z) \
+    VSTORE(N0) \
+    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
+/** @} */ // end of group STORE_ROW_n
+
+/** Convert and store the 0th to (n-1)th rows of the given variables
+ * @name CONVERT_STORE_ROW_n
+ *
+ * @param[in] N0 The size of the vectors
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @{
+ */
+#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0) \
+    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
+
+#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0) \
+    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
+
+#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0) \
+    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
+
+#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0) \
+    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
+
+#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0) \
+    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
+
+#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0) \
+    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
+
+#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0) \
+    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
+
+#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0) \
+    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
+
+#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0) \
+    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
+
+#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0) \
+    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
+
+#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0) \
+    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
+
+#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0) \
+    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
+
+#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0) \
+    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
+
+#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0) \
+    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
+
+#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0) \
+    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
+
+#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0) \
+    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
+
+/** @} */ // end of group CONVERT_STORE_ROW_n
+
+/** Store a block of the given size M0xN0
+ * @name STORE_BLOCK
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
+ * The data to store is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0 The number of rows to store
+ * @param[in] N0 The size of each vector
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @{
+ */
+#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+/** @} */ // end of group STORE_BLOCK
+
+/** Convert and store a block of the given size M0xN0
+ * @name CONVERT_STORE_BLOCK
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
+ * The data to store is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
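+ *
+ * Editor's illustrative sketch (not part of the original patch; assumes c0..c2 are float4
+ * accumulators and zin0..zin2 are precomputed byte offsets):
+ *   CONVERT_STORE_BLOCK(3, 4, uchar, c, dst_addr, dst_stride_y, zin);
+ * expands to CONVERT_STORE_ROW_3(4, uchar, c, ...), i.e. one CONVERT_SAT plus one vstore4
+ * per row, saturating each float4 value to uchar4 before storing it.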
+ *
+ * @param[in] M0 The number of rows to store
+ * @param[in] N0 The size of each vector
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @{
+ */
+#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+/** @} */ // end of group CONVERT_STORE_BLOCK
+
+/** Partially store the 0 to (n-1)th rows of the given variables
+ * @name STORE_ROW_PARTIAL_n
+ * Within each row, store the lower @p STORE_N0 elements of vectors of width @p N0
+ *
+ * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
+ *
+ * @param[in] N0 The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
+ * @param[in] STORE_N0 The **lower** size of the vectors to store. Supported: 1-16 and <= @p N0
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @{
+ */
+#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE_PARTIAL(N0, STORE_N0) \
+    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
+
+#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE_PARTIAL(N0, STORE_N0) \
+    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
+
+#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE_PARTIAL(N0, STORE_N0) \
+    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
+
+#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE_PARTIAL(N0, STORE_N0) \
+    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
+
+#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE_PARTIAL(N0, STORE_N0) \
+    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
+
+#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE_PARTIAL(N0, STORE_N0) \
+    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
+
+#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE_PARTIAL(N0, STORE_N0) \
+    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
+
+#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE_PARTIAL(N0, STORE_N0) \
+    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
+
+#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE_PARTIAL(N0, STORE_N0) \
+    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
+
+#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE_PARTIAL(N0, STORE_N0) \
+    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
+
+#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE_PARTIAL(N0, STORE_N0) \
+    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
+
+#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE_PARTIAL(N0, STORE_N0) \
+    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
+
+#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE_PARTIAL(N0, STORE_N0) \
+    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
+
+#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE_PARTIAL(N0, STORE_N0) \
+    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
+
+#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE_PARTIAL(N0, STORE_N0) \
+    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
+
+#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE_PARTIAL(N0, STORE_N0) \
+    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
+/** @} */ // end of group STORE_ROW_PARTIAL_n
+
+/** Partially store a block of the given size STORE_M0xSTORE_N0
+ * @name STORE_BLOCK_PARTIAL
+ *
+ * @note The vector width @p N0 is also required for correct partial storing behaviour.
+ * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
+ *
+ * The data to store is expected to have consecutive names for each row.
+ * E.g., for STORE_M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for STORE_M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] STORE_M0 The number of rows to store. Supported: 1-16
+ * @param[in] STORE_N0 The lower number of elements of vectors to store. Supported: 1-16 and <= @p N0
+ * @param[in] N0 The size of each vector.
Supported: 1, 2, 3, 4, 8, 16 + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME The basename of the variables + * @param[in] PTR The base pointer + * @param[in] STRIDE_Y The stride value in y-axis direction + * @param[in] Z The offset in z-axis direction + * @{ + */ +#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +/** Store a block that can be partial in both x and y dimensions + * + * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. + * + * The data to store is expected to have consecutive names for each row. + * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. + * The Z offset is expected to have consecutive names. + * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. + * + * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16 + * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16 + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME The basename of the variables + * @param[in] PTR The base pointer + * @param[in] STRIDE_Y The stride value in y-axis direction + * @param[in] Z The offset in z-axis direction + * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0) + * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0) + * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. + * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0. + */ +#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ + if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ + { \ + STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ + } \ + else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ + { \ + STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ + } \ + else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ + { \ + STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ + } \ + else \ + { \ + STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ + } +/** Store a block that can only be partial in x but not y. + * + * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty. + * + * The data to store is expected to have consecutive names for each row. + * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. + * The Z offset is expected to have consecutive names. + * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. + * + * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16 + * @param[in] N0 The size of each vector, for non-partial blocks. 
Supported: 1, 2, 3, 4, 8, 16
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
+ * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
+ */
+#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
+    if(!(PARTIAL_COND_X)) \
+    { \
+        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+    } \
+    else \
+    { \
+        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+    }
+/** Store a block that can only be partial in y but not x.
+ *
+ * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
+ *
+ * The data to store is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16
+ * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
+ * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
+ */
+#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
+    if(!(PARTIAL_COND_Y)) \
+    { \
+        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+    } \
+    else \
+    { \
+        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+    }
+/** @} */ // end of group STORE_BLOCK_PARTIAL
+
+#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
+
+/** Boundary-aware GEMM block store
+ * @name STORE_BLOCK_BOUNDARY_AWARE
+ * This macro assumes the following schemes to achieve boundary-awareness:
+ * - Overlapping load in Y axis from lhs tensor. This implies lhs has no padding along y dim.
+ * - Non-overlapping (normal) load from rhs tensor. This implies rhs can have paddings.
+ * - Overlapping load in Y axis from bias tensor. This implies bias has no padding along y dim.
+ * The macro then ensures that the dst tensor can be stored without any paddings in both x and y dim.
+ *
+ * In the y dimension, we place the partial blocks **at the beginning** while in the x dimension, we place the partial
+ * blocks **at the end**.
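+ *
+ * (Editor's note, not part of the original patch: placing the y-partial block first is what allows
+ * COMPUTE_M0_START_ROW further below to shift all subsequent blocks back so that they overlap with it,
+ * keeping every store within bounds without requiring padding rows in dst.)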
+ * Say the dst tensor is of shape MxN and we have M0 and N0 as the block size; this is how we define "partial blocks"/
+ * "boundary block" (we use the 2 terms "partial blocks" and "boundary blocks" interchangeably) and their various parameters:
+ *
+ *  *--x-->                        x == 0                          x == 1
+ *  |                 |<------------------------------N-------------------------->|
+ *  y                 |<--------------N0------------->|<----PARTIAL_STORE_N0----->|
+ *  |    -------------#############################################################
+ *  *    |          | |...............................|...........................|
+ * y == 0| PAR_..._M0 |......Boundary block in y......|.Boundary block in x and y.|
+ *       |          | |...............................|...........................|
+ *       M          --#############################################################
+ *       |          | |                               |...........................|
+ * y == 1|         M0 |      Non-boundary block       |....Boundary block in x....|
+ *       |          | |                               |...........................|
+ *       |------------#############################################################
+ *
+ * Then @p PARTIAL_STORE_M0 = M % M0 and @p PARTIAL_STORE_N0 = N % N0
+ *
+ * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
+ *
+ * It automatically detects if a given M,N,M0,N0 combination can yield partial blocks in either the X or Y dimension,
+ * and selects the corresponding store methods such that the boundary detection logic is only added when needed.
+ *
+ * The data to store is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16
+ * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
+ * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported: [0, @p N0)
+ * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
+ * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
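+ *
+ * Editor's illustrative example (not part of the original patch): for M=7, N=10, M0=4, N0=4,
+ * PARTIAL_STORE_M0 = 7 % 4 = 3 (partial block first in y) and PARTIAL_STORE_N0 = 10 % 4 = 2
+ * (partial block last in x). A caller would then typically compute the conditions as
+ *   const bool cond_y = get_global_id(1) == 0;            // first block in y is the partial one
+ *   const bool cond_x = (get_global_id(0) + 1) * N0 >= N; // last block in x is the partial one
+ * before invoking STORE_BLOCK_BOUNDARY_AWARE (the id-to-dimension mapping is an assumption).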
+ * @{
+ */
+#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
+// Case1: No partial blocks in either x or y
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+
+#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
+// Case2: Partial blocks in y
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
+
+#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
+// Case3: Partial blocks in x
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
+
+#else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
+// Case4: Partial blocks in both x and y
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
+
+#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
+
+#endif // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
+/** @} */ // end of group STORE_BLOCK_BOUNDARY_AWARE
+
+#if defined(PARTIAL_STORE_M0)
+/** Compute the start m0 row (LHS, BIAS and DST) in a boundary-aware way so as to avoid padding
+ * @name COMPUTE_M0_START_ROW
+ * If there are any partial blocks in the y dimension, they are placed at the beginning of the rows.
+ * The start row of every subsequent block is then shifted back by (M0 - PARTIAL_STORE_M0) so that it overlaps
+ * with the partial block at the beginning, avoiding any padding in the y dimension.
+ * EG: M0=4, PARTIAL_STORE_M0=1:
+ *                  | Non-overlapping | +M0_ROW_SHIFT (Overlapping)
+ * block 0 (partial)| start row = 0   | start row = 0
+ * block 1 (full)   | start row = 4   | start row = 1
+ * block 2 (full)   | start row = 8   | start row = 5
+ *
+ * @param[in] y Global id of current block in y.
+ * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16
+ * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
+ * @{
+ */
+#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
+    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
+#else // defined(PARTIAL_STORE_M0)
+#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
+    ((uint)(y * M0))
+#endif // defined(PARTIAL_STORE_M0)
+/** @} */ // end of group COMPUTE_M0_START_ROW
+
+/** Store a vector that can only be partial in x.
+ *
+ * @note in case @p vec_size or @p leftover != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
+ *
+ * The data to store is expected to end in a 0.
+ * E.g., for basename=c, the expected name is c0.
+ *
+ * @param[in] basename The name of the variable without trailing 0
+ * @param[in] data_type The data type of the vector
+ * @param[in] ptr The base pointer
+ * @param[in] vec_size The vector size if cond = false. Supported: 1, 2, 3, 4, 8, 16
+ * @param[in] leftover The vector size if cond = true. Supported range: [1, @p vec_size)
+ * @param[in] cond Condition to select either vec_size or leftover
+ * @{
+ */
+#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
+    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
+/** @} */ // end of group STORE_VECTOR_SELECT
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/pad_layer.cl b/src/core/CL/cl_kernels/pad_layer.cl
index 4e4d2ad9e7..fe71b5d119 100644
--- a/src/core/CL/cl_kernels/pad_layer.cl
+++ b/src/core/CL/cl_kernels/pad_layer.cl
@@ -23,12 +23,12 @@
  */
 #include "helpers.h"
-#if defined(DATA_TYPE) && defined(SELECT_DT) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && defined(SRC_WIDTH)
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && defined(SRC_WIDTH)
 #define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
 #define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
-#define VEC_SELECT VEC_DATA_TYPE(SELECT_DT, VEC_SIZE)
-#define OFFSETS VEC_OFFS(VEC_SELECT, VEC_SIZE)
+#define VEC_SELECT SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define OFFSETS VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VEC_SIZE)
 #if defined(CONST_VAL)
 /** Perform a pad operation when PaddingMode is CONSTANT
@@ -38,7 +38,6 @@
  * @note Constant value used to fill the pads must be passed using the -DCONST_VAL compile flag, e.g. -DCONST_VAL=1.27
  * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g. -DPAD_X_BEFORE=5
  * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g. -DSRC_WIDTH=224
- * @note Data type to use for the select instruction must be passed using the -DSELECT_DT compile flag, e.g. -DSELECT_DT=float
 * @note In case pad left is more than the vector size, the number of threads to skip along the X axis must be passed using the
 *       -DNUM_THREADS_TO_SKIP_X compile flag, e.g. -DNUM_THREADS_TO_SKIP_X=1. This is defined as (PAD_X_BEFORE / VEC_SIZE)
 * @note If pad also needs to be added to the top of the tensor, the following compile flags must be passed at compile time:
@@ -149,7 +148,6 @@ __kernel void pad_layer_constant(TENSOR3D_DECLARATION(src),
  * @note Constant value must be passed using the -DCONST_VAL compile flag, e.g. -DCONST_VAL=1.27
  * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g. -DPAD_X_BEFORE=5
  * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g. -DSRC_WIDTH=224
- * @note Data type to use for the select instruction must be passed using the -DSELECT_DT compile flag, e.g. -DSELECT_DT=float
  * @note Number of values to the left when operating across left padding must be passed using the -DPAD_X_BEFORE_REMAINDER compile flag, e.g. -DPAD_X_BEFORE_REMAINDER=5
  * @note Number of values to the left when operating across right padding must be passed using the -DPAD_X_AFTER_REMAINDER compile flag, e.g. -DPAD_X_AFTER_REMAINDER=6
  * @note To rearrange the vectors properly, (PAD_X_BEFORE_REMAINDER + 1) must be passed when mode is REFLECT using the -DPAD_X_BEFORE_REMAINDER_REFL compile flag, e.g.
-DPAD_X_BEFORE_REMAINDER=6 @@ -250,4 +248,4 @@ __kernel void pad_layer_symmetric_reflect(TENSOR3D_DECLARATION(src), #endif // SRC_WIDTH == 1 } #endif // defined(PAD_X_BEFORE_REMAINDER) && defined(PAD_X_AFTER_REMAINDER) && defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && defined(AFTER_PAD_FACT_X) -#endif // defined(DATA_TYPE) && defined(SELECT_DT) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && defined(SRC_WIDTH) +#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && defined(SRC_WIDTH) diff --git a/src/core/CL/cl_kernels/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/pixelwise_mul_float.cl index d623226300..4fa1551b54 100644 --- a/src/core/CL/cl_kernels/pixelwise_mul_float.cl +++ b/src/core/CL/cl_kernels/pixelwise_mul_float.cl @@ -97,7 +97,7 @@ __kernel void pixelwise_mul_float( #endif /* DATA_TYPE_FLOAT */ #if defined(ACTIVATION_TYPE) - vstore16(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE_OUT, res, A_VAL, B_VAL), 0, (__global DATA_TYPE_OUT *)out.ptr); + vstore16(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE_OUT, VEC_SIZE, res, A_VAL, B_VAL), 0, (__global DATA_TYPE_OUT *)out.ptr); #else // defined(ACTIVATION_TYPE) // Store result vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr); @@ -150,7 +150,7 @@ __kernel void pixelwise_mul_complex( float2 res = { vin1.x *vin2.x - vin1.y * vin2.y, vin1.x *vin2.y + vin2.x * vin1.y }; #if defined(ACTIVATION_TYPE) - vstore2(ACTIVATION(ACTIVATION_TYPE, float, res, A_VAL, B_VAL), 0, (__global float *)out.ptr); + vstore2(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, res, A_VAL, B_VAL), 0, (__global float *)out.ptr); #else // defined(ACTIVATION_TYPE) // Store result vstore2(res, 0, (__global float *)out.ptr); diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl index 9e6521b300..00250a08a5 100644 --- a/src/core/CL/cl_kernels/pooling_layer.cl +++ b/src/core/CL/cl_kernels/pooling_layer.cl @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "helpers.h" +#include "repeat.h" #if defined(POOL_AVG) || defined(POOL_L2) #define POOL_OP(x, y) ((x) + (y)) @@ -38,8 +39,6 @@ #define DIV_OP(x, y) (x * (1.f / y)) #define SQRT_OP(x) sqrt((x)) -#define DIV_OP_NHWC(x, y) (x * (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(1.f / y)) - #if STRIDE_X == 1 #define POOLING3x3(res, input, output) POOLING3x3_STRIDE1(res, input, output) #elif STRIDE_X == 2 /* STRIDE_X == 1 */ @@ -481,122 +480,6 @@ __kernel void pooling_layer_MxN_nchw( } #endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) -ACC_DATA_TYPE calculate_avg_scale_nhwc(const int pool_size_x, const int pool_size_y, int upper_bound_w, int upper_bound_h, - const int pad_x, const int pad_y, const int stride_x, const int stride_y) -{ - int start_x = get_global_id(1) * stride_x - pad_x; -#if defined(DST_DEPTH) - int start_y = (get_global_id(2) % DST_DEPTH) * stride_y - pad_y; -#else /* defined(DST_DEPTH) */ - int start_y = get_global_id(2) * stride_y - pad_y; -#endif /* defined(DST_DEPTH) */ - -#if !defined(EXCLUDE_PADDING) - upper_bound_w += pad_x; - upper_bound_h += pad_y; -#endif /* defined(EXCLUDE_PADDING) */ - const int end_x = min(start_x + pool_size_x, upper_bound_w); - const int end_y = min(start_y + pool_size_y, upper_bound_h); -#if defined(EXCLUDE_PADDING) - start_x = max(0, start_x); - start_y = max(0, start_y); -#endif /* defined(EXCLUDE_PADDING) */ - return ((end_y - start_y) * (end_x - start_x)); -} - -/** Performs a pooling function of pool size equal to N (NHWC) - * - * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. 
Supported data types are F16/F32 - * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; - * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT - * @note Strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions - * @note Pad values must be passed at compile time using -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension - * @note In case of average pooling the following information must be passed at compile time: - * -DPOOL_AVG must be provided otherwise max pooling will be performed. - * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void pooling_layer_MxN_nhwc( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) -{ - // Get pixels pointer -#if defined(DST_DEPTH) - Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DST_DEPTH); - Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH); -#else /* defined(DST_DEPTH) */ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); -#endif /* defined(DST_DEPTH) */ - - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) - vdata = INITIAL_VALUE; - - const int idx_width = get_global_id(1) * STRIDE_X; -#if defined(DST_DEPTH) - const int idx_height = (get_global_id(2) % DST_DEPTH) * STRIDE_Y; -#else /* defined(DST_DEPTH) */ - const int idx_height = get_global_id(2) * STRIDE_Y; -#endif /* defined(DST_DEPTH) */ - - for(int y = 0; y < POOL_SIZE_Y; ++y) - { - int y1 = select(y, PAD_Y - idx_height, y + idx_height - PAD_Y < 0 || y + idx_height - PAD_Y >= MAX_HEIGHT); - for(int x = 0; x < POOL_SIZE_X; ++x) - { - int x1 = select(x, PAD_X - idx_width - 1, x + idx_width - PAD_X < 0 || x + idx_width - PAD_X >= MAX_WIDTH); - x1 = select(x1, PAD_X - idx_width - 1, y != y1); - -#if defined(DST_DEPTH) - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) - data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y, 0)); -#else /* defined(DST_DEPTH) */ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) - data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y)); -#endif /* defined(DST_DEPTH) */ - -#if defined(POOL_L2) - // Raise to power of 2 for L2 Pooling - data0 *= data0; -#endif /* defined(POOL_L2) */ - vdata = POOL_OP(vdata, CONVERT(data0, VEC_DATA_TYPE(ACC_DATA_TYPE, 8))); - } - } - -#if defined(POOL_AVG) || defined(POOL_L2) - // Divide by pool region in case of average pooling - vdata = DIV_OP_NHWC(vdata, calculate_avg_scale_nhwc(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); -#endif /* defined(POOL_AVG) || defined(POOL_L2) */ - -#if defined(POOL_L2) - // Take square root of the result in L2 pooling - vdata = SQRT_OP(vdata); -#endif /* defined(POOL_L2) */ - - // Store result - vstore8(CONVERT(vdata, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)output.ptr); -} - #if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) inline void offset_no_padding_nchw(const Tensor3D *input, uint *offset_top, uint *offset_bottom) @@ -631,65 +514,6 @@ inline void offset_no_padding_nchw(const Tensor3D *input, uint *offset_top, uint 
return; } -inline void offset_no_padding_nhwc_3D(const Tensor3D *input, uint *offset_x0, uint *offset_x1, uint *offset_x2, uint *offset_x3) -{ - const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT; - - const int x = get_global_id(0); - const int y = get_global_id(1) * STRIDE_X; - const int z = get_global_id(2) * STRIDE_Y; - - //x axis: component, y axis: width, z axis: height - const uint padded_offset = input->offset_first_element_in_bytes - + x * 8 * input->stride_x - + y * input->stride_y - + z * input->stride_z; - - const uint offset_base = padded_offset - - (z + 1) * PAD_TENSOR_TOP * input->stride_y /* Top padding for each z plane */ - - y * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each row */ - - z * MAX_WIDTH * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each z plane */ - - PAD_TENSOR_LEFT * sizeof(DATA_TYPE); - - *offset_x0 = (uint)offset_base / sizeof(DATA_TYPE); - *offset_x1 = *offset_x0 + input->stride_y / sizeof(DATA_TYPE) - pad_horiz; - *offset_x2 = *offset_x0 + input->stride_z / sizeof(DATA_TYPE) - pad_horiz * MAX_WIDTH - PAD_TENSOR_TOP * input->stride_y / sizeof(DATA_TYPE); - *offset_x3 = *offset_x2 + input->stride_y / sizeof(DATA_TYPE) - pad_horiz; - - return; -} - -#if defined(DST_DEPTH) -inline void offset_no_padding_nhwc_4D(const Tensor4D *input, uint *offset_x0, uint *offset_x1, uint *offset_x2, uint *offset_x3) -{ - const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT; - const int z_max = get_global_size(2) / BATCH_SIZE; - - const int x = get_global_id(0); - const int y = get_global_id(1) * STRIDE_X; - const int z = (get_global_id(2) % z_max) * STRIDE_Y; - const int w = get_global_id(2) / z_max; - - const unsigned int padded_offset = input->offset_first_element_in_bytes - + x * 8 * input->stride_x - + y * input->stride_y - + z * input->stride_z; - - const unsigned int offset_base = padded_offset - - (z + 1) * PAD_TENSOR_TOP * input->stride_y /* Top padding for each z plane */ - - y * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each row */ - - z * MAX_WIDTH * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each z plane */ - - PAD_TENSOR_LEFT * sizeof(DATA_TYPE); - - *offset_x0 = (uint)offset_base / sizeof(DATA_TYPE); - *offset_x1 = *offset_x0 + input->stride_y / sizeof(DATA_TYPE) - pad_horiz; - *offset_x2 = *offset_x0 + input->stride_z / sizeof(DATA_TYPE) - pad_horiz * MAX_WIDTH - PAD_TENSOR_TOP * input->stride_y / sizeof(DATA_TYPE); - *offset_x3 = *offset_x2 + input->stride_y / sizeof(DATA_TYPE) - pad_horiz; - - return; -} -#endif //defined(DST_DEPTH) - #endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) /** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW. @@ -832,115 +656,156 @@ __kernel void pooling_layer_2_nchw_indices_fp16( #endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) } -/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NHWC. +#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE) + +#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) +/** Performs pooling layer of size equal to MxN. This OpenCL kernel can perform the following pooling types: + * -# max, -DPOOL_MAX must be passed at compile time + * -# average, -DPOOL_AVG must be passed at compile time. 
If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
+ * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
  *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32
- * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
- * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
+ * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
+ * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
+ * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
+ * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4
+ * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
+ * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
 * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM
+ * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
  *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor.
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32 - * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes) - * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes) - * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes) - * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] indices_stride_w Stride of the indices tensor in W dimension (in bytes) - * @param[in] indices_step_w indices_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
-__kernel void pooling_layer_2_nhwc_indices_fp32(
+__kernel void pooling_layer_MxN_nhwc(
     TENSOR4D_DECLARATION(input),
-    TENSOR4D_DECLARATION(output),
-    TENSOR4D_DECLARATION(indices))
+    TENSOR4D_DECLARATION(output))
 {
-    // Get pixels pointer
-#if defined(DST_DEPTH)
-    Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DST_DEPTH);
-    Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
-    Tensor4D indices = CONVERT_TO_TENSOR4D_STRUCT(indices, DST_DEPTH);
-#else /* defined(DST_DEPTH) */
-    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-    Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
-#endif /* defined(DST_DEPTH) */
-
-#if defined(DST_DEPTH)
-    // Load data
-    float8 data_top0 = VLOAD(8)(0, (__global float *)tensor4D_offset(&input, 0, 0, 0, 0));
-    float8 data_top1 = VLOAD(8)(0, (__global float *)tensor4D_offset(&input, 0, 1, 0, 0));
-    float8 data_bottom0 = VLOAD(8)(0, (__global float *)tensor4D_offset(&input, 0, 0, 1, 0));
-    float8 data_bottom1 = VLOAD(8)(0, (__global float *)tensor4D_offset(&input, 0, 1, 1, 0));
-#else /* defined(DST_DEPTH) */
-    // Load data
-    float8 data_top0 = VLOAD(8)(0, (__global float *)tensor3D_offset(&input, 0, 0, 0));
-    float8 data_top1 = VLOAD(8)(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
-    float8 data_bottom0 = VLOAD(8)(0, (__global float *)tensor3D_offset(&input, 0, 0, 1));
-    float8 data_bottom1 = VLOAD(8)(0, (__global float *)tensor3D_offset(&input, 0, 1, 1));
-#endif /* defined(DST_DEPTH) */
+    // Note: If C is not a multiple of VEC_SIZE, the thread at get_global_id(0) == 0 computes the VEC_SIZE_LEFTOVER
+    //       leftover elements, and all subsequent threads are shifted back by (VEC_SIZE - VEC_SIZE_LEFTOVER) elements
+    //       (editor's worked example: VEC_SIZE=8, VEC_SIZE_LEFTOVER=3 -> thread 0 starts at element 0 and stores 3
+    //       elements, thread 1 starts at element 3, thread 2 at element 11)
+    // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller supported vector size.
This operation is performed on the host side + int offset_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE); + int idx_out_w = get_global_id(1); +#if DST_BATCH_SIZE != 1 + // If batch size != 1, the batch size dimension is collapsed over the height dimension + int idx_out_h = get_global_id(2) % DST_HEIGHT; + int idx_out_n = get_global_id(2) / DST_HEIGHT; +#else //DST_BATCH_SIZE != 1 + int idx_out_h = get_global_id(2); + int idx_out_n = 0; +#endif // DST_BATCH_SIZE != 1 + + int idx_in_w = idx_out_w * STRIDE_X - PAD_X; + int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y; + + int pool_x_s = max((int)0, -idx_in_w); + int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w); + int pool_y_s = max((int)0, -idx_in_h); + int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h); + + __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + + offset_c + + idx_out_n * input_stride_w; + + __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + + offset_c + + idx_out_w * output_stride_y + + idx_out_h * output_stride_z + + idx_out_n * output_stride_w; + +#if ((defined(POOL_AVG) || defined(POOL_L2))) +#if defined(EXCLUDE_PADDING) + int filter_size = 0; +#else // defined(EXCLUDE_PADDING) + int filter_size = POOL_SIZE_X * POOL_SIZE_Y; +#endif // defined(EXCLUDE_PADDING) +#endif // ((defined(POOL_AVG) || defined(POOL_L2))) - float8 data_top_max = POOL_OP(data_top0, data_top1); - float8 data_bottom_max = POOL_OP(data_bottom0, data_bottom1); - float8 data_max = POOL_OP(data_top_max, data_bottom_max); - vstore8(data_max, 0, (__global float *)output.ptr); + VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) + res0 = INITIAL_VALUE; -#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) + for(int y = pool_y_s; y < pool_y_e; ++y) + { + for(int x = pool_x_s; x < pool_x_e; ++x) + { + VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) data0; +#if defined(FP_MIXED_PRECISION) + // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE + data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); +#else // defined(FP_MIXED_PRECISION) + data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)); +#endif // defined(FP_MIXED_PRECISION) - uint offset_x0 = 0; - uint offset_x1 = 0; - uint offset_x2 = 0; - uint offset_x3 = 0; +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data0 *= data0; +#endif // defined(POOL_L2) + res0 = POOL_OP(res0, data0); -#if defined(DST_DEPTH) - offset_no_padding_nhwc_4D(&input, &offset_x0, &offset_x1, &offset_x2, &offset_x3); -#else /* defined(DST_DEPTH) */ - offset_no_padding_nhwc_3D(&input, &offset_x0, &offset_x1, &offset_x2, &offset_x3); -#endif /* defined(DST_DEPTH) */ +#if ((defined(POOL_AVG) || defined(POOL_L2))) && defined(EXCLUDE_PADDING) + filter_size++; +#endif // ((defined(POOL_AVG) || defined(POOL_L2))) && defined(EXCLUDE_PADDING) + } + } - uint8 voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3, offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 }; - uint8 voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3, offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 }; - uint8 voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3, offset_x2 + 4, 
offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 };
- uint8 voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3, offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 };
+#if defined(POOL_AVG) || defined(POOL_L2)
+ res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
+#endif // defined(POOL_AVG) || defined(POOL_L2)
- uint8 index0 = select(voffset_x1, voffset_x0, isgreaterequal(data_top0, data_top1));
- uint8 index1 = select(voffset_x3, voffset_x2, isgreaterequal(data_bottom0, data_bottom1));
- uint8 index = select(index1, index0, isgreaterequal(data_top_max, data_bottom_max));
- vstore8(index, 0, (__global uint *)indices.ptr);
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res0 = SQRT_OP(res0);
+#endif // defined(POOL_L2)
-#endif /* defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM */
+ // Store result
+#if defined(FP_MIXED_PRECISION)
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+ STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#else // defined(FP_MIXED_PRECISION)
+ STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#endif // defined(FP_MIXED_PRECISION)
 }
+#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
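The MxN NHWC kernel above clamps the pooling window to the valid input region and, with -DEXCLUDE_PADDING, divides by the number of elements it actually visited. A minimal scalar C reference of that logic, assuming an NHWC float tensor within one batch; avg_pool_at is an illustrative helper, not library API:

    #include <stddef.h>

    /* Average pooling at one (c, out_x, out_y) location, mirroring the
     * pool_x_s/pool_x_e/pool_y_s/pool_y_e clamping in the kernel above. */
    static float avg_pool_at(const float *src, int src_w, int src_h, int channels,
                             int c, int out_x, int out_y,
                             int pool_w, int pool_h, int stride_x, int stride_y,
                             int pad_x, int pad_y, int exclude_padding)
    {
        const int in_x = out_x * stride_x - pad_x;
        const int in_y = out_y * stride_y - pad_y;
        const int xs = in_x < 0 ? -in_x : 0;                          /* pool_x_s */
        const int xe = pool_w < src_w - in_x ? pool_w : src_w - in_x; /* pool_x_e */
        const int ys = in_y < 0 ? -in_y : 0;                          /* pool_y_s */
        const int ye = pool_h < src_h - in_y ? pool_h : src_h - in_y; /* pool_y_e */

        float sum = 0.f;
        for(int y = ys; y < ye; ++y)
            for(int x = xs; x < xe; ++x)
                sum += src[((size_t)(y + in_y) * src_w + (size_t)(x + in_x)) * channels + c];

        /* EXCLUDE_PADDING counts only the visited elements; otherwise the full window. */
        const int count = exclude_padding ? (xe - xs) * (ye - ys) : pool_w * pool_h;
        return sum / (float)count;
    }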
+
+#define SELECT_TYPE SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
-/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NHWC.
+/** Performs pooling layer of size equal to 2. This OpenCL kernel can perform the following pooling types:
+ * -# max, -DPOOL_MAX must be passed at compile time
+ * -# max extracting the max index, -DPOOL_MAX and -DEXTRACT_MAX_INDEX must be passed at compile time
+ * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
+ * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
 *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F16
- * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
- * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
+ * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
+ * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
+ * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
+ * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
+ * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM
+ * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
 *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16
 * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -960,79 +825,157 @@ __kernel void pooling_layer_2_nhwc_indices_fp32(
 * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
 * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32
- * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes)
- * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes)
- * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes)
- * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] indices_stride_w Stride of the indices tensor in W dimension (in bytes)
- * @param[in] indices_step_w indices_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ * @param[in] indices_ptr (Optional) Pointer to the indices tensor.
Supported data types: U32
+ * @param[in] indices_stride_x (Optional) Stride of the indices tensor in X dimension (in bytes)
+ * @param[in] indices_step_x (Optional) indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] indices_stride_y (Optional) Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in] indices_step_y (Optional) indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] indices_stride_z (Optional) Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in] indices_step_z (Optional) indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] indices_stride_w (Optional) Stride of the indices tensor in W dimension (in bytes)
+ * @param[in] indices_step_w (Optional) indices_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] indices_offset_first_element_in_bytes (Optional) The offset of the first element in the indices tensor
 */
-__kernel void pooling_layer_2_nhwc_indices_fp16(
+__kernel void pooling_layer_2x2_nhwc(
 TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output),
- TENSOR4D_DECLARATION(indices))
+ TENSOR4D_DECLARATION(output)
+#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+ ,
+ TENSOR4D_DECLARATION(indices)
+#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+)
 {
- // Get pixels pointer
-#if defined(DST_DEPTH)
- Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DST_DEPTH);
- Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
- Tensor4D indices = CONVERT_TO_TENSOR4D_STRUCT(indices, DST_DEPTH);
-#else /* defined(DST_DEPTH) */
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
- Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
-#endif /* defined(DST_DEPTH) */
-
-#if defined(DST_DEPTH)
- // Load data
- half8 data_top0 = VLOAD(8)(0, (__global half *)tensor4D_offset(&input, 0, 0, 0, 0));
- half8 data_top1 = VLOAD(8)(0, (__global half *)tensor4D_offset(&input, 0, 1, 0, 0));
- half8 data_bottom0 = VLOAD(8)(0, (__global half *)tensor4D_offset(&input, 0, 0, 1, 0));
- half8 data_bottom1 = VLOAD(8)(0, (__global half *)tensor4D_offset(&input, 0, 1, 1, 0));
-#else /* defined(DST_DEPTH) */
- // Load data
- half8 data_top0 = VLOAD(8)(0, (__global half *)tensor3D_offset(&input, 0, 0, 0));
- half8 data_top1 = VLOAD(8)(0, (__global half *)tensor3D_offset(&input, 0, 1, 0));
- half8 data_bottom0 = VLOAD(8)(0, (__global half *)tensor3D_offset(&input, 0, 0, 1));
- half8 data_bottom1 = VLOAD(8)(0, (__global half *)tensor3D_offset(&input, 0, 1, 1));
-#endif /* defined(DST_DEPTH) */
+ // Note: If C is not a multiple of VEC_SIZE, we shift back by VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0
+ // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller VEC_SIZE. This operation is performed on the host side
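The two notes above describe the leftover-lane scheme used throughout these kernels: every work-item handles a full VEC_SIZE-wide vector of channels, except work-item 0, which is shifted back so that a partial store of VEC_SIZE_LEFTOVER lanes covers the channel remainder. A host-side sketch of the index arithmetic, assuming a channel count `channels`; first_channel is an illustrative name:

    /* First channel processed by a work-item, matching
     * max(gid * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE, 0). */
    static int first_channel(int gid, int channels, int vec_size)
    {
        const int leftover = channels % vec_size;              /* -DVEC_SIZE_LEFTOVER              */
        const int shift    = (vec_size - leftover) % vec_size; /* 0 when channels % vec_size == 0 */
        const int idx      = gid * vec_size - shift;
        return idx < 0 ? 0 : idx; /* work-item 0 stores only `leftover` lanes when shifted */
    }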
+ int idx_out_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ int idx_out_w = get_global_id(1);
+#if DST_BATCH_SIZE != 1
+ // If batch size != 1, the batch size dimension is collapsed over the height dimension
+ int idx_out_h = get_global_id(2) % DST_HEIGHT;
+ int idx_out_n = get_global_id(2) / DST_HEIGHT;
+#else //DST_BATCH_SIZE != 1
+ int idx_out_h = get_global_id(2);
+ int idx_out_n = 0;
+#endif // DST_BATCH_SIZE != 1
+
+ int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
+ int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
+
+ __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes +
+ idx_out_c * sizeof(DATA_TYPE) +
+ idx_out_n * input_stride_w;
+
+ __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes +
+ idx_out_c * sizeof(DATA_TYPE) +
+ idx_out_w * output_stride_y +
+ idx_out_h * output_stride_z +
+ idx_out_n * output_stride_w;
+
+ int pool_x_s = max((int)0, -idx_in_w);
+ int pool_x_e = min((int)2, (int)SRC_WIDTH - idx_in_w);
+ int pool_y_s = max((int)0, -idx_in_h);
+ int pool_y_e = min((int)2, (int)SRC_HEIGHT - idx_in_h);
+
+ int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s);
+
+ int x0 = pool_x_s + idx_in_w;
+ int y0 = pool_y_s + idx_in_h;
+ int x1 = pool_x_e - 1 + idx_in_w;
+ int y1 = pool_y_e - 1 + idx_in_h;
+
+ REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE), data, 0);
- half8 data_top_max = POOL_OP(data_top0, data_top1);
- half8 data_bottom_max = POOL_OP(data_bottom0, data_bottom1);
- half8 data_max = POOL_OP(data_top_max, data_bottom_max);
- vstore8(data_max, 0, (__global half *)output.ptr);
+#if defined(FP_MIXED_PRECISION)
+ // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
+ data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+ data1 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+ data2 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+ data3 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+#else // defined(FP_MIXED_PRECISION)
+ data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z));
+ data1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z));
+ data2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z));
+ data3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z));
+#endif // defined(FP_MIXED_PRECISION)
+
+#if !defined(POOL_MAX)
+ if(filter_size != 4)
+ {
+ SELECT_TYPE cond_w_s = (SELECT_TYPE)idx_in_w < (SELECT_TYPE)0;
+ SELECT_TYPE cond_w_e = (SELECT_TYPE)idx_in_w >= (SELECT_TYPE)(SRC_WIDTH - 1);
+ SELECT_TYPE cond_h_s = (SELECT_TYPE)idx_in_h < (SELECT_TYPE)0;
+ SELECT_TYPE cond_h_e = (SELECT_TYPE)idx_in_h >= (SELECT_TYPE)(SRC_HEIGHT - 1);
+
+ // Invalidate the loaded values if the x or y coordinate was clamped (out-of-bounds)
+ data0 = select(data0, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_s));
+ data1 = select(data1,
(VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_s)); + data2 = select(data2, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_e)); + data3 = select(data3, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_e)); + } +#endif // !defined(POOL_MAX) -#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data0 *= data0; + data1 *= data1; + data2 *= data2; + data3 *= data3; +#endif /* defined(POOL_L2) */ + + VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) + res0 = data0; + res0 = POOL_OP(res0, data1); + res0 = POOL_OP(res0, data2); + res0 = POOL_OP(res0, data3); - uint offset_x0_int = 0; - uint offset_x1_int = 0; - uint offset_x2_int = 0; - uint offset_x3_int = 0; - -#if defined(DST_DEPTH) - offset_no_padding_nhwc_4D(&input, &offset_x0_int, &offset_x1_int, &offset_x2_int, &offset_x3_int); -#else /* defined(DST_DEPTH) */ - offset_no_padding_nhwc_3D(&input, &offset_x0_int, &offset_x1_int, &offset_x2_int, &offset_x3_int); -#endif /* defined(DST_DEPTH) */ - - ushort offset_x0 = (ushort)offset_x0_int; - ushort offset_x1 = (ushort)offset_x1_int; - ushort offset_x2 = (ushort)offset_x2_int; - ushort offset_x3 = (ushort)offset_x3_int; - - ushort8 voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3, offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 }; - ushort8 voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3, offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 }; - ushort8 voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3, offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 }; - ushort8 voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3, offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 }; - - ushort8 index0 = select(voffset_x1, voffset_x0, isgreaterequal(data_top0, data_top1)); - ushort8 index1 = select(voffset_x3, voffset_x2, isgreaterequal(data_bottom0, data_bottom1)); - ushort8 index = select(index1, index0, isgreaterequal(data_top_max, data_bottom_max)); - vstore8(CONVERT(index, uint8), 0, (__global uint *)indices.ptr); - -#endif /* defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM */ -} \ No newline at end of file +#if defined(POOL_AVG) || defined(POOL_L2) +#if defined(EXCLUDE_PADDING) + res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size; +#else // !defined(EXCLUDE_PADDING) + res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))4; +#endif // defined(EXCLUDE_PADDING) +#endif // defined(POOL_AVG) || defined(POOL_L2) + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res0 = SQRT_OP(res0); +#endif // defined(POOL_L2) + + // Store result +#if defined(FP_MIXED_PRECISION) + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); + STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); +#else // defined(FP_MIXED_PRECISION) + STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); +#endif // defined(FP_MIXED_PRECISION) + +#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX) + + // This part is used to return the index of the maximum value + // Note: DST_CHANNELS and DST_BATCH_SIZE can 
be used for both the input and output tensors
+
+ // Note: The batch dimension does not contribute to the offset
+ VEC_DATA_TYPE(uint, VEC_SIZE) base_index = (uint)idx_out_c;
+
+ base_index += VEC_OFFS(uint, VEC_SIZE);
+
+ VEC_DATA_TYPE(uint, VEC_SIZE) index0 = base_index + (uint)x0 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
+ VEC_DATA_TYPE(uint, VEC_SIZE) index1 = base_index + (uint)x1 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
+ VEC_DATA_TYPE(uint, VEC_SIZE) index2 = base_index + (uint)x0 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
+ VEC_DATA_TYPE(uint, VEC_SIZE) index3 = base_index + (uint)x1 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
+
+ index0 = select(index1, index0, CONVERT(isgreaterequal(data0, data1), VEC_DATA_TYPE(int, VEC_SIZE)));
+ index1 = select(index3, index2, CONVERT(isgreaterequal(data2, data3), VEC_DATA_TYPE(int, VEC_SIZE)));
+ index0 = select(index1, index0, CONVERT(isgreaterequal(max(data0, data1), max(data2, data3)), VEC_DATA_TYPE(int, VEC_SIZE)));
+
+ __global unsigned char *idx_base_ptr = indices_ptr + indices_offset_first_element_in_bytes +
+ idx_out_c * sizeof(uint) +
+ idx_out_w * indices_stride_y +
+ idx_out_h * indices_stride_z +
+ idx_out_n * indices_stride_w;
+
+ // Store result
+ STORE_VECTOR_SELECT(index, uint, idx_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0));
+#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+}
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
\ No newline at end of file
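The max index extracted by the 2x2 kernel above is a linear NHWC offset within one batch: the channel, plus x scaled by the channel count, plus y scaled by channels times width. A scalar check of that linearization, assuming C = DST_CHANNELS and W = SRC_WIDTH; the helper name is illustrative:

    /* Linear NHWC index within one batch, matching base_index + x * C + y * (C * W). */
    static unsigned nhwc_index(unsigned c, unsigned x, unsigned y, unsigned C, unsigned W)
    {
        return c + x * C + y * C * W; /* the batch dimension is deliberately excluded */
    }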
diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
index fe13464b1e..d8cef2b4e6 100644
--- a/src/core/CL/cl_kernels/pooling_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
@@ -47,8 +47,6 @@
 #define DIV_OP(x, y) (x * (1.f / y))
-#define DIV_OP_NHWC(x, y) (convert_float8(x) * (float8)(1.f / y))
-
 #if defined(POOL_L2)
 #error "L2 pooling is not supported"
 #endif /* defined(POOL_L2) */
@@ -155,34 +153,22 @@ __kernel void pooling_layer_MxN_quantized_nchw(
 *(__global DATA_TYPE *)output.ptr = result_q8;
 }
-int calculate_avg_scale_nhwc(const int pool_size_x, const int pool_size_y, int upper_bound_w, int upper_bound_h,
- const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
- int start_x = get_global_id(1) * stride_x - pad_x;
-#if defined(DST_DEPTH)
- int start_y = (get_global_id(2) % DST_DEPTH) * stride_y - pad_y;
-#else /* defined(DST_DEPTH) */
- int start_y = get_global_id(2) * stride_y - pad_y;
-#endif /* defined(DST_DEPTH) */
-
- const int end_x = min(start_x + pool_size_x, upper_bound_w);
- const int end_y = min(start_y + pool_size_y, upper_bound_h);
-
- start_x = max(0, start_x);
- start_y = max(0, start_y);
-
- return ((end_y - start_y) * (end_x - start_x));
-}
-
-/** Performs a pooling function of pool size equal to N (NHWC)
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
+/** Performs pooling layer of size equal to MxN. This OpenCL kernel can perform the following pooling types:
+ * -# max, -DPOOL_MAX must be passed at compile time
+ * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
 *
- * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
- * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
- * @note Strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * @note Pad values must be passed at compile time using -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- * @note In case of average pooling the following information must be passed at compile time:
- * -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=uchar. Supported data types are QASYMM8/QASYMM8_SIGNED
+ * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=int
+ * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4
+ * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
+ * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
 * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ * @note If the output has to be requantized, -DOFFSET_IN1, -DOFFSET_OUT, -DSCALE_IN1 and -DSCALE_OUT must be passed at compile time
 *
 * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
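When OFFSET_IN1/OFFSET_OUT/SCALE_IN1/SCALE_OUT are defined, the kernel requantizes the pooled value from the input (scale, offset) pair to the output pair, as the requantization note above states. A hedged scalar model of that mapping, with saturation omitted; requantize is an illustrative name:

    #include <math.h>

    static int requantize(int q, int offset_in, float scale_in, int offset_out, float scale_out)
    {
        const float real = scale_in * (float)(q - offset_in); /* dequantize to the real domain  */
        return (int)lrintf(real / scale_out) + offset_out;    /* quantize with the output params */
    }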
@@ -209,57 +195,72 @@ __kernel void pooling_layer_MxN_quantized_nhwc(
 TENSOR4D_DECLARATION(input),
 TENSOR4D_DECLARATION(output))
 {
- // Get pixels pointer
-#if defined(DST_DEPTH)
- Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DST_DEPTH);
- Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
-#else /* defined(DST_DEPTH) */
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-#endif /* defined(DST_DEPTH) */
+ // Note: If C is not a multiple of VEC_SIZE, we shift back by VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0
+ // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller VEC_SIZE. This operation is performed on the host side
+ int offset_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
+ int idx_out_w = get_global_id(1);
+#if DST_BATCH_SIZE != 1
+ // If batch size != 1, the batch size dimension is collapsed over the height dimension
+ int idx_out_h = get_global_id(2) % DST_HEIGHT;
+ int idx_out_n = get_global_id(2) / DST_HEIGHT;
+#else //DST_BATCH_SIZE != 1
+ int idx_out_h = get_global_id(2);
+ int idx_out_n = 0;
+#endif // DST_BATCH_SIZE != 1
- int8 vdata = INITIAL_VALUE;
+ int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
+ int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
+
+ __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + offset_c + idx_out_n * input_stride_w;
+
+ __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + offset_c + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n * output_stride_w;
+
+ int pool_x_s = max((int)0, -idx_in_w);
+ int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
+ int pool_y_s = max((int)0, -idx_in_h);
+ int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
+
+#if defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+ int filter_size = 0;
+#elif defined(POOL_AVG) && !defined(EXCLUDE_PADDING) // defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+ int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
+#endif // defined(POOL_AVG) && !defined(EXCLUDE_PADDING)
- const int idx_width = get_global_id(1) * STRIDE_X;
-#if defined(DST_DEPTH)
- const int idx_height = (get_global_id(2) % DST_DEPTH) * STRIDE_Y;
-#else /* defined(DST_DEPTH) */
- const int idx_height = get_global_id(2) * STRIDE_Y;
-#endif /* defined(DST_DEPTH) */
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ res0 = INITIAL_VALUE;
- for(int y = 0; y < POOL_SIZE_Y; ++y)
+ for(int y = pool_y_s; y < pool_y_e; ++y)
 {
- int y1 = select(y, PAD_Y - idx_height, y + idx_height - PAD_Y < 0 || y + idx_height - PAD_Y >= MAX_HEIGHT);
- for(int x = 0; x < POOL_SIZE_X; ++x)
+ for(int x = pool_x_s; x < pool_x_e; ++x)
 {
- int x1 = select(x, PAD_X - idx_width - 1, x + idx_width - PAD_X < 0 || x + idx_width - PAD_X >= MAX_WIDTH);
- x1 = select(x1, PAD_X - idx_width - 1, y != y1);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data;
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ data0;
-#if defined(DST_DEPTH)
- VEC_TYPE(8)
- data = vload8(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y, 0));
-#else /* defined(DST_DEPTH) */
- VEC_TYPE(8)
- data = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y));
-#endif /* defined(DST_DEPTH) */
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
+ data0 = CONVERT(data, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
- int8 data0 = convert_int8(data);
- vdata = POOL_OP(vdata, data0);
+ res0 = POOL_OP(res0, data0);
+
+#if defined(POOL_AVG) && defined(EXCLUDE_PADDING)
+ filter_size++;
+#endif // defined(POOL_AVG) && defined(EXCLUDE_PADDING)
 }
 }
 #if defined(POOL_AVG)
- // Divide by pool region in case of average pooling
- vdata = convert_int8(round(DIV_OP_NHWC(vdata, calculate_avg_scale_nhwc(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y))));
-#endif /* defined(POOL_AVG) */
+ res0 = (res0 + (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))(filter_size >> 1)) / filter_size;
+#endif // defined(POOL_AVG)
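The POOL_AVG branch above divides with rounding: adding filter_size >> 1 before the integer division rounds a non-negative accumulator to the nearest integer instead of truncating toward zero. A scalar sketch of the same idiom (exact for non-negative sums, which is the QASYMM8 case):

    static int div_round_half_up(int sum, int n)
    {
        return (sum + (n >> 1)) / n; /* e.g. (7 + 1) / 2 == 4, while plain 7 / 2 == 3 */
    }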
- VEC_TYPE(8)
- out_q8 = CONVERT(vdata, VEC_TYPE(8));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ out_q0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
 #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
- REQUANTIZE(8, out_q8, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_q8);
+ REQUANTIZE(VEC_SIZE, out_q0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_q0);
 #endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
 // Store result
- vstore8(out_q8, 0, (__global DATA_TYPE *)output.ptr);
+ STORE_VECTOR_SELECT(out_q, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0));
 }
-#endif /* defined(DATA_TYPE) && defined(INITIAL_VALUE) */
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
+#endif // defined(DATA_TYPE) && defined(INITIAL_VALUE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/repeat.h b/src/core/CL/cl_kernels/repeat.h
index 59bf5b9d8e..bed94a7b3b 100644
--- a/src/core/CL/cl_kernels/repeat.h
+++ b/src/core/CL/cl_kernels/repeat.h
@@ -134,6 +134,10 @@
 #define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL)
 // Macro for initializing N variables by converting the data type. Generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...)
+#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT)
+#define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT, TYPE_OUT, VAR_IN, VAR_OUT)
+
+// Macro for initializing N variables by converting the data type with saturation. Generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...)
 #define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT)
 #define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT)
diff --git a/src/core/CL/cl_kernels/select.cl b/src/core/CL/cl_kernels/select.cl
index 52ef81560a..b06a1118a8 100644
--- a/src/core/CL/cl_kernels/select.cl
+++ b/src/core/CL/cl_kernels/select.cl
@@ -23,11 +23,10 @@
 */
 #include "helpers.h"
-#if defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(VEC_SIZE)
+#if defined(DATA_TYPE) && defined(VEC_SIZE)
 /** This function perform a select operation between two tensors when condition tensor has the same rank.
 *
 * @attention The data_type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
- * @attention The select operation data_type need to be passed at compile time using -DSELECT_DATA_TYPE: e.g. -DSELECT_DATA_TYPE=uchar
 * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
 *
 * @param[in] c_ptr Pointer to the source tensor.
Supported data types: U8 @@ -76,8 +75,8 @@ __kernel void select_same_rank( Tensor3D out_t = CONVERT_TO_TENSOR3D_STRUCT(out); // Load values - VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE) - in_c = CONVERT((VLOAD(VEC_SIZE)(0, (__global uchar *)c_t.ptr)), VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE)); + SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + in_c = CONVERT((VLOAD(VEC_SIZE)(0, (__global uchar *)c_t.ptr)), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_t.ptr); VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) @@ -85,13 +84,12 @@ __kernel void select_same_rank( // Calculate and store result VSTORE(VEC_SIZE) - (select(in_y, in_x, in_c > (SELECT_DATA_TYPE)0), 0, (__global DATA_TYPE *)out_t.ptr); + (select(in_y, in_x, in_c > (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0), 0, (__global DATA_TYPE *)out_t.ptr); } /** This function perform a select operation between two tensors when condition tensor has a different rank. * * @attention The data_type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar - * @attention The select operation data_type need to be passed at compile time using -DSELECT_DATA_TYPE: e.g. -DSELECT_DATA_TYPE=uchar * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 * * @param[in] c_ptr Pointer to the source tensor. Supported data types: U8 @@ -138,7 +136,7 @@ __kernel void select_different_rank_2( Tensor3D out_t = CONVERT_TO_TENSOR3D_STRUCT(out); // Load values - VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE) + SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) in_c = *((__global uchar *)(c_t.ptr + c_idx * c_t.stride_x)); VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_t.ptr); @@ -147,15 +145,14 @@ __kernel void select_different_rank_2( // Calculate and store result VSTORE(VEC_SIZE) - (select(in_y, in_x, in_c > (SELECT_DATA_TYPE)0), 0, (__global DATA_TYPE *)out_t.ptr); + (select(in_y, in_x, in_c > (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0), 0, (__global DATA_TYPE *)out_t.ptr); } -#endif /* defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(VEC_SIZE) */ +#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) */ -#if defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_SIZE) +#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_SIZE) /** This function perform a select operation between two tensors when condition tensor has a different rank. * * @attention The data_type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar - * @attention The select operation data_type need to be passed at compile time using -DSELECT_DATA_TYPE: e.g. -DSELECT_DATA_TYPE=uchar * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 * * @param[in] c_ptr Pointer to the source tensor. 
Supported data types: U8 @@ -202,7 +199,7 @@ __kernel void select_different_rank_n( Tensor3D out_t = CONVERT_TO_TENSOR3D_STRUCT(out); // Load values - VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE) + SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) in_c = *((__global uchar *)(c_t.ptr + c_idx * c_t.stride_x)); VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_t.ptr); @@ -211,6 +208,6 @@ __kernel void select_different_rank_n( // Calculate and store result VSTORE(VEC_SIZE) - (select(in_y, in_x, in_c > (SELECT_DATA_TYPE)0), 0, (__global DATA_TYPE *)out_t.ptr); + (select(in_y, in_x, in_c > (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0), 0, (__global DATA_TYPE *)out_t.ptr); } -#endif /* defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_SIZE) */ \ No newline at end of file +#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_SIZE) */ \ No newline at end of file diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl index 77dbb47e41..01f5de47cf 100644 --- a/src/core/CL/cl_kernels/softmax_layer.cl +++ b/src/core/CL/cl_kernels/softmax_layer.cl @@ -23,55 +23,15 @@ */ #include "helpers.h" -#define MAX_OP(x, y, type, size) max((x), (y)) -#define ADD_OP(x, y, type, size) ((x) + (y)) -#define SUB_OP(x, y, type, size) ((x) - (y)) -#define MUL_OP(x, y, type, size) ((x) * (y)) -#define DIV_OP(x, y, type, size) ((x) / (y)) -#define EXP_OP(x, type, size) exp((x)) - -#ifdef USE_F16 -#define MINVAL -HALF_MAX -#define SELECT_DATA_TYPE short -#else /* USE_F16 */ -#define MINVAL -FLT_MAX -#define SELECT_DATA_TYPE int -#endif /* USE_F16 */ - -/* Number of workitems in dimension 0. */ -#if !defined(GRID_SIZE) -#define GRID_SIZE 1 -#endif /* !defined(GRID_SIZE) */ - -/* Vector size, i.e. number of vector elements. */ -#if VECTOR_SIZE == 2 -__constant VEC_DATA_TYPE(DATA_TYPE, 2) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 2))(MINVAL); -__constant uint2 idx__ = (uint2)(0, 1); - -#elif VECTOR_SIZE == 4 -__constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL); -__constant uint4 idx__ = (uint4)(0, 1, 2, 3); - -#elif VECTOR_SIZE == 8 -__constant VEC_DATA_TYPE(DATA_TYPE, 8) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 8))(MINVAL); -__constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7); - -#else /* VECTOR_SIZE DEFAULT */ -#define VECTOR_SIZE 16 -#define LOG_VECTOR_SIZE 4 -__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL); -__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - -#endif /* VECTOR_SIZE END */ - -// TODO (COMPMID-661): Remove if the non-fused kernels are removed -__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL); -__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); -__constant uint4 idx4 = (uint4)(0, 1, 2, 3); +#if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) /** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel. * - * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float + * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. 
-DMIN_VALUE=0
+ * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
+ * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
 *
 * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -103,28 +63,49 @@ __kernel void softmax_layer_norm(
 TENSOR3D_DECLARATION(src),
 TENSOR3D_DECLARATION(sum),
 TENSOR3D_DECLARATION(dst))
 {
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ const int x_offs = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0) * sizeof(DATA_TYPE);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+
 Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
 // Load max value of 1D logits vector (row)
 DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
- VEC_DATA_TYPE(DATA_TYPE, 16)
- data = vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0));
-#ifdef LOG_SOFTMAX
+ VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+ data0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
+
+#if defined(LOG_SOFTMAX)
 sum_val = log(sum_val);
- vstore16(SUB_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
-#else /* LOG_SOFTMAX */
- vstore16(DIV_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
-#endif /* LOG_SOFTMAX */
+ data0 -= sum_val;
+#else // defined(LOG_SOFTMAX)
+ data0 /= sum_val;
+#endif // defined(LOG_SOFTMAX)
+
+ STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
 }
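softmax_layer_norm above finishes the softmax: every element written by the shift-exp-sum kernels is divided by the row sum, or has log(sum) subtracted in the log-softmax variant. A scalar C model of that normalisation, purely illustrative:

    #include <math.h>

    static void softmax_norm(float *row, int n, float sum, int log_softmax)
    {
        for(int i = 0; i < n; ++i)
            row[i] = log_softmax ? row[i] - logf(sum) : row[i] / sum;
    }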
+#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL)
+
+/* Number of workitems in dimension 0. */
+#if !defined(GRID_SIZE)
+#define GRID_SIZE 1
+#endif /* !defined(GRID_SIZE) */
+
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+
 /** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
 * then gets the exponent of each element as sums all elements across each row.
 *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
+ * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
+ * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
+ * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
+ * @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX
 *
 * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -158,136 +139,102 @@ __kernel void softmax_layer_norm(
 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[in] width Input image width
 */
 __kernel void softmax_layer_max_shift_exp_sum_serial(
 TENSOR3D_DECLARATION(src),
 TENSOR3D_DECLARATION(maxo),
 TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(sum),
- uint width)
+ TENSOR3D_DECLARATION(sum))
 {
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+
 Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
 Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
 #ifdef BETA
 // Initialize beta
- VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
- beta = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))BETA;
+ VEC_TYPE beta = (VEC_TYPE)BETA;
 #endif /* BETA */
 // Initialize local maximum
- VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
- max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))type_min_;
-
- // Calculate max of row
- const uint width_ = width >> LOG_VECTOR_SIZE;
- for(uint i = 0; i < width_; i++)
- {
- VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
- data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
- max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, VECTOR_SIZE);
- }
+ VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL);
 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
- data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
- VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE)
- widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
- max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, VECTOR_SIZE);
+ VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
+ SELECT_TYPE widx = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE);
+ max_val_vec = max(max_val_vec, select((VEC_TYPE)(MINVAL), data, widx));
 #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+ for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
+ {
+ VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
+ max_val_vec = max(data, max_val_vec);
+ }
+
 // Perform max reduction
-#if VECTOR_SIZE == 16
- max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, DATA_TYPE, 8);
-#endif /* VECTOR SIZE 16 END */
-#if VECTOR_SIZE >= 8
- max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, DATA_TYPE, 4);
-#endif
/* VECTOR SIZE 8 END */ -#if VECTOR_SIZE >= 4 - max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2); -#endif /* VECTOR SIZE 4 END */ - max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1); - // Store result - *((__global DATA_TYPE *)maxo.ptr) = max_val_vec.s0; + DATA_TYPE max_val = MAX_REDUCE(max_val_vec, VECTOR_SIZE); + *((__global DATA_TYPE *)maxo.ptr) = max_val; /* Second section */ - // Load max value of 1D logits vector (row) - DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&maxo, 0, 0)); - // Set sum vector - VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) - sum1D = 0; + VEC_TYPE sum1D = 0; - // Shift values, exp and sum - for(uint i = 0; i < width_; i++) - { - VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) - data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0)); - data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE); +#ifdef NON_MULTIPLE_OF_VECTOR_SIZE + data -= max_val; #ifdef BETA - data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE); + data *= beta; #endif /* BETA */ #ifdef LOG_SOFTMAX - VSTORE(VECTOR_SIZE) - (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0)); - data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE); + VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER) + (data, 0, (__global DATA_TYPE *)dst_addr); + data = exp(data); + data = select(0, data, widx); #else /* LOG_SOFTMAX */ - data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE); - VSTORE(VECTOR_SIZE) - (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0)); + data = exp(data); + data = select(0, data, widx); + VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER) + (data, 0, (__global DATA_TYPE *)dst_addr); #endif /* LOG_SOFTMAX */ - sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE); - } + sum1D += data; +#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ -#ifdef NON_MULTIPLE_OF_VECTOR_SIZE - VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) - data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0)); - data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE); + // Shift values, exp and sum + for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE) + { + VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE))); + data -= max_val; #ifdef BETA - data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE); + data *= beta; #endif /* BETA */ #ifdef LOG_SOFTMAX - VSTORE(VECTOR_SIZE) - (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0)); - data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE); - widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE)); - data = select(0, data, widx); + VSTORE(VECTOR_SIZE) + (data, 0, (__global DATA_TYPE *)(dst_addr + i * sizeof(DATA_TYPE))); + data = exp(data); #else /* LOG_SOFTMAX */ - data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE); - widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE)); - data = select(0, data, widx); - VSTORE(VECTOR_SIZE) - (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0)); + data = exp(data); + VSTORE(VECTOR_SIZE) + (data, 0, (__global DATA_TYPE *)(dst_addr + i * sizeof(DATA_TYPE))); #endif /* LOG_SOFTMAX */ - sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE); -#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ + sum1D += data; + } // Perform sum reduction -#if VECTOR_SIZE == 16 - sum1D.s01234567 = ADD_OP(sum1D.s01234567, 
sum1D.s89ABCDEF, DATA_TYPE, 8); -#endif /* VECTOR SIZE 16 END */ -#if VECTOR_SIZE >= 8 - sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4); -#endif /* VECTOR SIZE 8 END */ -#if VECTOR_SIZE >= 4 - sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2); -#endif /* VECTOR SIZE 4 END */ - sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1); - - // Calculate and store result - *((__global DATA_TYPE *)sum.ptr) = sum1D.s0; + *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE); } /** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value, * then gets the exponent of each element as sums all elements across each row. * - * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float + * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0 + * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed. * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0). + * @note In case of log softmax, -DLOG_SOFTMAX must be passed. + * @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX * * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -321,71 +268,59 @@ __kernel void softmax_layer_max_shift_exp_sum_serial( * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor - * @param[in] width Input image width */ __kernel void softmax_layer_max_shift_exp_sum_parallel( TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(maxo), TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(sum), - uint width) + TENSOR3D_DECLARATION(sum)) { - Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); + const uint lid = get_local_id(0); + const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE) * sizeof(DATA_TYPE); + + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; + Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo); Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum); - const uint lid = get_local_id(0); - #ifdef BETA // Initialize beta - VEC_DATA_TYPE(DATA_TYPE, 4) - beta = (VEC_DATA_TYPE(DATA_TYPE, 4))BETA; + VEC_TYPE beta = (VEC_TYPE)BETA; #endif /* BETA */ // Define one temporary vector per work-item. 
- __local VEC_DATA_TYPE(DATA_TYPE, 4) tmp_local[GRID_SIZE]; + __local VEC_TYPE tmp_local[GRID_SIZE]; __local DATA_TYPE max_local; - __constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL); - VEC_DATA_TYPE(DATA_TYPE, 4) - max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, 4))type_min4; - // Number of elements per work-item. - const uint row = width / GRID_SIZE; + VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL); + // Number of iterations per work-item. - const uint width_ = row >> 2; + const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE; // Calculate max of row uint i = 0; - for(; i < width_; i++) + for(; i < width; ++i) { - VEC_DATA_TYPE(DATA_TYPE, 4) - data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0)); - max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4); + VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); + max_val_vec = max(data_max, max_val_vec); } #ifdef NON_MULTIPLE_OF_GRID_SIZE // How many work-items needed to complete the computation. //TODO: Optimize this calculation (avoid %). - int boundary_workitems = (width % (GRID_SIZE * 4)) / 4; + int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE; if(lid < boundary_workitems) { - VEC_DATA_TYPE(DATA_TYPE, 4) - data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0)); - max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4); + VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); + max_val_vec = max(data_max, max_val_vec); } #ifdef NON_MULTIPLE_OF_VECTOR_SIZE - if(boundary_workitems == 0) - { - boundary_workitems = GRID_SIZE; - i--; - } - if(lid == (boundary_workitems - 1)) + SELECT_TYPE widx; + if(lid == 0) { // Handle non multiple of 4 - VEC_DATA_TYPE(DATA_TYPE, 4) - data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0)); - VEC_DATA_TYPE(SELECT_DATA_TYPE, 4) - widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)); - max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, 4); + VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE))); + widx = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE); + max_val_vec = max(max_val_vec, select((VEC_TYPE)(MINVAL), data_max, widx)); } #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ #endif /* NON_MULTIPLE_OF_GRID_SIZE */ @@ -397,7 +332,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel( { if(lid < 128) { - tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4); + tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]); } barrier(CLK_LOCAL_MEM_FENCE); } @@ -405,7 +340,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel( { if(lid < 64) { - tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4); + tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]); } barrier(CLK_LOCAL_MEM_FENCE); } @@ -413,7 +348,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel( { if(lid < 32) { - tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4); + tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]); } barrier(CLK_LOCAL_MEM_FENCE); } @@ -421,7 +356,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel( { if(lid < 16) { - tmp_local[lid] = 
MAX_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4); + tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]); } barrier(CLK_LOCAL_MEM_FENCE); } @@ -429,7 +364,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel( { if(lid < 8) { - tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4); + tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]); } barrier(CLK_LOCAL_MEM_FENCE); } @@ -437,7 +372,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel( { if(lid < 4) { - tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4); + tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]); } barrier(CLK_LOCAL_MEM_FENCE); } @@ -445,99 +380,84 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel( { if(lid < 2) { - tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4); + tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]); } barrier(CLK_LOCAL_MEM_FENCE); } if(lid == 0) { - max_val_vec = MAX_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4); - max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2); - max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1); - max_local = max_val_vec.s0; + max_val_vec = max(tmp_local[lid + 1], tmp_local[lid]); + max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE); } barrier(CLK_LOCAL_MEM_FENCE); /* Second section */ // Set sum vector - VEC_DATA_TYPE(DATA_TYPE, 4) - sum1D = 0; + VEC_TYPE sum1D = 0; DATA_TYPE max_val = max_local; // Shift values, exp and sum - for(i = 0; i < width_; i++) + for(i = 0; i < width; ++i) { - VEC_DATA_TYPE(DATA_TYPE, 4) - data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0)); - data = SUB_OP(data, max_val, DATA_TYPE, 4); + VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); + data -= max_val; #ifdef BETA - data = MUL_OP(data, beta, DATA_TYPE, 4); + data *= beta; #endif /* BETA */ #ifdef LOG_SOFTMAX - VSTORE(4) - (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0)); - data = EXP_OP(data, DATA_TYPE, 4); + VSTORE(VECTOR_SIZE) + (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); + data = exp(data); #else /* LOG_SOFTMAX */ - data = EXP_OP(data, DATA_TYPE, 4); - VSTORE(4) - (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0)); + data = exp(data); + VSTORE(VECTOR_SIZE) + (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); #endif /* LOG_SOFTMAX */ - sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4); + sum1D += data; } #ifdef NON_MULTIPLE_OF_GRID_SIZE //TODO: Optimize the calculation (avoid %). 
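Both the serial and parallel kernels implement the same two-pass scheme: find the row maximum, then accumulate exp(beta * (x - max)) while writing the shifted exponentials to the destination; the unrolled GRID_SIZE branches above and below are a shared-memory tree reduction that halves the active work-items each round. A host-side scalar sketch of both ideas, assuming beta defaults to 1.0 as the kernel docs state; the helper names are illustrative:

    #include <math.h>

    /* Two-pass shift/exp/sum over one row, as the serial kernel does. */
    static float shift_exp_sum(const float *src, float *dst, int n, float beta)
    {
        float max_val = src[0];
        for(int i = 1; i < n; ++i)
            max_val = src[i] > max_val ? src[i] : max_val;

        float sum = 0.f;
        for(int i = 0; i < n; ++i)
        {
            const float e = expf(beta * (src[i] - max_val));
            dst[i] = e; /* the LOG_SOFTMAX variant stores beta * (src[i] - max_val) instead */
            sum += e;
        }
        return sum;
    }

    /* Lock-step model of the reduction ladder; n must be a power of two. */
    static float tree_reduce_sum(float *buf, int n)
    {
        for(int s = n / 2; s >= 1; s /= 2)
            for(int i = 0; i < s; ++i) /* one iteration per active work-item */
                buf[i] += buf[i + s];
        return buf[0];
    }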
- boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
+ boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
 if(lid < boundary_workitems)
 {
- VEC_DATA_TYPE(DATA_TYPE, 4)
- data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
- data = SUB_OP(data, max_val, DATA_TYPE, 4);
+ VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
+ data -= max_val;
 #ifdef BETA
- data = MUL_OP(data, beta, DATA_TYPE, 4);
+ data *= beta;
 #endif /* BETA */
 #ifdef LOG_SOFTMAX
- VSTORE(4)
- (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
- data = EXP_OP(data, DATA_TYPE, 4);
+ VSTORE(VECTOR_SIZE)
+ (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
+ data = exp(data);
 #else /* LOG_SOFTMAX */
- data = EXP_OP(data, DATA_TYPE, 4);
- VSTORE(4)
- (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
+ data = exp(data);
+ VSTORE(VECTOR_SIZE)
+ (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
 #endif /* LOG_SOFTMAX */
- sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
+ sum1D += data;
 }
 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- if(boundary_workitems == 0)
- {
- boundary_workitems = GRID_SIZE;
- i--;
- }
- if(lid == (boundary_workitems - 1))
+ if(lid == 0)
 {
 // Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 float positions ahead, *4 is due to the stride
- VEC_DATA_TYPE(DATA_TYPE, 4)
- data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
- data = SUB_OP(data, max_val, DATA_TYPE, 4);
+ VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
+ data -= max_val;
 #ifdef BETA
- data = MUL_OP(data, beta, DATA_TYPE, 4);
+ data *= beta;
 #endif /* BETA */
 #ifdef LOG_SOFTMAX
- VSTORE(4)
- (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
- data = EXP_OP(data, DATA_TYPE, 4);
- VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
- widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
+ VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
+ (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
+ data = exp(data);
 data = select(0, data, widx);
 #else /* LOG_SOFTMAX */
- data = EXP_OP(data, DATA_TYPE, 4);
- VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
- widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
+ data = exp(data);
 data = select(0, data, widx);
- VSTORE(4)
- (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
+ VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
+ (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
 #endif /* LOG_SOFTMAX */
- sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
+ sum1D += data;
 }
 #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
 #endif /* NON_MULTIPLE_OF_GRID_SIZE */
@@ -549,7 +469,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
 {
 if(lid < 128)
 {
- tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] += tmp_local[lid + 128];
 }
 barrier(CLK_LOCAL_MEM_FENCE);
 }
@@ -557,7 +477,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
 {
 if(lid < 64)
 {
- tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
+ tmp_local[lid] += tmp_local[lid + 64];
 }
barrier(CLK_LOCAL_MEM_FENCE); } @@ -565,7 +485,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel( { if(lid < 32) { - tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4); + tmp_local[lid] += tmp_local[lid + 32]; } barrier(CLK_LOCAL_MEM_FENCE); } @@ -573,7 +493,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel( { if(lid < 16) { - tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4); + tmp_local[lid] += tmp_local[lid + 16]; } barrier(CLK_LOCAL_MEM_FENCE); } @@ -581,7 +501,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel( { if(lid < 8) { - tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4); + tmp_local[lid] += tmp_local[lid + 8]; } barrier(CLK_LOCAL_MEM_FENCE); } @@ -589,7 +509,7 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel( { if(lid < 4) { - tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4); + tmp_local[lid] += tmp_local[lid + 4]; } barrier(CLK_LOCAL_MEM_FENCE); } @@ -597,16 +517,17 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel( { if(lid < 2) { - tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4); + tmp_local[lid] += tmp_local[lid + 2]; } barrier(CLK_LOCAL_MEM_FENCE); } if(lid == 0) { - sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4); - // Perform max reduction - sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2); - sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1); - *((__global DATA_TYPE *)sum.ptr) = sum1D.s0; + sum1D = (tmp_local[lid + 1] + tmp_local[lid]); + // Perform sum reduction + *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE); } } + +#endif // defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL) +#endif // defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/softmax_layer_quantized.cl b/src/core/CL/cl_kernels/softmax_layer_quantized.cl index 22b8df8f74..b7a6e00dfa 100644 --- a/src/core/CL/cl_kernels/softmax_layer_quantized.cl +++ b/src/core/CL/cl_kernels/softmax_layer_quantized.cl @@ -23,67 +23,107 @@ */ #include "helpers_asymm.h" -#define MAX_OP(x, y, type, size) max((x), (y)) -#define ADD_OP(x, y, type, size) ((x) + (y)) -#define SUB_OP(x, y, type, size) ((x) - (y)) +#if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) && defined(DIFF_MIN) -/* Number of workitems in dimension 0. 
*/ -#if !defined(GRID_SIZE) -#define GRID_SIZE 1 -#endif /* !defined(GRID_SIZE) */ - -#if VECTOR_SIZE == 2 -__constant uint2 idx__ = (uint2)(0, 1); -#define asymm_mult(a, b) ASYMM_MULT(a, b, 2) -#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 2) -#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 2) - -#elif VECTOR_SIZE == 4 -__constant uint4 idx__ = (uint4)(0, 1, 2, 3); -#define asymm_mult(a, b) ASYMM_MULT(a, b, 4) -#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 4) -#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 4) - -#elif VECTOR_SIZE == 8 -__constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7); -#define asymm_mult(a, b) ASYMM_MULT(a, b, 8) -#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 8) -#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 8) - -#else /* VECTOR_SIZE DEFAULT */ -#define VECTOR_SIZE 16 -#define LOG_VECTOR_SIZE 4 -__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); -#define asymm_mult(a, b) ASYMM_MULT(a, b, 16) -#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 16) -#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 16) - -#endif /* VECTOR_SIZE END */ - -#define VEC_UCHAR VEC_DATA_TYPE(uchar, VECTOR_SIZE) -#define VEC_UINT VEC_DATA_TYPE(uint, VECTOR_SIZE) -#define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE) #define VEC_BASE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) +#define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE) -#if defined(DIFF_MIN) - -VEC_INT mult_by_quantized_multiplier_serial(VEC_INT data) +/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel. + * + * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar + * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128 + * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE + * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0) + * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS. + * @note -DDIFF_MIN must be passed at compile time. It is threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not. + * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed. + * + * @param[in] src_ptr Pointer to the source tensor slice. 
Supported data types: S32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr + * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes) + * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes) + * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) + * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor + * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void softmax_layer_norm_quantized( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(sum), + TENSOR3D_DECLARATION(dst)) { + const int x_offs = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0); + + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(int) + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; + + Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum); + + // Load max value of 1D logits vector (row) + int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1))); + + // It will be better to calculate this in prev layer and pass here as parameter + uint sum_val_u = convert_uint(sum_val); + int headroom_plus_one = clz(sum_val_u); + int num_bits_over_unit = EXP_ACCUMULATION_INT_BITS - headroom_plus_one; + int shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31)); + VEC_INT shifted_sum_minus_one = shifted_sum_minus_one_1; + VEC_INT shifted_scale = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(shifted_sum_minus_one, VECTOR_SIZE); + + // It was already calculated in prev layer, 
should be stored into tmp output and reused + VEC_INT data_diff = VLOAD(VECTOR_SIZE)(0, (__global int *)src_addr); + VEC_INT data_diff_mult = data_diff; #if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) if(INPUT_BETA_MULTIPLIER > 1) { - return asymm_mult(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER); + data_diff_mult = ASYMM_MULT(data_diff * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, VECTOR_SIZE); } #endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */ - return data; + + VEC_INT data = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE); + data = ASYMM_MULT(shifted_scale, data, VECTOR_SIZE); + data = ASYMM_ROUNDING_DIVIDE_BY_POW2(data, num_bits_over_unit + 31 - 8, VECTOR_SIZE); +#ifdef QASYMM8_SIGNED + data += (VEC_INT)(MIN_VALUE); +#endif /* QASYMM8_SIGNED */ + data = select(MIN_VALUE, data, data_diff >= (VEC_INT)(DIFF_MIN)); + VEC_BASE data0 = CONVERT_SAT(data, VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)); + + STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) } -int4 mult_by_quantized_multiplier_parallel(int4 data) +#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) + +/* Number of workitems in dimension 0. */ +#if !defined(GRID_SIZE) +#define GRID_SIZE 1 +#endif /* !defined(GRID_SIZE) */ + +#define VEC_UINT VEC_DATA_TYPE(uint, VECTOR_SIZE) + +VEC_INT mult_by_quantized_multiplier(VEC_INT data) { #if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) if(INPUT_BETA_MULTIPLIER > 1) { - return ASYMM_MULT(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, 4); + return ASYMM_MULT(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, VECTOR_SIZE); } #endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */ return data; @@ -92,9 +132,15 @@ int4 mult_by_quantized_multiplier_parallel(int4 data) /** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel, * then gets the exponent of each element as sums all elements across each row. * - * @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed. + * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar + * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128 + * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE + * @note In case the input is not multiple of VECTOR_SIZE -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed. * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0) + * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS. * @note -DDIFF_MIN must be passed at compile time. It is threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not. + * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed. * * @param[in] src_ptr Pointer to the source tensor slice. 
Supported data types: QASYMM8/QASYMM8_SIGNED * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -128,111 +174,89 @@ int4 mult_by_quantized_multiplier_parallel(int4 data) * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor - * @param[in] width Input image width */ __kernel void softmax_layer_max_shift_exp_sum_quantized_serial( TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(maxo), TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(sum), - uint width) + TENSOR3D_DECLARATION(sum)) { - Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; + Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo); Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum); VEC_BASE max_val_vec = (VEC_BASE)(MIN_VALUE); // Calculate max of row - const uint width4 = width >> LOG_VECTOR_SIZE; - for(uint i = 0; i < width4; i++) - { - VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0)); - max_val_vec = MAX_OP(data, max_val_vec, DATA_TYPE, 16); - } - #ifdef NON_MULTIPLE_OF_VECTOR_SIZE - // Handle non multiple of 16 VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE); - VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width4 << LOG_VECTOR_SIZE, 0)); - VEC_UCHAR widx = CONVERT(((VEC_UINT)(width4 << LOG_VECTOR_SIZE) + idx__) < width, VEC_UCHAR); - max_val_vec = MAX_OP(max_val_vec, select(vec_min_val, data, widx), DATA_TYPE, 16); + VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr); + VEC_INT widx = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE); + max_val_vec = max(max_val_vec, select(vec_min_val, data, CONVERT(widx, VEC_BASE))); #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ + for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE) + { + VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE))); + max_val_vec = max(data, max_val_vec); + } + // Perform max reduction -#if VECTOR_SIZE == 16 - max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, DATA_TYPE, 8); -#endif /* VECTOR SIZE 16 END */ -#if VECTOR_SIZE >= 8 - max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, DATA_TYPE, 4); -#endif /* VECTOR SIZE 8 END */ -#if VECTOR_SIZE >= 4 - max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2); -#endif /* VECTOR SIZE 4 END */ - max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1); - - // Store result - *((__global DATA_TYPE *)maxo.ptr) = max_val_vec.s0; + DATA_TYPE max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE); + *((__global DATA_TYPE *)maxo.ptr) = max_local; // Second part // Load max value of 1D logits vector (row) - int max_val = convert_int(*((__global DATA_TYPE *)offset(&maxo, 0, 0))); + int max_val = convert_int(max_local); // Set sum vector, Q(EXP_ACCUMULATION_INT_BITS) VEC_INT sum1D = 0; +#ifdef NON_MULTIPLE_OF_VECTOR_SIZE + VEC_INT data_fp = CONVERT(data, VEC_INT); + VEC_INT data_diff = data_fp - max_val; + VEC_INT data_diff_mult = 
mult_by_quantized_multiplier(data_diff); + data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE); + data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE); + VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER) + (data_diff, 0, (__global int *)dst_addr); + data_fp = select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN)); + sum1D += select(0, data_fp, widx); +#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ + // Shift values, exp and sum - for(uint i = 0; i < width4; i++) + for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE) { - VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0)); + VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE))); VEC_INT data_fp = CONVERT(data, VEC_INT); VEC_INT data_diff = data_fp - max_val; - VEC_INT data_diff_mult = mult_by_quantized_multiplier_serial(data_diff); - data_fp = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS); - data_fp = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS); + VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff); + data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE); + data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE); VSTORE(VECTOR_SIZE) - (data_diff, 0, (__global int *)offset(&dst, i << LOG_VECTOR_SIZE, 0)); + (data_diff, 0, (__global int *)(dst_addr + i * sizeof(int))); sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN)); } -#ifdef NON_MULTIPLE_OF_VECTOR_SIZE - // Handle non multiple of 16 - data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width4 << LOG_VECTOR_SIZE, 0)); - VEC_INT data_fp = CONVERT(data, VEC_INT); - VEC_INT data_diff = data_fp - max_val; - VEC_INT data_diff_mult = mult_by_quantized_multiplier_serial(data_diff); - data_fp = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS); - data_fp = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS); - VEC_INT widx_ = CONVERT(((VEC_UINT)(width4 << LOG_VECTOR_SIZE) + idx__) < width, VEC_INT); - VSTORE(VECTOR_SIZE) - (data_diff, 0, (__global int *)offset(&dst, width4 << LOG_VECTOR_SIZE, 0)); - data_fp = select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN)); - sum1D = sum1D + select(0, data_fp, widx_); -#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ - // Perform sum reduction -#if VECTOR_SIZE == 16 - sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8); -#endif /* VECTOR SIZE 16 END */ -#if VECTOR_SIZE >= 8 - sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4); -#endif /* VECTOR SIZE 8 END */ -#if VECTOR_SIZE >= 4 - sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2); -#endif /* VECTOR SIZE 4 END */ - sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1); - - // Calculate and store result - *((__global int *)sum.ptr) = sum1D.s0; + *((__global int *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE); } /** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value, * then gets the exponent of each element as sums all elements across each row. * - * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar + * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. 
-DMIN_VALUE=-128 + * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed. + * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0) + * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS. + * @note -DDIFF_MIN must be passed at compile time. It is threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not. + * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed. * * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -266,72 +290,59 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_serial( * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor - * @param[in] width Input image width */ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel( TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(maxo), TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(sum), - uint width) + TENSOR3D_DECLARATION(sum)) { - Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); + const uint lid = get_local_id(0); + const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE); + + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(int) + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; + Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo); Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum); - const uint4 idx4 = (uint4)(0, 1, 2, 3); - const uint lid = get_local_id(0); - // Define one temporary vector per work-item. - __local int4 tmp_local[GRID_SIZE]; + __local VEC_INT tmp_local[GRID_SIZE]; __local DATA_TYPE max_local; - VEC_DATA_TYPE(DATA_TYPE, 4) - vec_min_val = (VEC_DATA_TYPE(DATA_TYPE, 4))(MIN_VALUE); - VEC_DATA_TYPE(DATA_TYPE, 4) - max_val_vec = vec_min_val; + VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE); + VEC_BASE max_val_vec = vec_min_val; - // Number of elements per work-item. - const uint row = width / GRID_SIZE; // Number of iterations per work-item. 
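A concrete instance of this trip count, under assumed compile-time values:

    // e.g. SRC_WIDTH = 1024, GRID_SIZE = 4, VECTOR_SIZE = 16 (LOG_VECTOR_SIZE = 4):
    // width = (1024 / 4) >> 4 = 16 full-vector iterations per work-item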
- const uint width_ = row >> 2; + const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE; // Calculate max of row uint i = 0; - for(; i < width_; i++) + for(; i < width; ++i) { - VEC_DATA_TYPE(DATA_TYPE, 4) - data_max = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0)); - max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4); + VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); + max_val_vec = max(data_max, max_val_vec); } #ifdef NON_MULTIPLE_OF_GRID_SIZE // How many work-items needed to complete the computation. //TODO: Optimize this calculation (avoid %). - int boundary_workitems = (width % (GRID_SIZE * 4)) / 4; + int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE; if(lid < boundary_workitems) { - VEC_DATA_TYPE(DATA_TYPE, 4) - data_max = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0)); - max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4); + VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); + max_val_vec = max(data_max, max_val_vec); } #ifdef NON_MULTIPLE_OF_VECTOR_SIZE - if(boundary_workitems == 0) - { - boundary_workitems = GRID_SIZE; - i--; - } - if(lid == (boundary_workitems - 1)) + VEC_INT widx; + if(lid == 0) { // Handle non multiple of 4 - VEC_DATA_TYPE(DATA_TYPE, 4) - data_max = vload4(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0)); - VEC_DATA_TYPE(DATA_TYPE, 4) - widx = CONVERT((((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width), VEC_DATA_TYPE(DATA_TYPE, 4)); - max_val_vec = MAX_OP(max_val_vec, select(vec_min_val, data_max, widx), DATA_TYPE, 4); + VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE))); + widx = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE); + max_val_vec = max(max_val_vec, select(vec_min_val, data_max, CONVERT(widx, VEC_BASE))); } #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ #endif /* NON_MULTIPLE_OF_GRID_SIZE */ - tmp_local[lid] = convert_int4(max_val_vec); + tmp_local[lid] = CONVERT(max_val_vec, VEC_INT); barrier(CLK_LOCAL_MEM_FENCE); @@ -339,7 +350,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel( { if(lid < 128) { - tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], int, 4); + tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]); } barrier(CLK_LOCAL_MEM_FENCE); } @@ -347,7 +358,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel( { if(lid < 64) { - tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], int, 4); + tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]); } barrier(CLK_LOCAL_MEM_FENCE); } @@ -355,7 +366,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel( { if(lid < 32) { - tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], int, 4); + tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]); } barrier(CLK_LOCAL_MEM_FENCE); } @@ -363,7 +374,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel( { if(lid < 16) { - tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], int, 4); + tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]); } barrier(CLK_LOCAL_MEM_FENCE); } @@ -371,7 +382,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel( { if(lid < 8) { - tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], int, 4); + tmp_local[lid] = max(tmp_local[lid + 8], 
tmp_local[lid]); } barrier(CLK_LOCAL_MEM_FENCE); } @@ -379,7 +390,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel( { if(lid < 4) { - tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], int, 4); + tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]); } barrier(CLK_LOCAL_MEM_FENCE); } @@ -387,72 +398,64 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel( { if(lid < 2) { - tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], int, 4); + tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]); } barrier(CLK_LOCAL_MEM_FENCE); } if(lid == 0) { - max_val_vec = MAX_OP(CONVERT((tmp_local[lid + 1]), VEC_DATA_TYPE(DATA_TYPE, 4)), CONVERT((tmp_local[lid]), VEC_DATA_TYPE(DATA_TYPE, 4)), DATA_TYPE, 4); - max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2); - max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1); - max_local = max_val_vec.s0; + max_val_vec = max(CONVERT((tmp_local[lid + 1]), VEC_BASE), CONVERT((tmp_local[lid]), VEC_BASE)); + max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE); } barrier(CLK_LOCAL_MEM_FENCE); /* Second section */ // Set sum vector - int4 sum1D = 0; - int max_val = convert_int(max_local); + VEC_INT sum1D = 0; + int max_val = convert_int(max_local); // Shift values, exp and sum - for(i = 0; i < width_; i++) + for(i = 0; i < width; ++i) { - VEC_DATA_TYPE(DATA_TYPE, 4) - data = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0)); - int4 data_fp = convert_int4(data); - int4 data_diff = data_fp - max_val; - int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff); - data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4); - data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4); - vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4, 0)); - sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN)); + VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); + VEC_INT data_fp = CONVERT(data, VEC_INT); + VEC_INT data_diff = data_fp - max_val; + VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff); + data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE); + data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE); + VSTORE(VECTOR_SIZE) + (data_diff, 0, (__global int *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(int))); + sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN)); } #ifdef NON_MULTIPLE_OF_GRID_SIZE //TODO: Optimize the calculation (avoid %). 
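The modulo note after the float kernel's identical TODO applies here too. The boundary handling below repeats the per-element pipeline of the main loop above; in float terms, what the fixed-point chain (mult_by_quantized_multiplier, ASYMM_EXP_ON_NEGATIVE_VALUES, ASYMM_RESCALE, then the DIFF_MIN select) approximates is roughly the following. exp_term, beta_scale and diff_min are hypothetical names for this sketch, not kernel symbols:

    #include <math.h>

    /* Float reference of one exponential term; the kernel computes a Q-format
     * equivalent so the accumulation stays in integer arithmetic end to end. */
    static float exp_term(int q_in, int q_max, float beta_scale, int diff_min)
    {
        const int diff = q_in - q_max;         /* always <= 0 after the max pass */
        if(diff < diff_min)                    /* DIFF_MIN drops negligible terms */
        {
            return 0.0f;
        }
        return expf((float)diff * beta_scale); /* beta_scale stands in for the
                                                  INPUT_BETA_MULTIPLIER/LEFT_SHIFT pair */
    }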
- boundary_workitems = (width % (GRID_SIZE * 4)) / 4; + boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE; if(lid < boundary_workitems) { - VEC_DATA_TYPE(DATA_TYPE, 4) - data = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0)); - int4 data_fp = convert_int4(data); - int4 data_diff = data_fp - max_val; - int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff); - data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4); - data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4); - vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4, 0)); - sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN)); + VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); + VEC_INT data_fp = CONVERT(data, VEC_INT); + VEC_INT data_diff = data_fp - max_val; + VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff); + data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE); + data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE); + VSTORE(VECTOR_SIZE) + (data_diff, 0, (__global int *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(int))); + sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN)); } #ifdef NON_MULTIPLE_OF_VECTOR_SIZE - if(boundary_workitems == 0) - { - boundary_workitems = GRID_SIZE; - i--; - } - if(lid == (boundary_workitems - 1)) + if(lid == 0) { // Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 float positions ahead, *4 is due to the stride - VEC_DATA_TYPE(DATA_TYPE, 4) - data = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4 + 4, 0)); - int4 data_fp = convert_int4(data); - int4 data_diff = data_fp - max_val; - int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff); - data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4); - data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4); - int4 widx = convert_int4(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width); - vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4 + 4, 0)); - data_fp = select(MIN_VALUE, data_fp, data_diff >= (int4)(DIFF_MIN)); + VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE))); + VEC_INT data_fp = CONVERT(data, VEC_INT); + VEC_INT data_diff = data_fp - max_val; + VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff); + data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE); + data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE); + VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER) + (data_diff, 0, (__global int *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(int))); + data_fp = select(MIN_VALUE, data_fp, data_diff >= (VEC_INT)(DIFF_MIN)); data_fp = select(0, data_fp, widx); sum1D = sum1D + data_fp; } @@ -466,7 +469,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel( { if(lid < 128) { - tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], int, 4); + tmp_local[lid] += tmp_local[lid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); } @@ -474,7 +477,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel( { if(lid < 64) { - tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], int, 4); + tmp_local[lid] += tmp_local[lid + 64]; } barrier(CLK_LOCAL_MEM_FENCE); } @@ -482,7 +485,7 @@ 
__kernel void softmax_layer_max_shift_exp_sum_quantized_parallel( { if(lid < 32) { - tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], int, 4); + tmp_local[lid] += tmp_local[lid + 32]; } barrier(CLK_LOCAL_MEM_FENCE); } @@ -490,7 +493,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel( { if(lid < 16) { - tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], int, 4); + tmp_local[lid] += tmp_local[lid + 16]; } barrier(CLK_LOCAL_MEM_FENCE); } @@ -498,7 +501,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel( { if(lid < 8) { - tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], int, 4); + tmp_local[lid] += tmp_local[lid + 8]; } barrier(CLK_LOCAL_MEM_FENCE); } @@ -506,7 +509,7 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel( { if(lid < 4) { - tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], int, 4); + tmp_local[lid] += tmp_local[lid + 4]; } barrier(CLK_LOCAL_MEM_FENCE); } @@ -514,88 +517,16 @@ __kernel void softmax_layer_max_shift_exp_sum_quantized_parallel( { if(lid < 2) { - tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], int, 4); + tmp_local[lid] += tmp_local[lid + 2]; } barrier(CLK_LOCAL_MEM_FENCE); } if(lid == 0) { - sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], int, 4); - // Perform max reduction - sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, int, 2); - sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, int, 1); - *((__global int *)sum.ptr) = sum1D.s0; + sum1D = (tmp_local[lid + 1] + tmp_local[lid]); + // Perform sum reduction + *((__global int *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE); } } - -/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel. - * - * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0) - * @note -DDIFF_MIN must be passed at compile time. It is threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not. - * - * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: S32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] sum_ptr Pointer to the sum values tensor slice. 
Supported data types: same as @p src_ptr - * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes) - * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes) - * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) - * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor - * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void softmax_layer_norm_quantized( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(sum), - TENSOR3D_DECLARATION(dst)) -{ - Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); - Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum); - - // Load max value of 1D logits vector (row) - int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1))); - - // It will be better to calculate this in prev layer and pass here as parameter - uint sum_val_u = convert_uint(sum_val); - int headroom_plus_one = clz(sum_val_u); - int num_bits_over_unit = EXP_ACCUMULATION_INT_BITS - headroom_plus_one; - int shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31)); - int16 shifted_sum_minus_one = shifted_sum_minus_one_1; - int16 shifted_scale = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(shifted_sum_minus_one, 16); - - // It was already calculated in prev layer, should be stored into tmp output and reused - int16 data_diff = vload16(0, (__global int *)offset(&src, 0, 0)); - int16 data_diff_mult = data_diff; -#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) - if(INPUT_BETA_MULTIPLIER > 1) - { - data_diff_mult = ASYMM_MULT(data_diff * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, 16); - } -#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */ - - int16 data = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 16); - data = ASYMM_MULT(shifted_scale, data, 16); - data = ASYMM_ROUNDING_DIVIDE_BY_POW2(data, num_bits_over_unit + 31 - 8, 16); -#ifdef QASYMM8_SIGNED - data = ADD_OP(data, (int16)(MIN_VALUE), int, 16); -#endif /* QASYMM8_SIGNED */ - data = select(MIN_VALUE, data, data_diff >= (int16)(DIFF_MIN)); - vstore16(CONVERT_SAT(data, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)offset(&dst, 0, 0)); -} - -#endif /* defined(DIFF_MIN) */ +#endif // #if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) +#endif /* defined(DATA_TYPE) && defined(DIFF_MIN) && defined(VECTOR_SIZE) && 
defined(VECTOR_SIZE_LEFTOVER) && defined(MIN_VALUE) */ diff --git a/src/core/CL/cl_kernels/space_to_batch.cl b/src/core/CL/cl_kernels/space_to_batch.cl index 5ade9c5a7c..cb11786ac4 100644 --- a/src/core/CL/cl_kernels/space_to_batch.cl +++ b/src/core/CL/cl_kernels/space_to_batch.cl @@ -46,8 +46,6 @@ * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32 * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes) * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] block_shape_stride_y Stride of the block shape tensor in Y dimension (in bytes) - * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shapetensor * @param[in] batch_id The output tensor batch id * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr @@ -117,8 +115,6 @@ __kernel void space_to_batch_nchw( * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32 * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes) * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] block_shape_stride_y Stride of the block shape tensor in Y dimension (in bytes) - * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shapetensor * @param[in] batch_id The output tensor batch id * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr diff --git a/src/core/CL/cl_kernels/winograd_input_transform.cl b/src/core/CL/cl_kernels/winograd_input_transform.cl index 48a4e0d399..5e5b737785 100644 --- a/src/core/CL/cl_kernels/winograd_input_transform.cl +++ b/src/core/CL/cl_kernels/winograd_input_transform.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,6 +23,50 @@ */ #include "helpers.h" +#define FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(datatype, basename, y_cond, z_cond) \ + ({ \ + basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s0) && (z_cond))); \ + basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s1) && (z_cond))); \ + basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s2) && (z_cond))); \ + basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s3) && (z_cond))); \ + basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype))(((y_cond##1).s0) && (z_cond))); \ + basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype))(((y_cond##1).s1) && (z_cond))); \ + }) + +#define FILL_ZERO_OUT_OF_BOUND_6_NHWC_V(datatype, basename, y_cond, z_cond) \ + ({ \ + basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s0))); \ + basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s1))); \ + basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s2))); \ + basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s3))); \ + basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##1).s0))); \ + basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##1).s1))); \ + }) + +#define FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(datatype, basename, y_cond, z_cond) \ + ({ \ + basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s0) && (z_cond))); \ + basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s1) && (z_cond))); \ + basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s2) && (z_cond))); \ + basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s3) && (z_cond))); \ + basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s4) && (z_cond))); \ + basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s5) && (z_cond))); \ + basename##6 = select((datatype)0, basename##6, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s6) && (z_cond))); \ + basename##7 = select((datatype)0, basename##7, (SELECT_DATA_TYPE(datatype))(((y_cond##0).s7) && (z_cond))); \ + }) + +#define FILL_ZERO_OUT_OF_BOUND_8_NHWC_V(datatype, basename, y_cond, z_cond) \ + ({ \ + basename##0 = select((datatype)0, basename##0, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s0))); \ + basename##1 = select((datatype)0, basename##1, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s1))); \ + basename##2 = select((datatype)0, basename##2, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s2))); \ + basename##3 = select((datatype)0, basename##3, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s3))); \ + basename##4 = select((datatype)0, basename##4, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s4))); \ + basename##5 = select((datatype)0, basename##5, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s5))); \ + basename##6 = select((datatype)0, basename##6, (SELECT_DATA_TYPE(datatype))((y_cond) && ((z_cond##0).s6))); \ + basename##7 = select((datatype)0, basename##7, (SELECT_DATA_TYPE(datatype))((y_cond) 
&& ((z_cond##0).s7))); \ + }) + #define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \ ({ \ comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6; \ @@ -945,51 +989,54 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc( uint src_stride_w, uint dst_stride_w) { + // Index channel const int x = get_global_id(0); + // Index width const int y = get_global_id(1); #if defined(NUM_TILES_Y) + // Index height const int z = get_global_id(2) % NUM_TILES_Y; + // Index batch size const int b = get_global_id(2) / NUM_TILES_Y; -#else /* defined(NUM_TILES_Y) */ - const int z = get_global_id(2); -#endif /* defined(NUM_TILES_Y) */ +#else // defined(NUM_TILES_Y) + // Index height + const int z = get_global_id(2); +#endif // defined(NUM_TILES_Y) #if defined(NUM_TILES_Y) __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w; -#else /* defined(NUM_TILES_Y) */ - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE); -#endif /* defined(NUM_TILES_Y) */ +#else // defined(NUM_TILES_Y) + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE); +#endif // defined(NUM_TILES_Y) - // Clamp coordinates. This clamp is valid for all rows + // Origin coordinates for the width (y) and height (z) in the input tensor int4 y_coord0 = (int4)(y * OUTPUT_TILE_W) + (int4)(0, 1, 2, 3) - (int4)PAD_LEFT; int2 y_coord1 = (int2)(y * OUTPUT_TILE_W) + (int2)(4, 5) - (int2)PAD_LEFT; - y_coord0 = clamp(y_coord0, (int4) - 1, (int4)SRC_DIM_1); - y_coord1 = clamp(y_coord1, (int2) - 1, (int2)SRC_DIM_1); + int4 z_coord0 = (int4)(z * OUTPUT_TILE_H) + (int4)(0, 1, 2, 3) - (int4)PAD_TOP; + int2 z_coord1 = (int2)(z * OUTPUT_TILE_H) + (int2)(4, 5) - (int2)PAD_TOP; - int z_coord; - int4 valid_y0; - int2 valid_y1; + // Coordinates to use to avoid out-of-bound reads + int4 y_coord_valid0 = clamp(y_coord0, (int4)0, (int4)((int)SRC_DIM_1 - 1)); + int2 y_coord_valid1 = clamp(y_coord1, (int2)0, (int2)((int)SRC_DIM_1 - 1)); + int4 z_coord_valid0 = clamp(z_coord0, (int4)0, (int4)((int)SRC_DIM_2 - 1)); + int2 z_coord_valid1 = clamp(z_coord1, (int2)0, (int2)((int)SRC_DIM_2 - 1)); -#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - // Row4 - z_coord = (z * 4) - (int)PAD_TOP + 4; + // Boundary conditions + int4 y_cond0 = y_coord_valid0 == y_coord0; + int2 y_cond1 = y_coord_valid1 == y_coord1; + int4 z_cond0 = z_coord_valid0 == z_coord0; + int2 z_cond1 = z_coord_valid1 == z_coord1; - // If z < 0, set y to -1 - valid_y0 = select(y_coord0, (int4) - 1, (int4)z_coord < 0); - valid_y1 = select(y_coord1, (int2) - 1, (int2)z_coord < 0); - // If z >= SRC_DIM_2, set y to SRC_DIM_2 - valid_y0 = select(valid_y0, (int4)SRC_DIM_1, (int4)z_coord >= (int)SRC_DIM_2); - valid_y1 = select(valid_y1, (int2)SRC_DIM_1, (int2)z_coord >= (int)SRC_DIM_2); +#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - // Clamp z coordinate - z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); + DATA_TYPE d40 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z); + DATA_TYPE d41 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z); + DATA_TYPE d42 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z); + DATA_TYPE d43 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + 
z_coord_valid1.s0 * src_stride_z); + DATA_TYPE d44 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z); + DATA_TYPE d45 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z); - DATA_TYPE d40 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d41 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d42 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d43 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d44 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d45 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z); + FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d4, y_cond, z_cond1.s0); DATA_TYPE k0 = d44; DATA_TYPE k1 = d44; @@ -1007,44 +1054,24 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc( #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) #if !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - // Row0 - z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 0; - -#if PAD_TOP != 0 - valid_y0 = select(y_coord0, (int4) - 1, (int4)z_coord < 0); - valid_y1 = select(y_coord1, (int2) - 1, (int2)z_coord < 0); - valid_y0 = select(valid_y0, (int)SRC_DIM_1, (int4)z_coord >= (int)SRC_DIM_2); - valid_y1 = select(valid_y1, (int)SRC_DIM_1, (int2)z_coord >= (int)SRC_DIM_2); - z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); -#else // PAD_TOP != 0 - valid_y0 = y_coord0; - valid_y1 = y_coord1; -#endif // if PAD_TOP == 0, we cannot read out of bound - - DATA_TYPE d00 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d01 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d02 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d03 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d04 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d05 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z); + DATA_TYPE d00 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + DATA_TYPE d01 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + DATA_TYPE d02 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + DATA_TYPE d03 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + DATA_TYPE d04 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + DATA_TYPE d05 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + + FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d0, y_cond, z_cond0.s0); + #else // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - int4 z_coords0 = (int4)(z * OUTPUT_TILE_H) + (int4)(0, 1, 2, 3) - (int4)PAD_TOP; - int2 z_coords1 = (int2)(z * OUTPUT_TILE_H) + 
(int2)(4, 5) - (int2)PAD_TOP; - - valid_y0 = select((int4)y_coord0.s0, (int4) - 1, z_coords0 < (int4)0); - valid_y1 = select((int2)y_coord0.s0, (int2) - 1, z_coords1 < (int2)0); - valid_y0 = select(valid_y0, (int4)SRC_DIM_1, z_coords0 >= (int4)SRC_DIM_2); - valid_y1 = select(valid_y1, (int2)SRC_DIM_1, z_coords1 >= (int2)SRC_DIM_2); - - z_coords0 = clamp((int4)z_coords0, (int4)0, (int4)((int)SRC_DIM_2 - 1)); - z_coords1 = clamp((int2)z_coords1, (int2)0, (int2)((int)SRC_DIM_2 - 1)); - - DATA_TYPE d00 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coords0.s0 * src_stride_z); - DATA_TYPE d01 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coords0.s1 * src_stride_z); - DATA_TYPE d02 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coords0.s2 * src_stride_z); - DATA_TYPE d03 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coords0.s3 * src_stride_z); - DATA_TYPE d04 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coords1.s0 * src_stride_z); - DATA_TYPE d05 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coords1.s1 * src_stride_z); + DATA_TYPE d00 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + DATA_TYPE d01 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + DATA_TYPE d02 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z); + DATA_TYPE d03 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + DATA_TYPE d04 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid1.s0 * src_stride_z); + DATA_TYPE d05 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z); + + FILL_ZERO_OUT_OF_BOUND_6_NHWC_V(DATA_TYPE, d0, y_cond0.s0, z_cond); #endif // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) DATA_TYPE out0 = 16.0f * d00 - 20.0f * d02 + 4.0f * d04; @@ -1055,20 +1082,14 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc( DATA_TYPE out5 = 16.0f * d01 - 20.0f * d03 + 4.0f * d05; #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - // Row2 - z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 2; - valid_y0 = select(y_coord0, (int4) - 1, (int4)z_coord < 0); - valid_y1 = select(y_coord1, (int2) - 1, (int2)z_coord < 0); - valid_y0 = select(valid_y0, (int4)SRC_DIM_1, (int4)z_coord >= (int)SRC_DIM_2); - valid_y1 = select(valid_y1, (int2)SRC_DIM_1, (int2)z_coord >= (int)SRC_DIM_2); - z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); - - DATA_TYPE d20 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d21 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d22 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d23 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d24 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d25 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z); + DATA_TYPE d20 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + 
z_coord_valid0.s2 * src_stride_z); + DATA_TYPE d21 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z); + DATA_TYPE d22 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z); + DATA_TYPE d23 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z); + DATA_TYPE d24 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z); + DATA_TYPE d25 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z); + + FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d2, y_cond, z_cond0.s2); out0 += k0; out1 += k1; @@ -1113,9 +1134,9 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc( // Compute destination address #if defined(NUM_TILES_Y) __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w); -#else /* defined(NUM_TILES_Y) */ +#else // defined(NUM_TILES_Y) __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y); -#endif /* defined(NUM_TILES_Y) */ +#endif // defined(NUM_TILES_Y) uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE); @@ -1133,34 +1154,22 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc( dst_addr += dst_plane_stride; #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - // Row1 - z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 1; - // Row1 can never be out of bounds - valid_y0 = y_coord0; - valid_y1 = y_coord1; - - DATA_TYPE d10 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d11 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d12 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d13 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d14 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d15 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z); - - // Row3 - z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 3; - valid_y0 = select(y_coord0, (int4) - 1, (int4)z_coord < 0); - valid_y1 = select(y_coord1, (int2) - 1, (int2)z_coord < 0); - valid_y0 = select(valid_y0, (int4)SRC_DIM_1, (int4)z_coord >= (int)SRC_DIM_2); - valid_y1 = select(valid_y1, (int2)SRC_DIM_1, (int2)z_coord >= (int)SRC_DIM_2); - z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); - z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); - - DATA_TYPE d30 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d31 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d32 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d33 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d34 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d35 = *(__global DATA_TYPE 
*)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z); + DATA_TYPE d10 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + DATA_TYPE d11 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + DATA_TYPE d12 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + DATA_TYPE d13 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + DATA_TYPE d14 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + DATA_TYPE d15 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + + DATA_TYPE d30 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + DATA_TYPE d31 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + DATA_TYPE d32 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + DATA_TYPE d33 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + DATA_TYPE d34 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + DATA_TYPE d35 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + + FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d1, y_cond, z_cond0.s1); + FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d3, y_cond, z_cond0.s3); // Compute common parts for the channels between [6, 29] // Channels [6, 11]: [out10, out11, out12, out13, out14, out15] @@ -1270,20 +1279,14 @@ __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc( dst_addr += dst_plane_stride; // Row5 - z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 5; - valid_y0 = select(y_coord0, (int4) - 1, (int4)z_coord < 0); - valid_y1 = select(y_coord1, (int2) - 1, (int2)z_coord < 0); - valid_y0 = select(valid_y0, (int4)SRC_DIM_1, (int4)z_coord >= (int)SRC_DIM_2); - valid_y1 = select(valid_y1, (int2)SRC_DIM_1, (int2)z_coord >= (int)SRC_DIM_2); - z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); - z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); - - DATA_TYPE d50 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d51 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d52 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d53 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d54 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z); - DATA_TYPE d55 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z); + DATA_TYPE d50 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z); + DATA_TYPE d51 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z); + DATA_TYPE d52 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z); + DATA_TYPE d53 = *(__global DATA_TYPE *)(src_addr + 
y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z); + DATA_TYPE d54 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s0 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z); + DATA_TYPE d55 = *(__global DATA_TYPE *)(src_addr + y_coord_valid1.s1 * (int)src_stride_y + z_coord_valid1.s1 * src_stride_z); + + FILL_ZERO_OUT_OF_BOUND_6_NHWC_H(DATA_TYPE, d5, y_cond, z_cond1.s1); // Channels [30, 35] out0 = 16.0f * d10 - 20.0f * d12 - 20.0f * d30 + 25.0f * d32 + 4.0f * d50 - 5.0f * d52 + d54 + 4.0f * d14 - 5.0f * d34; @@ -1350,37 +1353,44 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc( #if defined(NUM_TILES_Y) const int z = get_global_id(2) % NUM_TILES_Y; const int b = get_global_id(2) / NUM_TILES_Y; -#else /* defined(NUM_TILES_Y) */ +#else // defined(NUM_TILES_Y) const int z = get_global_id(2); -#endif /* defined(NUM_TILES_Y) */ +#endif // defined(NUM_TILES_Y) // Compute input address #if defined(NUM_TILES_Y) __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w; -#else /* defined(NUM_TILES_Y) */ +#else // defined(NUM_TILES_Y) __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE); -#endif /* defined(NUM_TILES_Y) */ +#endif // defined(NUM_TILES_Y) -#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) - // Clamp coordinates. This clamp is valid for all rows - int8 y_coord = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT; - y_coord = clamp(y_coord, (int8) - 1, (int8)SRC_DIM_1); + // Origin coordinates for the width (y) and height (z) in the input tensor + int8 y_coord0 = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT; + int8 z_coord0 = (int8)(z * OUTPUT_TILE_H) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_TOP; - // Row0 - // We can skip the border clamping along the z dimension as we cannot read out-of-bound in case of 5x1 kernels - int z_coord = z * OUTPUT_TILE_H; + // Coordinates to use to avoid out-of-bound reads + int8 y_coord_valid0 = clamp(y_coord0, (int8)0, (int8)((int)SRC_DIM_1 - 1)); + int8 z_coord_valid0 = clamp(z_coord0, (int8)0, (int8)((int)SRC_DIM_2 - 1)); + + // Boundary conditions + int8 y_cond0 = y_coord_valid0 == y_coord0; + int8 z_cond0 = z_coord_valid0 == z_coord0; + +#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) // Load the input tile VEC_DATA_TYPE(DATA_TYPE, 8) in_row0; - in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord.s0 * (int)src_stride_y + z_coord * src_stride_z); - in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord.s1 * (int)src_stride_y + z_coord * src_stride_z); - in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord.s2 * (int)src_stride_y + z_coord * src_stride_z); - in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord.s3 * (int)src_stride_y + z_coord * src_stride_z); - in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord.s4 * (int)src_stride_y + z_coord * src_stride_z); - in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord.s5 * (int)src_stride_y + z_coord * src_stride_z); - in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord.s6 * (int)src_stride_y + z_coord * src_stride_z); - in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord.s7 * (int)src_stride_y + z_coord * src_stride_z); + in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s2 = 
*(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + + FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row0.s, y_cond, z_cond0.s0); // Calculate common factors for intermediate tensor VEC_DATA_TYPE(DATA_TYPE, 8) @@ -1394,27 +1404,20 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc( OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0); #elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) - // We can skip the border clamping along the y dimension as we cannot read out-of-bound in case of 1x5 kernels - int y_coord = y * (int)OUTPUT_TILE_W; - - // Row0 - // We can skip the border clamping along the z dimension as we cannot read out-of-bound in case of 5x1 kernels - int8 z_coord = (int8)(z * OUTPUT_TILE_H) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_TOP; - int8 valid_y = select((int8)y_coord, (int8) - 1, z_coord < (int8)0); // If z < 0, set y to -1 - valid_y = select(valid_y, (int8)SRC_DIM_1, z_coord >= (int8)SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_2 - z_coord = clamp(z_coord, (int8)0, (int8)SRC_DIM_2 - 1); // Clamp z coordinate // Load the input tile VEC_DATA_TYPE(DATA_TYPE, 8) in_row0; - in_row0.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord.s0 * src_stride_z); - in_row0.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord.s1 * src_stride_z); - in_row0.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord.s2 * src_stride_z); - in_row0.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord.s3 * src_stride_z); - in_row0.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord.s4 * src_stride_z); - in_row0.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord.s5 * src_stride_z); - in_row0.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord.s6 * src_stride_z); - in_row0.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord.s7 * src_stride_z); + in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z); + in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z); + in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z); + in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * 
(int)src_stride_y + z_coord_valid0.s6 * src_stride_z); + in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z); + + FILL_ZERO_OUT_OF_BOUND_8_NHWC_V(DATA_TYPE, in_row0.s, y_cond0.s0, z_cond); // Calculate common factors for intermediate tensor VEC_DATA_TYPE(DATA_TYPE, 8) @@ -1430,130 +1433,101 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc( VEC_DATA_TYPE(DATA_TYPE, 8) in_row0, in_row1, in_row2, in_row3, in_row4, in_row5, in_row6, in_row7; - // Clamp coordinates. This clamp is valid for all rows - int8 y_coord = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT; - y_coord = clamp(y_coord, (int8) - 1, (int8)SRC_DIM_1); - // Row0 - int z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 0; - int8 valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0); // If z < 0, set y to -1 - valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_2 - z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); // Clamp z coordinate + in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); - // Load the input tile - in_row0.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z); - in_row0.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z); - in_row0.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z); - in_row0.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z); - in_row0.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z); - in_row0.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z); - in_row0.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z); - in_row0.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z); + FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row0.s, y_cond, z_cond0.s0); // Row1 - z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 1; - valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0); - valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); - z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); - - in_row1.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z); - in_row1.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z); - in_row1.s2 = *(__global 
DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z); - in_row1.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z); - in_row1.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z); - in_row1.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z); - in_row1.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z); - in_row1.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z); + in_row1.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + in_row1.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + in_row1.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + in_row1.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + in_row1.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + in_row1.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + in_row1.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + in_row1.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + + FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row1.s, y_cond, z_cond0.s1); // Row2 - z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 2; - valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0); - valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); - z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); - - in_row2.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z); - in_row2.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z); - in_row2.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z); - in_row2.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z); - in_row2.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z); - in_row2.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z); - in_row2.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z); - in_row2.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z); + in_row2.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z); + in_row2.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z); + in_row2.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z); + in_row2.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z); + in_row2.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z); + in_row2.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s2 
* src_stride_z); + in_row2.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z); + in_row2.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z); + + FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row2.s, y_cond, z_cond0.s2); // Row3 - z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 3; - valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0); - valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); - z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); - - in_row3.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z); - in_row3.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z); - in_row3.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z); - in_row3.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z); - in_row3.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z); - in_row3.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z); - in_row3.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z); - in_row3.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z); + in_row3.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + in_row3.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + in_row3.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + in_row3.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + in_row3.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + in_row3.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + in_row3.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + in_row3.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + + FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row3.s, y_cond, z_cond0.s3); // Row4 - z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 4; - valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0); - valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); - z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); - - in_row4.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z); - in_row4.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z); - in_row4.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z); - in_row4.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z); - in_row4.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z); - in_row4.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z); - in_row4.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * 
src_stride_z); - in_row4.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z); + in_row4.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z); + in_row4.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z); + in_row4.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z); + in_row4.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z); + in_row4.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z); + in_row4.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z); + in_row4.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z); + in_row4.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z); + + FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row4.s, y_cond, z_cond0.s4); // Row5 - z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 5; - valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0); - valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); - z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); - - in_row5.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z); - in_row5.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z); - in_row5.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z); - in_row5.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z); - in_row5.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z); - in_row5.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z); - in_row5.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z); - in_row5.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z); + in_row5.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z); + in_row5.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z); + in_row5.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z); + in_row5.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z); + in_row5.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z); + in_row5.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z); + in_row5.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z); + in_row5.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z); + + FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row5.s, y_cond, z_cond0.s5); // Row6 - z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 6; - valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0); - valid_y = select(valid_y, 
(int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); - z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); - - in_row6.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z); - in_row6.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z); - in_row6.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z); - in_row6.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z); - in_row6.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z); - in_row6.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z); - in_row6.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z); - in_row6.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z); + in_row6.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z); + in_row6.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z); + in_row6.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z); + in_row6.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z); + in_row6.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z); + in_row6.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z); + in_row6.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z); + in_row6.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z); + + FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row6.s, y_cond, z_cond0.s6); // Row7 - z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 7; - valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0); - valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); - z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); - - in_row7.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z); - in_row7.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z); - in_row7.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z); - in_row7.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z); - in_row7.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z); - in_row7.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z); - in_row7.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z); - in_row7.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z); + in_row7.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z); + in_row7.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z); + in_row7.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z); + 
in_row7.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z); + in_row7.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z); + in_row7.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z); + in_row7.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z); + in_row7.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z); + + FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row7.s, y_cond, z_cond0.s7); VEC_DATA_TYPE(DATA_TYPE, 8) comm_fact0 = in_row2 + in_row6 - (DATA_TYPE)4.25f * in_row4; @@ -1722,29 +1696,33 @@ __kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc( __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE); #endif /* defined(NUM_TILES_Y) */ -#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + // Origin coordinates for the width (y) and height (z) in the input tensor + int8 y_coord0 = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT; + int8 z_coord0 = (int8)(z * OUTPUT_TILE_H) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_TOP; + + // Coordinates to use to avoid out-of-bound reads + int8 y_coord_valid0 = clamp(y_coord0, (int8)0, (int8)((int)SRC_DIM_1 - 1)); + int8 z_coord_valid0 = clamp(z_coord0, (int8)0, (int8)((int)SRC_DIM_2 - 1)); - // Clamp coordinates. This clamp is valid for all rows - int8 y_coord = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT; - y_coord = clamp(y_coord, (int8) - 1, (int8)SRC_DIM_1); + // Boundary conditions + int8 y_cond0 = y_coord_valid0 == y_coord0; + int8 z_cond0 = z_coord_valid0 == z_coord0; - // Clamp coordinates. 
This clamp is valid for all columns - int z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 0; - int8 valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0); // If z < 0, set y to -1 - valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_2 - z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); +#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) // Load the input tile VEC_DATA_TYPE(DATA_TYPE, 8) in_row0; - in_row0.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row0.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row0.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row0.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row0.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row0.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row0.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row0.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z); + in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + + FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row0.s, y_cond, z_cond0.s0); VEC_DATA_TYPE(DATA_TYPE, 8) out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f; @@ -1758,27 +1736,19 @@ __kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc( OUTPUT_ROW_2x2_7x7(out0, tmp0, comm_fact0); #elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) - // We can skip the border clamping along the y dimension as we cannot read out-of-bound in case of 1x5 kernels - int y_coord = y * (int)OUTPUT_TILE_W; - - // Row0 - // We can skip the border clamping along the z dimension as we cannot read out-of-bound in case of 5x1 kernels - int8 z_coord = (int8)(z * OUTPUT_TILE_H) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_TOP; - int8 valid_y = select((int8)y_coord, (int8) - 1, z_coord < (int8)0); // If z < 0, set y to -1 - valid_y = select(valid_y, (int8)SRC_DIM_1, z_coord >= (int8)SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_2 - z_coord = clamp(z_coord, (int8)0, (int8)SRC_DIM_2 - 1); // Clamp z coordinate - // Load the input tile VEC_DATA_TYPE(DATA_TYPE, 8) in_row0; - in_row0.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + 
z_coord.s0 * (int)src_stride_z); - in_row0.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord.s1 * (int)src_stride_z); - in_row0.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord.s2 * (int)src_stride_z); - in_row0.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord.s3 * (int)src_stride_z); - in_row0.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord.s4 * (int)src_stride_z); - in_row0.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord.s5 * (int)src_stride_z); - in_row0.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord.s6 * (int)src_stride_z); - in_row0.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord.s7 * (int)src_stride_z); + in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z); + in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z); + in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z); + in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z); + in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z); + + FILL_ZERO_OUT_OF_BOUND_8_NHWC_V(DATA_TYPE, in_row0.s, y_cond0.s0, z_cond); // Calculate common factors for intermediate tensor VEC_DATA_TYPE(DATA_TYPE, 8) @@ -1795,130 +1765,101 @@ __kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc( VEC_DATA_TYPE(DATA_TYPE, 8) in_row0, in_row1, in_row2, in_row3, in_row4, in_row5, in_row6, in_row7; - // Clamp coordinates. 
This clamp is valid for all rows - int8 y_coord = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT; - y_coord = clamp(y_coord, (int8) - 1, (int8)SRC_DIM_1); - // Row0 - int z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 0; - int8 valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0); // If z < 0, set y to -1 - valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_2 - z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); // Clamp z coordinate + in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); + in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s0 * src_stride_z); - // Load the input tile - in_row0.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row0.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row0.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row0.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row0.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row0.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row0.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row0.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z); + FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row0.s, y_cond, z_cond0.s0); // Row1 - z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 1; - valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0); - valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); - z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); - - in_row1.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row1.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row1.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row1.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row1.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row1.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row1.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * 
(int)src_stride_y + z_coord * (int)src_stride_z); - in_row1.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z); + in_row1.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + in_row1.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + in_row1.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + in_row1.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + in_row1.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + in_row1.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + in_row1.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + in_row1.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s1 * src_stride_z); + + FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row1.s, y_cond, z_cond0.s1); // Row2 - z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 2; - valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0); - valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); - z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); - - in_row2.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row2.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row2.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row2.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row2.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row2.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row2.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row2.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z); + in_row2.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z); + in_row2.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z); + in_row2.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z); + in_row2.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z); + in_row2.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z); + in_row2.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z); + in_row2.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z); + in_row2.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s2 * src_stride_z); + + FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row2.s, y_cond, z_cond0.s2); // Row3 - z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 3; - 
valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0); - valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); - z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); - - in_row3.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row3.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row3.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row3.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row3.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row3.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row3.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row3.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z); + in_row3.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + in_row3.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + in_row3.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + in_row3.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + in_row3.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + in_row3.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + in_row3.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + in_row3.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s3 * src_stride_z); + + FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row3.s, y_cond, z_cond0.s3); // Row4 - z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 4; - valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0); - valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); - z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); - - in_row4.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row4.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row4.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row4.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row4.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row4.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row4.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row4.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z); + in_row4.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z); + in_row4.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + 
z_coord_valid0.s4 * src_stride_z); + in_row4.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z); + in_row4.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z); + in_row4.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z); + in_row4.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z); + in_row4.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z); + in_row4.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s4 * src_stride_z); + + FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row4.s, y_cond, z_cond0.s4); // Row5 - z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 5; - valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0); - valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); - z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); - - in_row5.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row5.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row5.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row5.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row5.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row5.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row5.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row5.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z); + in_row5.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z); + in_row5.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z); + in_row5.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z); + in_row5.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z); + in_row5.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z); + in_row5.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z); + in_row5.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z); + in_row5.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s5 * src_stride_z); + + FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row5.s, y_cond, z_cond0.s5); // Row6 - z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 6; - valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0); - valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); - z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); - - in_row6.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row6.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y 
+ z_coord * (int)src_stride_z); - in_row6.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row6.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row6.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row6.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row6.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row6.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z); + in_row6.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z); + in_row6.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z); + in_row6.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z); + in_row6.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z); + in_row6.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z); + in_row6.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z); + in_row6.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z); + in_row6.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s6 * src_stride_z); + + FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row6.s, y_cond, z_cond0.s6); // Row7 - z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 7; - valid_y = select(y_coord, (int8) - 1, (int8)z_coord < 0); - valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2); - z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1); - - in_row7.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row7.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row7.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row7.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row7.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row7.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row7.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * (int)src_stride_z); - in_row7.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * (int)src_stride_z); + in_row7.s0 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s0 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z); + in_row7.s1 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s1 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z); + in_row7.s2 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s2 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z); + in_row7.s3 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s3 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z); + in_row7.s4 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s4 * (int)src_stride_y + 
z_coord_valid0.s7 * src_stride_z); + in_row7.s5 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s5 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z); + in_row7.s6 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s6 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z); + in_row7.s7 = *(__global DATA_TYPE *)(src_addr + y_coord_valid0.s7 * (int)src_stride_y + z_coord_valid0.s7 * src_stride_z); + + FILL_ZERO_OUT_OF_BOUND_8_NHWC_H(DATA_TYPE, in_row7.s, y_cond, z_cond0.s7); VEC_DATA_TYPE(DATA_TYPE, 8) comm_fact0 = (DATA_TYPE)36.0f * in_row2 - (DATA_TYPE)13.0f * in_row4 + in_row6; diff --git a/src/core/CL/cl_kernels/winograd_output_transform.cl b/src/core/CL/cl_kernels/winograd_output_transform.cl index efd8502657..0a7b5f50b2 100644 --- a/src/core/CL/cl_kernels/winograd_output_transform.cl +++ b/src/core/CL/cl_kernels/winograd_output_transform.cl @@ -158,11 +158,11 @@ __kernel void winograd_output_transform_2x2_3x3_nchw( // Store the output tile #if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) const VEC_DATA_TYPE(DATA_TYPE, 2) - out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL); + out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL); *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0; *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1; #else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0, + vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)); #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) @@ -172,7 +172,7 @@ __kernel void winograd_output_transform_2x2_3x3_nchw( out10 += (DATA_TYPE)b; out11 += (DATA_TYPE)b; #endif // defined(HAS_BIAS) - vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0, + vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)); #endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) } @@ -188,6 +188,8 @@ __kernel void winograd_output_transform_2x2_3x3_nchw( * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. 
-DDST_HEIGHT=32 * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. @@ -238,12 +240,11 @@ __kernel void winograd_output_transform_2x2_7x7_nhwc( int batch = get_global_id(2) / SRC_DEPTH; #endif /* defined(SRC_DEPTH) */ -#if defined(SRC_DEPTH) - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w; -#else /* defined(SRC_DEPTH) */ + __global unsigned char *dst_base_ptr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE); - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z; -#endif /* defined(SRC_DEPTH) */ +#if defined(SRC_DEPTH) + dst_base_ptr += batch * dst_stride_w; +#endif // defined(SRC_DEPTH) // Load the values across the channels to compose the input tile DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z)); @@ -272,25 +273,32 @@ __kernel void winograd_output_transform_2x2_7x7_nhwc( // Store the output tile #if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - // Get output address -#if defined(SRC_DEPTH) - int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w); -#else /* defined(SRC_DEPTH) */ - int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z); -#endif /* defined(SRC_DEPTH) */ - offset = min(offset + (int2)(0, 1) * (int2)dst_stride_z, (int2)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding). 
+ + dst_base_ptr += y_out * dst_stride_y; + + int2 offset_z = min((int2)z_out + (int2)(0, 1), (int2)((int)DST_HEIGHT - 1)) * (int2)dst_stride_z; VEC_DATA_TYPE(DATA_TYPE, 2) - out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL); - *(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0; - *(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1; + out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL); + + // To avoid the out-of-bound write, we store the elements in reverse order so the invalid element + // is overwritten with the valid one + *(__global DATA_TYPE *)(dst_base_ptr + offset_z.s1) = out0_dt.s1; + *(__global DATA_TYPE *)(dst_base_ptr + offset_z.s0) = out0_dt.s0; #else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - // Get output address - int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z; + + dst_base_ptr += z_out * dst_stride_z; + + int2 offset_y = min((int2)y_out + (int2)(0, 1), (int2)((int)DST_WIDTH - 1)) * (int2)dst_stride_y; + VEC_DATA_TYPE(DATA_TYPE, 2) - out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL); - *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0; - *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1; + out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, + B_VAL); + + // To avoid the out-of-bound write, we store the elements in reverse order so the invalid element + // is overwritten with the valid one + *(__global DATA_TYPE *)(dst_base_ptr + offset_y.s1) = out0_dt.s1; + *(__global DATA_TYPE *)(dst_base_ptr + offset_y.s0) = out0_dt.s0; #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) #else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) @@ -387,26 +395,22 @@ __kernel void winograd_output_transform_2x2_7x7_nhwc( out_col1 += (VEC_DATA_TYPE(float, 2))b; #endif // defined(HAS_BIAS) - // Get output address -#if defined(SRC_DEPTH) - int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w); -#else /* defined(SRC_DEPTH) */ - int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z); -#endif /* defined(SRC_DEPTH) */ - offset = min(offset + (int2)(0, 1) * (int2)dst_stride_z, (int2)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding). - int2 mult_y = min((int2)dst_size - offset, (int2)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise. 
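The clamp-and-overwrite pattern used by the new stores above recurs in every NHWC output-transform kernel in this patch. A minimal host-side C++ sketch of the idea, with illustrative names (`tile`, `dst`, `stride_z` are not the kernel's actual symbols):

    #include <algorithm>
    #include <cstddef>

    // Each plane index is clamped to the last valid plane, and the tile is
    // written in reverse order: every out-of-bound element collapses onto the
    // clamped last plane and is then overwritten by the single in-bound
    // element that legitimately maps there, so no write lands outside dst.
    void store_tile_clamped(float *dst, const float tile[4], int z_out,
                            int dst_height, std::size_t stride_z)
    {
        for(int i = 3; i >= 0; --i) // reverse order: highest index first
        {
            const int z = std::min(z_out + i, dst_height - 1); // like min() in the kernel
            dst[static_cast<std::size_t>(z) * stride_z] = tile[i];
        }
    }

The trade-off is a few redundant writes at the tile border in exchange for dropping the old `dst_size`-based clamping and the `mult_y` bookkeeping.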
+ + int2 offset_y = min((int2)y_out + (int2)(0, 1), (int2)((int)DST_WIDTH - 1)) * (int2)dst_stride_y; + int2 offset_z = min((int2)z_out + (int2)(0, 1), (int2)((int)DST_HEIGHT - 1)) * (int2)dst_stride_z; // Store the output tile VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - out_col0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL); + out_col0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL); VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - out_col1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL); + out_col1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL); - *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0; - *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0; - - *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0_dt.s1; - *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1; + // To avoid the out-of-bound write, we store the elements in reverse order so the invalid element + // is overwritten with the valid one + *(__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s1) = out_col1_dt.s1; + *(__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s0) = out_col1_dt.s0; + *(__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s1) = out_col0_dt.s1; + *(__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s0) = out_col0_dt.s0; #endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) } @@ -599,14 +603,14 @@ __kernel void winograd_output_transform_4x4_3x3_nchw( // Store the output tile #if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) VEC_DATA_TYPE(DATA_TYPE, 4) - out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, - B_VAL); + out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, + B_VAL); *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0; *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1; *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2; *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3; #else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0, + vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)); #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) @@ -628,11 +632,11 @@ __kernel void winograd_output_transform_4x4_3x3_nchw( out32 += (float)b; out33 += (float)b; #endif // defined(HAS_BIAS) - vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0, + vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, 
B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)); - vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0, + vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)); - vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0, + vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)); #endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) } @@ -642,6 +646,8 @@ __kernel void winograd_output_transform_4x4_3x3_nchw( * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. @@ -830,71 +836,82 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc( #endif // defined(HAS_BIAS) -#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + __global unsigned char *dst_base_ptr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE); + #if defined(SRC_DEPTH) - int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w); -#else /* defined(SRC_DEPTH) */ - int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z); -#endif /* defined(SRC_DEPTH) */ - offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding). 
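The `@note` lines added above introduce two new compile-time defines, `-DDST_WIDTH` and `-DDST_HEIGHT`, which the clamping in these kernels relies on. On the host side they would be assembled next to the defines the kernels already document; the library wraps this in `CLBuildOptions`, but the gist can be sketched with plain strings (a hedged sketch; the function and variable names are illustrative, not the library's configure code):

    #include <set>
    #include <string>

    // Sketch: the kernel's documented -D defines, rendered as compiler options.
    std::set<std::string> make_build_options(int num_tiles_x, int output_tile_w, int output_tile_h,
                                             int dst_width, int dst_height)
    {
        return {
            "-DNUM_TILES_X=" + std::to_string(num_tiles_x),
            "-DOUTPUT_TILE_W=" + std::to_string(output_tile_w),
            "-DOUTPUT_TILE_H=" + std::to_string(output_tile_h),
            "-DDST_WIDTH=" + std::to_string(dst_width),   // new in this patch
            "-DDST_HEIGHT=" + std::to_string(dst_height), // new in this patch
        };
    }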
+ dst_base_ptr += batch * dst_stride_w; +#endif // defined(SRC_DEPTH) + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + dst_base_ptr += y_out * dst_stride_y; + + int4 offset_z = min((int4)z_out + (int4)(0, 1, 2, 3), (int4)((int)DST_HEIGHT - 1)) * (int4)dst_stride_z; // Store the 1x4 output tile VEC_DATA_TYPE(DATA_TYPE, 4) - out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL); - *((__global DATA_TYPE *)(dst_ptr + offset.s0)) = out0_dt.s0; - *((__global DATA_TYPE *)(dst_ptr + offset.s1)) = out0_dt.s1; - *((__global DATA_TYPE *)(dst_ptr + offset.s2)) = out0_dt.s2; - *((__global DATA_TYPE *)(dst_ptr + offset.s3)) = out0_dt.s3; + out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, + B_VAL); + + // To avoid the out-of-bound write, we store the elements in reverse order so the invalid element + // is overwritten with the valid one + *((__global DATA_TYPE *)(dst_base_ptr + offset_z.s3)) = out0_dt.s3; + *((__global DATA_TYPE *)(dst_base_ptr + offset_z.s2)) = out0_dt.s2; + *((__global DATA_TYPE *)(dst_base_ptr + offset_z.s1)) = out0_dt.s1; + *((__global DATA_TYPE *)(dst_base_ptr + offset_z.s0)) = out0_dt.s0; + #elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) - // Store the 4x1 output tile - int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z; - int mult_y = min(dst_size - offset, 1); + + dst_base_ptr += z_out * dst_stride_z; + + int4 offset_y = min((int4)y_out + (int4)(0, 1, 2, 3), (int4)((int)DST_WIDTH - 1)) * (int4)dst_stride_y; VEC_DATA_TYPE(DATA_TYPE, 4) - out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), + out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL); - *((__global DATA_TYPE *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = out0_dt.s0; - *((__global DATA_TYPE *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = out0_dt.s1; - *((__global DATA_TYPE *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = out0_dt.s2; - *((__global DATA_TYPE *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = out0_dt.s3; + + // To avoid the out-of-bound write, we store the elements in reverse order so the invalid element + // is overwritten with the valid one + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3)) = out0_dt.s3; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2)) = out0_dt.s2; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1)) = out0_dt.s1; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0)) = out0_dt.s0; + #else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) - // Get output address -#if defined(SRC_DEPTH) - int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w); -#else /* defined(SRC_DEPTH) */ - int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z); -#endif /* defined(SRC_DEPTH) */ - offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding). 
- int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise. + + int4 offset_y = min((int4)y_out + (int4)(0, 1, 2, 3), (int4)((int)DST_WIDTH - 1)) * (int4)dst_stride_y; + int4 offset_z = min((int4)z_out + (int4)(0, 1, 2, 3), (int4)((int)DST_HEIGHT - 1)) * (int4)dst_stride_z; // Store the 4x4 output tile VEC_DATA_TYPE(DATA_TYPE, 4) - out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL); + out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL); VEC_DATA_TYPE(DATA_TYPE, 4) - out1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL); + out1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL); VEC_DATA_TYPE(DATA_TYPE, 4) - out2_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL); + out2_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL); VEC_DATA_TYPE(DATA_TYPE, 4) - out3_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), - VEC_DATA_TYPE(DATA_TYPE, 4)), + out3_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), + VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL); - *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = out0_dt.s0; - *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = out0_dt.s1; - *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = out0_dt.s2; - *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = out0_dt.s3; - *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = out1_dt.s0; - *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = out1_dt.s1; - *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = out1_dt.s2; - *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = out1_dt.s3; - *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = out2_dt.s0; - *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = out2_dt.s1; - *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = out2_dt.s2; - *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = out2_dt.s3; - *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = out3_dt.s0; - *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = out3_dt.s1; - *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = out3_dt.s2; - *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = out3_dt.s3; + // To avoid the out-of-bound write, we store the elements in reverse order so the invalid element + // is overwritten with the valid one + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3 + offset_z.s3)) = out3_dt.s3; 
+ *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2 + offset_z.s3)) = out3_dt.s2; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s3)) = out3_dt.s1; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s3)) = out3_dt.s0; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3 + offset_z.s2)) = out2_dt.s3; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2 + offset_z.s2)) = out2_dt.s2; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s2)) = out2_dt.s1; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s2)) = out2_dt.s0; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3 + offset_z.s1)) = out1_dt.s3; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2 + offset_z.s1)) = out1_dt.s2; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s1)) = out1_dt.s1; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s1)) = out1_dt.s0; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3 + offset_z.s0)) = out0_dt.s3; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2 + offset_z.s0)) = out0_dt.s2; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s0)) = out0_dt.s1; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s0)) = out0_dt.s0; #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) } @@ -1012,14 +1029,14 @@ __kernel void winograd_output_transform_4x4_5x5_nchw( // Store the output tile #if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) VEC_DATA_TYPE(DATA_TYPE, 4) - out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, - B_VAL); + out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, + B_VAL); *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0; *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1; *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2; *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3; #else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0, + vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr)); #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) @@ -1135,13 +1152,13 @@ __kernel void winograd_output_transform_4x4_5x5_nchw( #endif // defined(HAS_BIAS) // Store the output tile - vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), A_VAL, B_VAL), 0, + vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)); - vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), A_VAL, B_VAL), 0, + vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)); - vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s2, out_col1.s2, out_col2.s2, 
out_col3.s2), A_VAL, B_VAL), 0, + vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)); - vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), A_VAL, B_VAL), 0, + vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, (VEC_DATA_TYPE(DATA_TYPE, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)); #endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) } @@ -1151,6 +1168,8 @@ __kernel void winograd_output_transform_4x4_5x5_nchw( * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. @@ -1201,6 +1220,12 @@ __kernel void winograd_output_transform_4x4_5x5_nhwc( int batch = get_global_id(2) / SRC_DEPTH; #endif /* defined(SRC_DEPTH) */ + __global unsigned char *dst_base_ptr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE); + +#if defined(SRC_DEPTH) + dst_base_ptr += batch * dst_stride_w; +#endif // defined(SRC_DEPTH) + // Load the values across the channels to compose the input tile DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z)); DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z)); @@ -1232,30 +1257,37 @@ __kernel void winograd_output_transform_4x4_5x5_nhwc( // Store the output tile #if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - // Get output address -#if defined(SRC_DEPTH) - int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w); -#else /* defined(SRC_DEPTH) */ - int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z); -#endif /* defined(SRC_DEPTH) */ - offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding). 
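For readers following the index arithmetic: in these NHWC kernels the first global dimension walks the channels, so `y_out` is the output width coordinate and `z_out` the output height coordinate, which is why `offset_y` is clamped against `DST_WIDTH` and `offset_z` against `DST_HEIGHT`. The byte addressing the new code composes can be summarised as (illustrative C++; names are not the kernel's):

    #include <cstddef>

    // NHWC byte offset of element (channel c, width y, height z, batch w):
    //   base + c * sizeof(T) + y * stride_y + z * stride_z + w * stride_w
    inline std::size_t nhwc_offset(std::size_t c, std::size_t y, std::size_t z, std::size_t w,
                                   std::size_t elem_size, std::size_t stride_y,
                                   std::size_t stride_z, std::size_t stride_w)
    {
        return c * elem_size + y * stride_y + z * stride_z + w * stride_w;
    }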
+ + dst_base_ptr += y_out * dst_stride_y; + + int4 offset_z = min((int4)z_out + (int4)(0, 1, 2, 3), (int4)((int)DST_HEIGHT - 1)) * (int4)dst_stride_z; VEC_DATA_TYPE(DATA_TYPE, 4) - out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL); - *(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0; - *(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1; - *(__global DATA_TYPE *)(dst_ptr + offset.s2) = out0_dt.s2; - *(__global DATA_TYPE *)(dst_ptr + offset.s3) = out0_dt.s3; + out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, + B_VAL); + + // To avoid the out-of-bound write, we store the elements in reverse order so the invalid element + // is overwritten with the valid one + *((__global DATA_TYPE *)(dst_base_ptr + offset_z.s3)) = out0_dt.s3; + *((__global DATA_TYPE *)(dst_base_ptr + offset_z.s2)) = out0_dt.s2; + *((__global DATA_TYPE *)(dst_base_ptr + offset_z.s1)) = out0_dt.s1; + *((__global DATA_TYPE *)(dst_base_ptr + offset_z.s0)) = out0_dt.s0; #else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - // Get output address - int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z; + + dst_base_ptr += z_out * dst_stride_z; + + int4 offset_y = min((int4)y_out + (int4)(0, 1, 2, 3), (int4)((int)DST_WIDTH - 1)) * (int4)dst_stride_y; + VEC_DATA_TYPE(DATA_TYPE, 4) - out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, + out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL); - *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0; - *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1; - *(__global DATA_TYPE *)(dst_ptr + 2 * dst_stride_y + offset) = out0_dt.s2; - *(__global DATA_TYPE *)(dst_ptr + 3 * dst_stride_y + offset) = out0_dt.s3; + + // To avoid the out-of-bound write, we store the elements in reverse order so the invalid element + // is overwritten with the valid one + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3)) = out0_dt.s3; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2)) = out0_dt.s2; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1)) = out0_dt.s1; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0)) = out0_dt.s0; #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) #else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) @@ -1368,41 +1400,38 @@ __kernel void winograd_output_transform_4x4_5x5_nhwc( out_col2 += (VEC_DATA_TYPE(float, 4))b; out_col3 += (VEC_DATA_TYPE(float, 4))b; #endif // defined(HAS_BIAS) - // Get output address -#if defined(SRC_DEPTH) - int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w); -#else /* defined(SRC_DEPTH) */ - int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z); -#endif /* defined(SRC_DEPTH) */ - offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding). 
- int4 mult_y = min((int4)dst_size - offset, (int4)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise. + + int4 offset_y = min((int4)y_out + (int4)(0, 1, 2, 3), (int4)((int)DST_WIDTH - 1)) * (int4)dst_stride_y; + int4 offset_z = min((int4)z_out + (int4)(0, 1, 2, 3), (int4)((int)DST_HEIGHT - 1)) * (int4)dst_stride_z; // Store the output tile VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - out_col0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL); + out_col0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL); VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - out_col1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL); + out_col1_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL); VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - out_col2_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col2, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL); + out_col2_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col2, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL); VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - out_col3_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT(out_col3, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL); - - *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0; - *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0; - *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * (int)dst_stride_y + offset.s0) = out_col2_dt.s0; - *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * (int)dst_stride_y + offset.s0) = out_col3_dt.s0; - *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0_dt.s1; - *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1; - *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * (int)dst_stride_y + offset.s1) = out_col2_dt.s1; - *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * (int)dst_stride_y + offset.s1) = out_col3_dt.s1; - *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * (int)dst_stride_y + offset.s2) = out_col0_dt.s2; - *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * (int)dst_stride_y + offset.s2) = out_col1_dt.s2; - *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * (int)dst_stride_y + offset.s2) = out_col2_dt.s2; - *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * (int)dst_stride_y + offset.s2) = out_col3_dt.s2; - *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * (int)dst_stride_y + offset.s3) = out_col0_dt.s3; - *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * (int)dst_stride_y + offset.s3) = out_col1_dt.s3; - *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * (int)dst_stride_y + offset.s3) = out_col2_dt.s3; - *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * (int)dst_stride_y + offset.s3) = out_col3_dt.s3; + out_col3_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT(out_col3, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), A_VAL, B_VAL); + + // To avoid the out-of-bound write, we store the elements in reverse order so the invalid element + // is overwritten with the valid one + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3 + offset_z.s3)) = out_col3_dt.s3; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2 + offset_z.s3)) = out_col2_dt.s3; + *((__global DATA_TYPE 
*)(dst_base_ptr + offset_y.s1 + offset_z.s3)) = out_col1_dt.s3; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s3)) = out_col0_dt.s3; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3 + offset_z.s2)) = out_col3_dt.s2; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2 + offset_z.s2)) = out_col2_dt.s2; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s2)) = out_col1_dt.s2; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s2)) = out_col0_dt.s2; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3 + offset_z.s1)) = out_col3_dt.s1; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2 + offset_z.s1)) = out_col2_dt.s1; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s1)) = out_col1_dt.s1; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s1)) = out_col0_dt.s1; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s3 + offset_z.s0)) = out_col3_dt.s0; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s2 + offset_z.s0)) = out_col2_dt.s0; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s1 + offset_z.s0)) = out_col1_dt.s0; + *((__global DATA_TYPE *)(dst_base_ptr + offset_y.s0 + offset_z.s0)) = out_col0_dt.s0; #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) } #endif // defined(VEC_SIZE) && VEC_SIZE == 4 @@ -1482,6 +1511,8 @@ __kernel void winograd_output_transform_2x1_3x1_nchw( * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. * @@ -1686,6 +1717,8 @@ __kernel void winograd_output_transform_4x1_5x1_nchw( * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. * @@ -1752,6 +1785,8 @@ __kernel void winograd_output_transform_4x1_3x1_nhwc( * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. 
-DOUTPUT_TILE_H=1 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. * @@ -1890,6 +1925,8 @@ __kernel void winograd_output_transform_1x2_1x3_nchw( * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. * @@ -2094,6 +2131,8 @@ __kernel void winograd_output_transform_1x4_1x5_nchw( * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. * @@ -2160,6 +2199,8 @@ __kernel void winograd_output_transform_1x4_1x3_nhwc( * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. * diff --git a/src/core/CL/cl_kernels/yolo_layer.cl b/src/core/CL/cl_kernels/yolo_layer.cl index 2a15a32e2a..9601dddf67 100644 --- a/src/core/CL/cl_kernels/yolo_layer.cl +++ b/src/core/CL/cl_kernels/yolo_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,13 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/
-#if defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(ACTIVATION_TYPE) && defined(NUM_CLASSES) && defined(VEC_SIZE)
+#if defined(DATA_TYPE) && defined(ACTIVATION_TYPE) && defined(NUM_CLASSES) && defined(VEC_SIZE)
 #include "activation_float_helpers.h"
+#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
 #if VEC_SIZE != 1
 #define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-#define SELECT_TYPE VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE)
 /** This performs a YOLO partial activation function for NCHW data layout
 *
@@ -79,7 +80,7 @@ __kernel void yolo_layer_nchw(
 {
 // Load data
 TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
- data = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, data, A_VAL, B_VAL); // select(1.0f, ACTIVATION_OP(ACTIVATION_TYPE, data), (SELECT_TYPE)activate);
+ data = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, data, A_VAL, B_VAL); // select(1.0f, ACTIVATION_OP(ACTIVATION_TYPE, data), (SELECT_TYPE)activate);
 // Store result
 VSTORE(VEC_SIZE)
@@ -100,7 +101,6 @@ __kernel void yolo_layer_nchw(
 #else // VEC_SIZE != 1
-#define SELECT_TYPE SELECT_DATA_TYPE
 /** This performs a YOLO partial activation function for NCHW data layout
 *
 * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
@@ -151,7 +151,7 @@ __kernel void yolo_layer_nhwc(
 {
 // Load data
 DATA_TYPE data = *((__global DATA_TYPE *)input.ptr);
- data = select(data, ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, data, A_VAL, B_VAL), (SELECT_TYPE)activate);
+ data = select(data, ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, data, A_VAL, B_VAL), (SELECT_TYPE)activate);
 // Store result
 *((__global DATA_TYPE *)output.ptr) = data;
@@ -169,4 +169,4 @@ __kernel void yolo_layer_nhwc(
 }
 #endif // VEC_SIZE != 1
-#endif // defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(ACTIVATION_TYPE) && defined(NUM_CLASSES) && defined(VEC_SIZE)
+#endif // defined(DATA_TYPE) && defined(ACTIVATION_TYPE) && defined(NUM_CLASSES) && defined(VEC_SIZE)
diff --git a/src/core/CL/gemm/CLGEMMHelpers.cpp b/src/core/CL/gemm/CLGEMMHelpers.cpp
index 5734c93021..d60626b158 100644
--- a/src/core/CL/gemm/CLGEMMHelpers.cpp
+++ b/src/core/CL/gemm/CLGEMMHelpers.cpp
@@ -21,12 +21,13 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
+#include "src/core/CL/gemm/CLGEMMHelpers.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include <utility>
@@ -34,11 +35,13 @@ namespace arm_compute
 {
 namespace cl_gemm
 {
+using namespace arm_compute::misc::shape_calculator;
+
 std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
 bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image)
 {
- v0 = ((m / (m0 * v0)) == 0) ? 1 : v0;
- h0 = ((n / (n0 * h0)) == 0) ? 1 : h0;
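The `+` lines just below replace this divide-and-test with an explicit clamp of `v0`/`h0` into `[1, m / m0]` and `[1, n / n0]`. A worked comparison as a stand-alone C++ sketch (the free function is illustrative, not part of the library):

    #include <algorithm>

    // Old: with m = 8, m0 = 4, v0 = 8, m / (m0 * v0) == 0, so v0 was reset
    // to 1 even though up to m / m0 == 2 vertical blocks are usable.
    // New: clamp into the usable range instead, giving v0 == 2 here.
    int clamp_v0(int m, int m0, int v0)
    {
        return std::max(std::min(m / m0, v0), 1);
    }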
+ v0 = std::max(std::min(static_cast<int>(m / m0), static_cast<int>(v0)), static_cast<int>(1));
+ h0 = std::max(std::min(static_cast<int>(n / n0), static_cast<int>(h0)), static_cast<int>(1));
 const GEMMLHSMatrixInfo lhs_info(m0, k0, v0, lhs_transpose, lhs_interleave);
 const GEMMRHSMatrixInfo rhs_info(n0, k0, h0, rhs_transpose, rhs_interleave, export_to_cl_image);
@@ -46,6 +49,24 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned
 return std::make_pair(lhs_info, rhs_info);
 }
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img,
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf,
+ unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+ const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, data_type);
+ const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, info_img.second);
+ const TensorInfo tensor_reshaped_info(shape, 1, data_type);
+
+ if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second)))
+ {
+ return info_img;
+ }
+ else
+ {
+ return info_buf;
+ }
+}
+
 void update_padding_for_cl_image(ITensorInfo *tensor)
 {
 constexpr unsigned int num_floats_per_pixel = 4;
@@ -65,7 +86,7 @@ Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info,
 {
 ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.n0 == 2) || (rhs_info.n0 == 3), "Export to cl_image only supported with n0 = 4, 8 or 16");
 ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.k0 == 2) || (rhs_info.k0 == 3), "Export to cl_image only supported with k0 = 4, 8 or 16");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.data_type() != DataType::F32, "Export to cl_image only supported with F32 data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(&tensor_reshaped_info, DataType::F32, DataType::F16);
 ARM_COMPUTE_RETURN_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported on the target platform");
 ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, "Impossible to retrieve the cl_image pitch alignment");
diff --git a/arm_compute/core/CL/gemm/CLGEMMHelpers.h b/src/core/CL/gemm/CLGEMMHelpers.h
similarity index 75%
rename from arm_compute/core/CL/gemm/CLGEMMHelpers.h
rename to src/core/CL/gemm/CLGEMMHelpers.h
index 013c068cf7..57624673c0 100644
--- a/arm_compute/core/CL/gemm/CLGEMMHelpers.h
+++ b/src/core/CL/gemm/CLGEMMHelpers.h
@@ -54,6 +54,25 @@ namespace cl_gemm
 std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
 bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image = false);
+/** Select @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
+ *
+ * This function accepts two pairs of GEMMLHSMatrixInfo/GEMMRHSMatrixInfo, where only the first has cl_image2d support,
+ * and selects the valid one by validating the GEMMRHSMatrixInfo. If the validation passes, the function will return
+ * the first GEMMLHSMatrixInfo/GEMMRHSMatrixInfo pair with cl_image2d support.
+ *
+ * @param[in] info_img   GEMMLHSMatrixInfo/GEMMRHSMatrixInfo with cl_image2d support
+ * @param[in] info_buf   GEMMLHSMatrixInfo/GEMMRHSMatrixInfo to fall back on if cl_image2d cannot be used
+ * @param[in] n          Number of columns (N) in the RHS matrix not reshaped
+ * @param[in] k          Number of rows (K) in the RHS matrix not reshaped
+ * @param[in] b          Batch size
+ * @param[in] data_type  Data type
+ *
+ * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
+ */
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img,
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf,
+ unsigned int n, unsigned int k, unsigned int b, DataType data_type);
+
 /** Update padding required to export the OpenCL buffer to OpenCL image2d
 *
 * @param[in,out] tensor ITensorInfo of the tensor required to be exported to OpenCL image2d
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h b/src/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h
similarity index 87%
rename from arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h
rename to src/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h
index 7270a8e6db..aecf5a8aa8 100644
--- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h
+++ b/src/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h
@@ -24,12 +24,12 @@
 #ifndef ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATION_H
 #define ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATION_H
-#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
-#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h"
-#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h"
-#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h"
+#include "src/core/CL/ICLGEMMKernelConfiguration.h"
+#include "src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h"
+#include "src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h"
+#include "src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h"
-#include <memory>
+#include "support/MemorySupport.h"
 namespace arm_compute
 {
diff --git a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.cpp b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.cpp
index 51b7fc7190..4cc3d6ae74 100644
--- a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.cpp
+++ b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.cpp
@@ -21,12 +21,12 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
*/ -#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h" +#include "src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h" #include "arm_compute/core/GPUTarget.h" +#include "src/core/CL/gemm/CLGEMMHelpers.h" #include #include diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h similarity index 97% rename from arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h rename to src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h index 1e4989615e..1e7432c89a 100644 --- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h +++ b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONBIFROST_H #define ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONBIFROST_H -#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h" +#include "src/core/CL/ICLGEMMKernelConfiguration.h" namespace arm_compute { diff --git a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.cpp b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.cpp index 3e7c17664a..fd699a08f7 100644 --- a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.cpp +++ b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.cpp @@ -21,12 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h" +#include "src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h" #include "arm_compute/core/GPUTarget.h" +#include "src/core/CL/gemm/CLGEMMHelpers.h" #include #include diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h similarity index 97% rename from arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h rename to src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h index 4cebfceb75..2f6671706e 100644 --- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h +++ b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONMIDGARD_H #define ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONMIDGARD_H -#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h" +#include "src/core/CL/ICLGEMMKernelConfiguration.h" namespace arm_compute { diff --git a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.cpp b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.cpp index efc82fb78c..2c82340eef 100644 --- a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.cpp +++ b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.cpp @@ -21,12 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h" +#include "src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h" #include "arm_compute/core/GPUTarget.h" +#include "src/core/CL/gemm/CLGEMMHelpers.h" #include #include diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h similarity index 97% rename from arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h rename to src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h index 07389ea76f..fb51b02edf 100644 --- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h +++ b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONVALHALL_H #define ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONVALHALL_H -#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h" +#include "src/core/CL/ICLGEMMKernelConfiguration.h" namespace arm_compute { diff --git a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h similarity index 90% rename from arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h rename to src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h index b953fd264f..21ccf2d647 100644 --- a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h +++ b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h @@ -24,11 +24,11 @@ #ifndef ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATION_H #define ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATION_H -#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h" -#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h" -#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h" +#include "src/core/CL/ICLGEMMKernelConfiguration.h" +#include "src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h" +#include "src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h" -#include +#include "support/MemorySupport.h" namespace arm_compute { diff --git a/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp index a533f14d02..46eeff3524 100644 --- a/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp +++ b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp @@ -21,15 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h" +#include "src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h" #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/gemm/CLGEMMHelpers.h" #include #include @@ -60,6 +60,17 @@ std::pair CLGEMMReshapedKernelConfiguratio { DataType::QSYMM8_PER_CHANNEL, &CLGEMMReshapedKernelConfigurationBifrost::configure_G76_u8 } }; + // Configurations for Mali-G52 + static std::map gemm_configs_G52 = + { + { DataType::F32, &CLGEMMReshapedKernelConfigurationBifrost::configure_G52_f32 }, + { DataType::F16, &CLGEMMReshapedKernelConfigurationBifrost::configure_G52_f16 }, + { DataType::QASYMM8, &CLGEMMReshapedKernelConfigurationBifrost::configure_G7x_u8 }, + { DataType::QSYMM8, &CLGEMMReshapedKernelConfigurationBifrost::configure_G7x_u8 }, + { DataType::QASYMM8_SIGNED, &CLGEMMReshapedKernelConfigurationBifrost::configure_G7x_u8 }, + { DataType::QSYMM8_PER_CHANNEL, &CLGEMMReshapedKernelConfigurationBifrost::configure_G7x_u8 } + }; + // Configurations for Mali-G7x static std::map gemm_configs_G7x = { @@ -153,6 +164,121 @@ std::pair CLGEMMReshapedKernelConfiguratio } } +std::pair CLGEMMReshapedKernelConfigurationBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + const float r_mn = static_cast(m) / static_cast(n); + const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; + const float r_mk = static_cast(m) / static_cast(k); + const float r_nk = static_cast(n) / static_cast(k); + + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + if(workload <= 274.4000f) + { + if(r_nk <= 0.7461f) + { + if(r_mn <= 21.1667f) + { + return configure_lhs_rhs_info(m, n, 4, 2, 4, 4, 4, false, true, true, false, false); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } + else + { + if(r_mk <= 17.3926f) + { + if(workload <= 542.4000f) + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); 
+ std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } + else + { + if(r_nk <= 0.5463f) + { + if(workload <= 11767.6001f) + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } + } +} + +std::pair CLGEMMReshapedKernelConfigurationBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + + const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; + + if(workload <= 323.4000f) + { + return configure_lhs_rhs_info(m, n, 2, 2, 8, 4, 8, false, false, false, true, false); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 8, 4, 2, 2, true, true, true, false, false); + } +} + std::pair CLGEMMReshapedKernelConfigurationBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); @@ -205,16 +331,30 @@ std::pair CLGEMMReshapedKernelConfiguratio std::pair CLGEMMReshapedKernelConfigurationBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); + const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; + const float r_mk = static_cast(m) / static_cast(k); - if(n <= 4) + if(workload <= 1595.2000f) { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 8, 2, true, true, true, false); + if(r_mk <= 2.1044f) + { + if(workload <= 870.4000f) + { + return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 2, true, false, true, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 2, 4, 2, 2, false, false, true, false, false); + } + } + else + { + return configure_lhs_rhs_info(m, n, 4, 2, 4, 2, 2, false, false, true, false, false); + } } else { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 8, true, true, true, false); + return configure_lhs_rhs_info(m, n, 4, 8, 4, 4, 2, true, true, true, false, false); } } diff --git a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h similarity index 89% rename from arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h rename to 
src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h index 4df27843aa..715e7abfa3 100644 --- a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h +++ b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONBIFROST_H #define ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONBIFROST_H -#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h" +#include "src/core/CL/ICLGEMMKernelConfiguration.h" namespace arm_compute { @@ -45,8 +45,10 @@ class CLGEMMReshapedKernelConfigurationBifrost final : public ICLGEMMKernelConfi private: std::pair configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); std::pair configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); std::pair configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); std::pair configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); std::pair configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); std::pair configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); diff --git a/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.cpp b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.cpp index 0c09f5084a..4fd446f647 100644 --- a/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.cpp +++ b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.cpp @@ -21,12 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h" +#include "src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h" #include "arm_compute/core/GPUTarget.h" +#include "src/core/CL/gemm/CLGEMMHelpers.h" #include #include @@ -90,13 +90,123 @@ std::pair CLGEMMReshapedKernelConfiguratio ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + const float r_mn = static_cast(m) / static_cast(n); + const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; + const float r_mk = static_cast(m) / static_cast(k); + const float r_nk = static_cast(n) / static_cast(k); + + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, false, false, true, false, false); + + if(r_mk <= 0.11824845522642136) { - return configure_lhs_rhs_info(m, n, 4, 2, 8, 8, 2, true, true, true, false); + if(workload <= 880.0) + { + return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, false, false, true, false, false); + } + else + { + if(r_nk <= 0.42521367967128754) + { + if(workload <= 1726.4000244140625) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, false, false, true, false, false); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, false, true, true, false, true); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + } + else + { + if(workload <= 1241.6000366210938) + { + return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, false, false, true, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, false, false, true, false, false); + } + } + } } else { - return configure_lhs_rhs_info(m, n, 4, 8, 4, 4, 2, true, true, true, false); + if(workload <= 11404.7998046875) + { + if(r_mk <= 1.0126488208770752) + { + if(r_mn <= 2.545312523841858) + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, false, true, true, false, true); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + else + { + return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, false, false, true, false, false); + } + } + else + { + if(workload <= 2881.199951171875) + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, false, false, true, false, true); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, false, true, true, false, true); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + } + } + else + { + if(r_nk <= 0.5765306055545807) + { + if(r_mn <= 6.010416746139526) + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, false, true, true, false, true); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + 
else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, false, true, false, true); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, false, true, false, true); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + } } } diff --git a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h similarity index 97% rename from arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h rename to src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h index 7a617e05be..5f7e701e0e 100644 --- a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h +++ b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONVALHALL_H #define ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONVALHALL_H -#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h" +#include "src/core/CL/ICLGEMMKernelConfiguration.h" namespace arm_compute { diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h similarity index 89% rename from arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h rename to src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h index 6d5ce8835b..4efe28ce69 100644 --- a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h +++ b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h @@ -24,11 +24,11 @@ #ifndef ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATION_H #define ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATION_H -#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h" -#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h" -#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h" +#include "src/core/CL/ICLGEMMKernelConfiguration.h" +#include "src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h" +#include "src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h" -#include +#include "support/MemorySupport.h" namespace arm_compute { diff --git a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp index f9b65dc931..d5b76d8eaf 100644 --- a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp +++ b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp @@ -21,15 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h" +#include "src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h" #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/gemm/CLGEMMHelpers.h" #include #include @@ -61,6 +61,17 @@ std::pair CLGEMMReshapedOnlyRHSKernelConfi { DataType::QSYMM8_PER_CHANNEL, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G51_u8 } }; + // Configurations for Mali-G52 + static std::map gemm_configs_G52 = + { + { DataType::F32, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G52_f32 }, + { DataType::F16, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G52_f16 }, + { DataType::QASYMM8, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_u8 }, + { DataType::QSYMM8, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_u8 }, + { DataType::QASYMM8_SIGNED, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_u8 }, + { DataType::QSYMM8_PER_CHANNEL, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_u8 } + }; + // Configurations for Mali-G76 static std::map gemm_configs_G76 = { @@ -94,6 +105,15 @@ std::pair CLGEMMReshapedOnlyRHSKernelConfi { ARM_COMPUTE_ERROR("Not supported data type"); } + case GPUTarget::G52: + if(gemm_configs_G52.find(data_type) != gemm_configs_G52.end()) + { + return (this->*gemm_configs_G52[data_type])(m, n, k, b); + } + else + { + ARM_COMPUTE_ERROR("Not supported data type"); + } case GPUTarget::G51: if(gemm_configs_G51.find(data_type) != gemm_configs_G51.end()) { @@ -122,15 +142,13 @@ std::pair CLGEMMReshapedOnlyRHSKernelConfi if(m == 1) { - if(n > 2048) + if(n <= 2548) { - const unsigned int h0 = std::max(n / 4, 1U); - return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true); + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, false, true, false, true, false); } else { - const unsigned int h0 = std::max(n / 2, 1U); - return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true); + return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 8, false, true, false, true, false); } } else @@ -150,18 +168,25 @@ std::pair CLGEMMReshapedOnlyRHSKernelConfi GEMMRHSMatrixInfo rhs_info_img; const bool is_workload_big = ((m * n * b) / 16) >= 2048; - // Get lhs_info/rhs_info in case of OpenCL buffer + if(m == 1) { - if((n / 4) >= 2048) + if(n >= 8192) { const unsigned int h0 = std::max(n / 4, 1U); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, h0, false, true, false, true); + return configure_lhs_rhs_info(m, n, 1, 4, 8, 1, h0, false, true, false, true, false); } else { const unsigned int h0 = std::max(n / 2, 1U); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true); + if(n <= 204) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true, false); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true, false); + } } } else @@ -205,6 +230,50 @@ std::pair CLGEMMReshapedOnlyRHSKernelConfi } } +std::pair CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned 
int b) +{ + const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; + const float r_nk = static_cast(n) / static_cast(k); + + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + if(m == 1) + { + if(r_nk <= 0.4664f) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 16, false, true, false, true, false); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } + else + { + if(workload <= 274.4000f) + { + return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 16, false, false, false, true, false); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } +} + std::pair CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); @@ -246,19 +315,162 @@ std::pair CLGEMMReshapedOnlyRHSKernelConfi } } +std::pair CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + const float r_mn = static_cast(m) / static_cast(n); + const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; + const float r_mk = static_cast(m) / static_cast(k); + const float r_nk = static_cast(n) / static_cast(k); + + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + if(m == 1) + { + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, false); + + if(r_mk <= 0.0026f) + { + if(r_nk <= 0.4664f) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + } + else + { + if(r_mk <= 0.0148f) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + } + } + else + { + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 8, 4, 1, 2, false, false, false, false, false); + + if(workload <= 362.6000f) + { + return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false); + } + else + { + if(r_mn <= 22.6067f) + { + if(workload <= 708.8000f) + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, 
false, false, false, false, true); + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + else + { + return configure_lhs_rhs_info(m, n, 5, 8, 2, 1, 16, false, false, false, false, false); + } + } + else + { + if(r_nk <= 0.0917f) + { + return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + } + } + } +} + std::pair CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); if(m == 1) { - const unsigned int h0 = std::max(n / 2, 1U); - return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true); + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); } else { - return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true); + const float r_mn = static_cast(m) / static_cast(n); + const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; + + if(workload <= 7449.60f) + { + if(workload <= 691.60f) + { + return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 8, false, false, false, false, false); + } + else + { + if(workload <= 4155.20f) + { + return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 5, 8, 2, 1, 32, false, false, false, false, false); + } + } + } + else + { + if(workload <= 16300.80f) + { + if(r_mn <= 44.56f) + { + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, true, false, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + else + { + return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); + } + } + else + { + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, true, false, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + } } } diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h similarity index 90% rename from arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h rename to src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h index 346bfd7b91..4d284ed3e8 100644 --- 
a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h +++ b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONBIFROST_H #define ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONBIFROST_H -#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h" +#include "src/core/CL/ICLGEMMKernelConfiguration.h" namespace arm_compute { @@ -46,8 +46,10 @@ class CLGEMMReshapedOnlyRHSKernelConfigurationBifrost final : public ICLGEMMKern private: std::pair configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); std::pair configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); std::pair configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); std::pair configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); std::pair configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); std::pair configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); std::pair configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); diff --git a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp index 9f3461e912..e0991674b1 100644 --- a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp +++ b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp @@ -21,15 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h" +#include "src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h" #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/gemm/CLGEMMHelpers.h" #include #include @@ -78,66 +78,107 @@ std::pair CLGEMMReshapedOnlyRHSKernelConfi std::pair CLGEMMReshapedOnlyRHSKernelConfigurationValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { - ARM_COMPUTE_UNUSED(k); - - GEMMLHSMatrixInfo lhs_info_buf; - GEMMRHSMatrixInfo rhs_info_buf; - GEMMLHSMatrixInfo lhs_info_img; - GEMMRHSMatrixInfo rhs_info_img; - - // Get lhs_info/rhs_info in case of OpenCL buffer if(m == 1) { - const unsigned int h0 = std::max(n / 4, 1U); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true); - } - else - { - if(m > 256) + const float r_mn = static_cast(m) / static_cast(n); + const float r_mk = static_cast(m) / static_cast(k); + + if(r_mk <= 0.0064484127797186375) { - const int v0 = std::max(std::min(static_cast(n / 4), static_cast(8)), static_cast(1)); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, v0, false, true, false, true); + if(r_mn <= 0.0028273810748942196) + { + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + const unsigned int h0 = std::max(n / 4, 1U); + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 8, false, true, false, false, false); + } } else { - const int v0 = std::max(std::min(static_cast(n / 4), static_cast(8)), static_cast(1)); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 4, 4, 1, v0, false, true, false, true); + if(r_mk <= 0.020312500186264515) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, false, true, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, true, false); + } } } - - // Get lhs_info/rhs_info in case of OpenCL image - if(m == 1) - { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 8, true, true, false, false, true); - } else { - if((m / 4) * (n / 4) > 4096) + const float r_mn = static_cast(m) / static_cast(n); + const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; + const float r_mk = static_cast(m) / static_cast(k); + + if(workload <= 1999.2000122070312) { - const int h0 = std::max(std::min(static_cast(n / 4), static_cast(8)), static_cast(1)); - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true); + if(workload <= 747.1999816894531) + { + return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, false, true, false, true, false); + } + else + 
{ + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 2, false, false, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, false, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } } else { - const int h0 = std::max(std::min(static_cast(n / 4), static_cast(8)), static_cast(1)); - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 4, 1, h0, false, true, false, false, true); - } - } - - const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32); - const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, rhs_info_img); - const TensorInfo tensor_reshaped_info(shape, 1, DataType::F32); + if(r_mn <= 0.03348214365541935) + { + if(r_mk <= 0.028125000186264515) + { + return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, false, true, false, true, false); + } + else + { + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 2, false, false, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, false, true, false, true, false); - // In case of small workloads, we use the OpenCL buffer rather than the OpenCL image2d - const bool use_cl_image2d = ((m / lhs_info_img.m0) * (n / rhs_info_img.n0)) * b < 1024 ? false : true; + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } + else + { + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, false, true, false, true, false); - if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) - { - return std::make_pair(lhs_info_img, rhs_info_img); - } - else - { - return std::make_pair(lhs_info_buf, rhs_info_buf); + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } } } @@ -148,15 +189,14 @@ std::pair CLGEMMReshapedOnlyRHSKernelConfi if(m == 1) { - if(n > 2048) + const unsigned int h0 = std::max(n / 2, 1U); + if(n <= 836.0) { - const unsigned int h0 = std::max(n / 4, 1U); - return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true); + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true, false); } else { - const unsigned int h0 = std::max(n / 2, 1U); - return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true); + return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true, false); } } else if(m < 128) diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h similarity index 97% rename from 
arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h rename to src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h index 2162baf338..b9289923b9 100644 --- a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h +++ b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONVALHALL_H #define ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONVALHALL_H -#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h" +#include "src/core/CL/ICLGEMMKernelConfiguration.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp index 9deb16524e..76b60cb9f8 100644 --- a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp +++ b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp @@ -21,18 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h" - #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" + +#include "src/core/CL/kernels/CLAbsoluteDifferenceKernel.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h rename to src/core/CL/kernels/CLAbsoluteDifferenceKernel.h index f62855cbb9..28f28fe44f 100644 --- a/arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h +++ b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLABSOLUTEDIFFERENCEKERNEL_H #define ARM_COMPUTE_CLABSOLUTEDIFFERENCEKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLAccumulateKernel.cpp b/src/core/CL/kernels/CLAccumulateKernel.cpp index f161906646..b0a8eba644 100644 --- a/src/core/CL/kernels/CLAccumulateKernel.cpp +++ b/src/core/CL/kernels/CLAccumulateKernel.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h" +#include "src/core/CL/kernels/CLAccumulateKernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" diff --git a/arm_compute/core/CL/kernels/CLAccumulateKernel.h b/src/core/CL/kernels/CLAccumulateKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLAccumulateKernel.h rename to src/core/CL/kernels/CLAccumulateKernel.h index e067da084f..16a715319d 100644 --- a/arm_compute/core/CL/kernels/CLAccumulateKernel.h +++ b/src/core/CL/kernels/CLAccumulateKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLACCUMULATEKERNEL_H #define ARM_COMPUTE_CLACCUMULATEKERNEL_H -#include "arm_compute/core/CL/ICLSimple2DKernel.h" +#include "src/core/CL/ICLSimple2DKernel.h" #include diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp index 62cafc5ad1..9f9538cb76 100644 --- a/src/core/CL/kernels/CLActivationLayerKernel.cpp +++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp @@ -21,18 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h" +#include "src/core/CL/kernels/CLActivationLayerKernel.h" #include "arm_compute/core/CL/CLCoreRuntimeContext.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/helpers/float_ops.h" -#include "arm_compute/core/utils/misc/Cast.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" + #include "support/StringSupport.h" #include @@ -80,37 +80,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return Status{}; } - -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) -{ - if(output != nullptr) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output, *input); - } - - const unsigned int num_elems_processed_per_iteration = 16 / input->element_size(); - - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - bool window_changed = false; - - if(output != nullptr) - { - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - window_changed = update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, input->valid_region()); - } - else - { - window_changed = update_window_and_padding(win, - AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration)); - } - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} } // namespace CLActivationLayerKernel::CLActivationLayerKernel() @@ -122,6 +91,8 @@ void CLActivationLayerKernel::configure(const CLCompileContext &compile_context, { ARM_COMPUTE_ERROR_ON_NULLPTR(input); + auto padding_info = get_padding_info({ input, output }); + _run_in_place = (output == nullptr) || (output == input); if(output != nullptr) @@ -132,10 +103,11 @@ void CLActivationLayerKernel::configure(const CLCompileContext &compile_context, ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, (output != nullptr) ? output : nullptr, act_info)); - const unsigned int num_elems_processed_per_iteration = 16 / input->element_size(); - const DataType dt = input->data_type(); - float a_const = act_info.a(); - float b_const = act_info.b(); + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->element_size(), input->dimension(0)); + + const DataType dt = input->data_type(); + float a_const = act_info.a(); + float b_const = act_info.b(); const ActivationLayerInfo::ActivationFunction f_act = act_info.activation(); const bool is_quantized = is_data_type_quantized(dt); @@ -146,9 +118,10 @@ void CLActivationLayerKernel::configure(const CLCompileContext &compile_context, CLBuildOptions build_opts; build_opts.add_option_if(perform_activation_in_float, "-DFLOAT_DOMAIN"); build_opts.add_option_if(_run_in_place, "-DIN_PLACE"); - build_opts.add_option(("-DACT=" + lower_string(string_from_activation_func(f_act)))); - build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt))); - build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + build_opts.add_option("-DACT=" + lower_string(string_from_activation_func(f_act))); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->dimension(0) % num_elems_processed_per_iteration)); std::string kernel_name = std::string("activation_layer"); @@ -226,9 +199,8 @@ void CLActivationLayerKernel::configure(const CLCompileContext &compile_context, _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Configure kernel window - auto win_config = validate_and_configure_window(input, (_run_in_place) ? nullptr : output); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win); // Set config_id for enabling LWS tuning _config_id = "activation_layer_"; @@ -237,14 +209,13 @@ void CLActivationLayerKernel::configure(const CLCompileContext &compile_context, _config_id += support::cpp11::to_string(input->dimension(0)); _config_id += "_"; _config_id += support::cpp11::to_string(input->dimension(1)); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLActivationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) { - const bool run_in_place = (output == nullptr) || (output == input); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (run_in_place) ? 
nullptr : output->clone().get()).first); - return Status{}; } diff --git a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h b/src/core/CL/kernels/CLActivationLayerKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLActivationLayerKernel.h rename to src/core/CL/kernels/CLActivationLayerKernel.h index 81d4ccb065..821418f835 100644 --- a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h +++ b/src/core/CL/kernels/CLActivationLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLACTIVATIONLAYERKERNEL_H #define ARM_COMPUTE_CLACTIVATIONLAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp index b78ac27cfa..0e6fc6599c 100644 --- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp +++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp @@ -21,18 +21,19 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h" +#include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" diff --git a/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h rename to src/core/CL/kernels/CLArgMinMaxLayerKernel.h index 48876c0b56..929677f905 100644 --- a/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h +++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H #define ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp index feebe01cdb..ccd6a5a0fc 100644 --- a/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp +++ b/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp @@ -21,16 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h" +#include "src/core/CL/kernels/CLBatchConcatenateLayerKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/Cast.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" #include "support/StringSupport.h" @@ -38,25 +38,6 @@ namespace arm_compute { namespace { -std::pair validate_and_configure_window(ITensorInfo *input, unsigned int batch_offset, ITensorInfo *output) -{ - ARM_COMPUTE_UNUSED(batch_offset); - - const unsigned int num_elems_processed_per_iteration = 16 / input->element_size(); - - // The window needs to be based on output, except for the batch size - Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - // The total batch size is the concatenation of the batch size of the inputs - win.set(3, Window::Dimension(0, input->tensor_shape()[3], 1)); - - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - bool window_changed = update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} Status validate_arguments(const ITensorInfo *input, unsigned int batch_offset, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); @@ -84,14 +65,17 @@ void CLBatchConcatenateLayerKernel::configure(const CLCompileContext &compile_co ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, batch_offset, output)); + auto padding_info = get_padding_info({ input, output }); + _batch_offset = batch_offset; - const unsigned int num_elems_processed_per_iteration = 16 / input->element_size(); + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->element_size(), input->dimension(0)); // Add build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->dimension(0) % num_elems_processed_per_iteration)); if(is_data_type_quantized_asymmetric(input->data_type()) && input->quantization_info() != output->quantization_info()) { const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); @@ -107,10 +91,9 @@ void CLBatchConcatenateLayerKernel::configure(const CLCompileContext &compile_co _kernel = create_kernel(compile_context, "concatenate", build_opts.options()); // Configure kernel window - auto win_config = validate_and_configure_window(input, batch_offset, output); - ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); - - ICLKernel::configure_internal(std::get<1>(win_config)); + auto win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + win.set(3, Window::Dimension(0, input->tensor_shape()[3], 1)); + 
ICLKernel::configure_internal(win); // Set output valid region output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); @@ -128,6 +111,8 @@ void CLBatchConcatenateLayerKernel::configure(const CLCompileContext &compile_co _config_id += support::cpp11::to_string(input->dimension(2)); _config_id += "_"; _config_id += support::cpp11::to_string(input->dimension(3)); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLBatchConcatenateLayerKernel::validate(const arm_compute::ITensorInfo *input, @@ -135,7 +120,6 @@ Status CLBatchConcatenateLayerKernel::validate(const arm_compute::ITensorInfo *i const arm_compute::ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, batch_offset, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), batch_offset, output->clone().get()).first); return Status{}; } diff --git a/arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h b/src/core/CL/kernels/CLBatchConcatenateLayerKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h rename to src/core/CL/kernels/CLBatchConcatenateLayerKernel.h index bb8968ca83..54a89eb243 100644 --- a/arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h +++ b/src/core/CL/kernels/CLBatchConcatenateLayerKernel.h @@ -25,8 +25,8 @@ #ifndef ARM_COMPUTE_CLBATCHCONCATENATEKERNEL_H #define ARM_COMPUTE_CLBATCHCONCATENATEKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp index a2cabcfd1f..44bdc6f587 100644 --- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp +++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp @@ -21,16 +21,17 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h" +#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -80,16 +81,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, - ITensorInfo *mean, ITensorInfo *var, ITensorInfo *beta, ITensorInfo *gamma) +std::pair validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *output) { - if(output != nullptr) - { - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output, *input->clone()); - } - - const unsigned int num_elems_processed_per_iteration = 16 / input->element_size(); + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->element_size(), input->dimension(0)); // Configure kernel window Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); @@ -107,25 +101,6 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen window_changed = update_window_and_padding(win, input_access); } - // Mean, var, gamma and beta get parallelized for the NHWC case as they follow the channel dimension, which is along the first axis - if(input->data_layout() == DataLayout::NHWC) - { - AccessWindowHorizontal mean_access(mean, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal var_access(var, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win, mean_access, var_access); - - if(beta != nullptr) - { - AccessWindowHorizontal beta_access(beta, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win, beta_access); - } - if(gamma != nullptr) - { - AccessWindowHorizontal gamma_access(gamma, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win, gamma_access); - } - } - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } @@ -148,13 +123,14 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_ { ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var); - _input = input; - _output = output; - _mean = mean; - _var = var; - _beta = beta; - _gamma = gamma; - _epsilon = epsilon; + auto padding_info = get_padding_info({ input, output, mean, var, beta, gamma }); + _input = input; + _output = output; + _mean = mean; + _var = var; + _beta = beta; + _gamma = gamma; + _epsilon = epsilon; _run_in_place = (output == nullptr) || (output == input); @@ -162,12 +138,13 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_ mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr, (gamma != nullptr) ? 
gamma->info() : nullptr, epsilon, act_info)); - const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); + unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0)); // Set build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); @@ -191,13 +168,26 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_ } _kernel.setArg(idx++, _epsilon); + if(output != nullptr) + { + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*output->info(), *input->info()->clone()); + } + // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info(), - mean->info(), var->info(), - (beta != nullptr) ? beta->info() : nullptr, - (gamma != nullptr) ? gamma->info() : nullptr); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); + if(input->info()->data_layout() == DataLayout::NHWC) + { + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win); + } + else + { + auto win_config = validate_and_configure_window_nchw(input->info(), (_run_in_place) ? nullptr : output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + } + + ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info)); _config_id = "batch_normalization_layer_"; _config_id += string_from_data_type(input->info()->data_type()); @@ -218,11 +208,12 @@ Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const { const bool run_in_place = (output == nullptr) || (output == input); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(), - mean->clone().get(), var->clone().get(), - (beta != nullptr) ? beta->clone().get() : nullptr, - (gamma != nullptr) ? gamma->clone().get() : nullptr) - .first); + + if(input->data_layout() != DataLayout::NHWC) + { + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? 
nullptr : output->clone().get()) + .first); + } return Status{}; } diff --git a/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h rename to src/core/CL/kernels/CLBatchNormalizationLayerKernel.h index c556a0c6f4..743f4a9594 100644 --- a/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h +++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H #define ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp index c74f7e055b..da41feb7b8 100644 --- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp +++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp @@ -21,12 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h" +#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" using namespace arm_compute::misc::shape_calculator; @@ -91,6 +93,9 @@ void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const ICLTenso void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + auto padding_info = get_padding_info({ input, block_shape, output }); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), output->info())); _input = input; @@ -109,6 +114,8 @@ void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_contex // Configure kernel window Window win = calculate_max_window(*input->info(), Steps()); ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output) diff --git a/arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h rename to src/core/CL/kernels/CLBatchToSpaceLayerKernel.h index 7af88d8986..131a43e59c 100644 --- a/arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h +++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLBATCHTOSPACELAYERKERNEL_H #define ARM_COMPUTE_CLBATCHTOSPACELAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLBitwiseAndKernel.cpp b/src/core/CL/kernels/CLBitwiseAndKernel.cpp index 44378c8239..91a659284a 100644 --- a/src/core/CL/kernels/CLBitwiseAndKernel.cpp +++ b/src/core/CL/kernels/CLBitwiseAndKernel.cpp @@ -21,15 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE 
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h" +#include "src/core/CL/kernels/CLBitwiseAndKernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" using namespace arm_compute; diff --git a/arm_compute/core/CL/kernels/CLBitwiseAndKernel.h b/src/core/CL/kernels/CLBitwiseAndKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLBitwiseAndKernel.h rename to src/core/CL/kernels/CLBitwiseAndKernel.h index e291f08b9a..01018ee09d 100644 --- a/arm_compute/core/CL/kernels/CLBitwiseAndKernel.h +++ b/src/core/CL/kernels/CLBitwiseAndKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLBITWISEANDKERNEL_H #define ARM_COMPUTE_CLBITWISEANDKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLBitwiseNotKernel.cpp b/src/core/CL/kernels/CLBitwiseNotKernel.cpp index 08e4c54957..118bfe8139 100644 --- a/src/core/CL/kernels/CLBitwiseNotKernel.cpp +++ b/src/core/CL/kernels/CLBitwiseNotKernel.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h" +#include "src/core/CL/kernels/CLBitwiseNotKernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" diff --git a/arm_compute/core/CL/kernels/CLBitwiseNotKernel.h b/src/core/CL/kernels/CLBitwiseNotKernel.h similarity index 97% rename from arm_compute/core/CL/kernels/CLBitwiseNotKernel.h rename to src/core/CL/kernels/CLBitwiseNotKernel.h index f57bbf4778..bf68bc7ae5 100644 --- a/arm_compute/core/CL/kernels/CLBitwiseNotKernel.h +++ b/src/core/CL/kernels/CLBitwiseNotKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLBITWISENOTKERNEL_H #define ARM_COMPUTE_CLBITWISENOTKERNEL_H -#include "arm_compute/core/CL/ICLSimple2DKernel.h" +#include "src/core/CL/ICLSimple2DKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLBitwiseOrKernel.cpp b/src/core/CL/kernels/CLBitwiseOrKernel.cpp index 77c48e6e82..8954d9aa6d 100644 --- a/src/core/CL/kernels/CLBitwiseOrKernel.cpp +++ b/src/core/CL/kernels/CLBitwiseOrKernel.cpp @@ -21,15 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
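The get_padding_info({ ... }) / has_padding_changed(padding_info) pairing that brackets the reworked configure() bodies above (BatchNormalization, BatchToSpace) recurs in almost every kernel this patch touches: it snapshots the padding of the tensors of interest before the window is configured and asserts afterwards that configuration did not silently grow it. A minimal sketch of the assumed semantics, using cut-down stand-ins rather than the real ITensorInfo API:

#include <initializer_list>
#include <map>

struct PaddingSize
{
    unsigned int top{ 0 }, right{ 0 }, bottom{ 0 }, left{ 0 };
    bool operator!=(const PaddingSize &other) const
    {
        return top != other.top || right != other.right || bottom != other.bottom || left != other.left;
    }
};
struct TensorInfo // stand-in for arm_compute::ITensorInfo
{
    PaddingSize padding{};
};

using PaddingMap = std::map<const TensorInfo *, PaddingSize>;

// Snapshot the current padding of every tensor the kernel touches...
PaddingMap get_padding_info(std::initializer_list<const TensorInfo *> tensors)
{
    PaddingMap snapshot{};
    for(const auto *tensor : tensors)
    {
        snapshot[tensor] = tensor->padding;
    }
    return snapshot;
}

// ...and detect whether any of them picked up extra padding since the snapshot.
bool has_padding_changed(const PaddingMap &snapshot)
{
    for(const auto &entry : snapshot)
    {
        if(entry.first->padding != entry.second)
        {
            return true;
        }
    }
    return false;
}

With this in place, the trailing ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)) in each configure() is what enforces the "no implicit padding" contract these kernels now rely on.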
*/ -#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h" +#include "src/core/CL/kernels/CLBitwiseOrKernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" using namespace arm_compute; diff --git a/arm_compute/core/CL/kernels/CLBitwiseOrKernel.h b/src/core/CL/kernels/CLBitwiseOrKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLBitwiseOrKernel.h rename to src/core/CL/kernels/CLBitwiseOrKernel.h index 944224ecb9..c27d0c27e2 100644 --- a/arm_compute/core/CL/kernels/CLBitwiseOrKernel.h +++ b/src/core/CL/kernels/CLBitwiseOrKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLBITWISEORKERNEL_H #define ARM_COMPUTE_CLBITWISEORKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLBitwiseXorKernel.cpp b/src/core/CL/kernels/CLBitwiseXorKernel.cpp index a15305e3b7..69eb38e2e6 100644 --- a/src/core/CL/kernels/CLBitwiseXorKernel.cpp +++ b/src/core/CL/kernels/CLBitwiseXorKernel.cpp @@ -21,15 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h" +#include "src/core/CL/kernels/CLBitwiseXorKernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" using namespace arm_compute; diff --git a/arm_compute/core/CL/kernels/CLBitwiseXorKernel.h b/src/core/CL/kernels/CLBitwiseXorKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLBitwiseXorKernel.h rename to src/core/CL/kernels/CLBitwiseXorKernel.h index 732ae8659e..b4861ea757 100644 --- a/arm_compute/core/CL/kernels/CLBitwiseXorKernel.h +++ b/src/core/CL/kernels/CLBitwiseXorKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLBITWISEXORKERNEL_H #define ARM_COMPUTE_CLBITWISEXORKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp index 95ea3d7df5..bcfd9b8e5a 100644 --- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp +++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp @@ -21,19 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h" +#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLArray.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute diff --git a/arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h rename to src/core/CL/kernels/CLBoundingBoxTransformKernel.h index 4e8c5a6f18..08f350e86a 100644 --- a/arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h +++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLBOUNDINGBOXTRANSFORMKERNEL_H #define ARM_COMPUTE_CLBOUNDINGBOXTRANSFORMKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLBox3x3Kernel.cpp b/src/core/CL/kernels/CLBox3x3Kernel.cpp index 7916dce241..9f493b4fb8 100644 --- a/src/core/CL/kernels/CLBox3x3Kernel.cpp +++ b/src/core/CL/kernels/CLBox3x3Kernel.cpp @@ -21,14 +21,13 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h" +#include "src/core/CL/kernels/CLBox3x3Kernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/CL/kernels/CLBox3x3Kernel.h b/src/core/CL/kernels/CLBox3x3Kernel.h similarity index 97% rename from arm_compute/core/CL/kernels/CLBox3x3Kernel.h rename to src/core/CL/kernels/CLBox3x3Kernel.h index 1a8572dd68..2373c4a928 100644 --- a/arm_compute/core/CL/kernels/CLBox3x3Kernel.h +++ b/src/core/CL/kernels/CLBox3x3Kernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLBOX3X3KERNEL_H #define ARM_COMPUTE_CLBOX3X3KERNEL_H -#include "arm_compute/core/CL/ICLSimple2DKernel.h" +#include "src/core/CL/ICLSimple2DKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLCannyEdgeKernel.cpp b/src/core/CL/kernels/CLCannyEdgeKernel.cpp index b8a53650e8..1fe944c8a2 100644 --- a/src/core/CL/kernels/CLCannyEdgeKernel.cpp +++ b/src/core/CL/kernels/CLCannyEdgeKernel.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLCannyEdgeKernel.h" +#include "src/core/CL/kernels/CLCannyEdgeKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" @@ -29,6 +29,7 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" using namespace arm_compute; diff --git a/arm_compute/core/CL/kernels/CLCannyEdgeKernel.h b/src/core/CL/kernels/CLCannyEdgeKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLCannyEdgeKernel.h rename to src/core/CL/kernels/CLCannyEdgeKernel.h index c4d0297aef..7543822d8d 100644 --- a/arm_compute/core/CL/kernels/CLCannyEdgeKernel.h +++ b/src/core/CL/kernels/CLCannyEdgeKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLCANNYEDGEKERNEL_H #define ARM_COMPUTE_CLCANNYEDGEKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLChannelCombineKernel.cpp b/src/core/CL/kernels/CLChannelCombineKernel.cpp index b0e5111417..52ba9dd065 100644 --- a/src/core/CL/kernels/CLChannelCombineKernel.cpp +++ b/src/core/CL/kernels/CLChannelCombineKernel.cpp @@ -21,20 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h" +#include "src/core/CL/kernels/CLChannelCombineKernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLMultiImage.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/MultiImageInfo.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/CL/kernels/CLChannelCombineKernel.h b/src/core/CL/kernels/CLChannelCombineKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLChannelCombineKernel.h rename to src/core/CL/kernels/CLChannelCombineKernel.h index f9c33df7c1..f19995aa8e 100644 --- a/arm_compute/core/CL/kernels/CLChannelCombineKernel.h +++ b/src/core/CL/kernels/CLChannelCombineKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H #define ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include #include diff --git a/src/core/CL/kernels/CLChannelExtractKernel.cpp b/src/core/CL/kernels/CLChannelExtractKernel.cpp index 13ae8f5ef4..cbf504b98b 100644 --- a/src/core/CL/kernels/CLChannelExtractKernel.cpp +++ b/src/core/CL/kernels/CLChannelExtractKernel.cpp @@ -21,21 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h" +#include "src/core/CL/kernels/CLChannelExtractKernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLMultiImage.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Coordinates.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/MultiImageInfo.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/CL/kernels/CLChannelExtractKernel.h b/src/core/CL/kernels/CLChannelExtractKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLChannelExtractKernel.h rename to src/core/CL/kernels/CLChannelExtractKernel.h index 1ccf38bb8c..37abde548c 100644 --- a/arm_compute/core/CL/kernels/CLChannelExtractKernel.h +++ b/src/core/CL/kernels/CLChannelExtractKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H #define ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" #include diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp index ad000ba17f..c969792c3e 100644 --- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp +++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp @@ -21,16 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h" +#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Window.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute diff --git a/arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h rename to src/core/CL/kernels/CLChannelShuffleLayerKernel.h index bf58525248..31c007f17e 100644 --- a/arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h +++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLCHANNELSHUFFLELAYERKERNEL_H #define ARM_COMPUTE_CLCHANNELSHUFFLELAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp index 4050b24e0c..44b8471725 100644 --- a/src/core/CL/kernels/CLCol2ImKernel.cpp +++ b/src/core/CL/kernels/CLCol2ImKernel.cpp @@ -21,17 +21,17 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h" +#include "src/core/CL/kernels/CLCol2ImKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/arm_compute/core/CL/kernels/CLCol2ImKernel.h b/src/core/CL/kernels/CLCol2ImKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLCol2ImKernel.h rename to src/core/CL/kernels/CLCol2ImKernel.h index c3a1ff3a50..710e048bca 100644 --- a/arm_compute/core/CL/kernels/CLCol2ImKernel.h +++ b/src/core/CL/kernels/CLCol2ImKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLCOL2IMKERNEL_H #define ARM_COMPUTE_CLCOL2IMKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLColorConvertKernel.cpp b/src/core/CL/kernels/CLColorConvertKernel.cpp index e14b871ae6..6c61fec997 100644 --- a/src/core/CL/kernels/CLColorConvertKernel.cpp +++ b/src/core/CL/kernels/CLColorConvertKernel.cpp @@ -21,20 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h" +#include "src/core/CL/kernels/CLColorConvertKernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLMultiImage.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/MultiImageInfo.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/arm_compute/core/CL/kernels/CLColorConvertKernel.h b/src/core/CL/kernels/CLColorConvertKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLColorConvertKernel.h rename to src/core/CL/kernels/CLColorConvertKernel.h index d57bb3de03..0f082914cd 100644 --- a/arm_compute/core/CL/kernels/CLColorConvertKernel.h +++ b/src/core/CL/kernels/CLColorConvertKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLCOLORCONVERTKERNEL_H #define ARM_COMPUTE_CLCOLORCONVERTKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLComparisonKernel.cpp b/src/core/CL/kernels/CLComparisonKernel.cpp index 5bb1d56690..e2aee36bd8 100644 --- a/src/core/CL/kernels/CLComparisonKernel.cpp +++ b/src/core/CL/kernels/CLComparisonKernel.cpp @@ -21,11 +21,13 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLComparisonKernel.h" +#include "src/core/CL/kernels/CLComparisonKernel.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/arm_compute/core/CL/kernels/CLComparisonKernel.h b/src/core/CL/kernels/CLComparisonKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLComparisonKernel.h rename to src/core/CL/kernels/CLComparisonKernel.h index bbf5f19e2f..0b94190183 100644 --- a/arm_compute/core/CL/kernels/CLComparisonKernel.h +++ b/src/core/CL/kernels/CLComparisonKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLCOMPARISONKERNEL_H #define ARM_COMPUTE_CLCOMPARISONKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp index 7c6114640c..dcf4e6662e 100644 --- a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp +++ b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp @@ -21,14 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h" +#include "src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute @@ -52,6 +53,8 @@ void CLConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &com // Output tensor auto initialisation if not yet initialized auto_init_if_empty(*output->info(), *input->info()->clone()); + auto padding_info = get_padding_info({ input, output }); + ARM_COMPUTE_ERROR_THROW_ON(CLConvertFullyConnectedWeightsKernel::validate(input->info(), output->info(), original_input_shape, data_layout)); _input = input; @@ -81,6 +84,8 @@ void CLConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &com // Configure kernel window Window win = calculate_max_window(*input->info(), Steps()); ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, diff --git a/arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h rename to src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h index 5d9e9bdd85..d1da793df2 100644 --- a/arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h +++ b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTSKERNEL_H #define ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTSKERNEL_H 
-#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLConvolutionKernel.cpp b/src/core/CL/kernels/CLConvolutionKernel.cpp index ca07e68345..21f1047cc6 100644 --- a/src/core/CL/kernels/CLConvolutionKernel.cpp +++ b/src/core/CL/kernels/CLConvolutionKernel.cpp @@ -21,19 +21,19 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h" +#include "src/core/CL/kernels/CLConvolutionKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" +#include "src/core/CL/ICLKernel.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/arm_compute/core/CL/kernels/CLConvolutionKernel.h b/src/core/CL/kernels/CLConvolutionKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLConvolutionKernel.h rename to src/core/CL/kernels/CLConvolutionKernel.h index 0f500fb63a..33e73caf11 100644 --- a/arm_compute/core/CL/kernels/CLConvolutionKernel.h +++ b/src/core/CL/kernels/CLConvolutionKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLCONVOLUTIONKERNEL_H #define ARM_COMPUTE_CLCONVOLUTIONKERNEL_H -#include "arm_compute/core/CL/ICLSimple2DKernel.h" +#include "src/core/CL/ICLSimple2DKernel.h" #include diff --git a/src/core/CL/kernels/CLCopyKernel.cpp b/src/core/CL/kernels/CLCopyKernel.cpp index 37c3241302..ca38b65df4 100644 --- a/src/core/CL/kernels/CLCopyKernel.cpp +++ b/src/core/CL/kernels/CLCopyKernel.cpp @@ -21,133 +21,42 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLCopyKernel.h" +#include "src/core/CL/kernels/CLCopyKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Utils.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding = PaddingList(), Window *output_window = nullptr) +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, Window *output_window = nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_ON(!padding.empty() && output_window != nullptr); - ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 4); // Validate output if initialized if(output->total_size() != 0) { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); if(output_window == nullptr) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding), output->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape()); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output_window->shape()); } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, Window *output_window) -{ - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output, *input); - - // Configure window - const unsigned int vec_size_x = 16 / input->element_size(); - - if(output_window == nullptr) - { - // Create and update the window (if needed) - Window win = calculate_max_window(*input, Steps(vec_size_x)); - - AccessWindowHorizontal input_access(input, 0, vec_size_x); - AccessWindowHorizontal output_access(output, 0, vec_size_x); - - bool window_changed = update_window_and_padding(win, input_access, output_access); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); - } - else - { - Window win = calculate_max_window(*input); - return std::make_pair(Status{}, win); - } -} - -std::pair<Status, Window> validate_and_configure_window_with_padding(ITensorInfo *input, ITensorInfo *output, const PaddingList &padding) -{ - TensorShape input_shape = input->tensor_shape(); - TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input_shape, padding); - - auto_init_if_empty(*output, input->clone()->set_tensor_shape(padded_shape)); - - // Configure window - const unsigned int num_elems_processed_per_iteration = 16 / input->element_size(); - - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - - // Pad on the x dimension accounting for the padding offset along the same dimension - AccessWindowHorizontal output_access(output, padding[0].first, num_elems_processed_per_iteration); - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - bool window_changed = update_window_and_padding(win, input_access, output_access); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} - -/** Generate the string "-DPAD= @p dim @p index @p padding" - * - * @param[in] dim The dimension index - * @param[in] index Can be 0 for the start dimension and 1 for the end dimension - * @param[in] padding The value to pad for that index/dimension pair - * - * @return The correct concatenated string - */ -std::string generate_pad_string(const size_t dim, const size_t index, const size_t padding) -{ - return "-DPAD" + support::cpp11::to_string(dim) + support::cpp11::to_string(index) + "=" + support::cpp11::to_string(padding); -} - -/** Pass the padding as build option to the kernel.
- * - * @param[in] tensor The padded tensor - * @param[in] padding The list of the padding for each dimension - * @param[out] build_opts The build option to which adding the padding - */ -void add_padding_as_build_options(const PaddingList &padding, CLBuildOptions &build_opts) -{ - size_t dim = 0; - for(dim = 0; dim < padding.size(); dim++) - { - build_opts.add_option(generate_pad_string(dim, 0, padding[dim].first)); - build_opts.add_option(generate_pad_string(dim, 1, padding[dim].second)); - } - - while(dim < TensorShape::num_max_dimensions) - { - build_opts.add_option(generate_pad_string(dim, 0, 0)); - build_opts.add_option(generate_pad_string(dim, 1, 0)); - dim++; - } -} - } // namespace CLCopyKernel::CLCopyKernel() @@ -155,15 +64,17 @@ CLCopyKernel::CLCopyKernel() { } -void CLCopyKernel::configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, Window *output_window) +void CLCopyKernel::configure(const ICLTensor *input, ICLTensor *output, Window *output_window) { - configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, output_window); + configure(CLKernelLibrary::get().get_compile_context(), input, output, output_window); } -void CLCopyKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, Window *output_window) +void CLCopyKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, Window *output_window) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), padding, output_window)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), output_window)); + + auto padding_info = get_padding_info({ input, output }); _input = input; _output = output; @@ -172,79 +83,51 @@ void CLCopyKernel::configure(const CLCompileContext &compile_context, const ICLT CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); - std::pair<Status, Window> win_config; - - const unsigned int vec_size_x = 16 / input->info()->element_size(); + // Output auto initialisation if not yet initialized + auto_init_if_empty(*(output->info()), *(input->info())); - if(padding.empty()) - { - // Configure window - win_config = validate_and_configure_window(input->info(), output->info(), output_window); + // Configure window + const unsigned int vec_size_x = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0)); - if(output_window != nullptr) - { - _has_output_window = true; - _output_window = Window(*output_window); - const int width_x = output_window->num_iterations(0); - const bool multi_access_x = width_x >= static_cast<int>(vec_size_x); - const bool remainder_x = width_x % vec_size_x > 0; + const Window win_config = calculate_max_window(*(input->info()), Steps(vec_size_x)); - if(multi_access_x) - { - _output_window.set(Window::DimX, Window::Dimension(output_window->x().start(), ceil_to_multiple(output_window->x().end(), vec_size_x), vec_size_x)); - win_config.second.set(Window::DimX, Window::Dimension(win_config.second.x().start(), ceil_to_multiple(win_config.second.x().end(), vec_size_x), vec_size_x)); - } + if(output_window != nullptr) { + _has_output_window = true; + _output_window = Window(*output_window); + const int width_x = output_window->num_iterations(0); + const int vec_size_x_leftover = width_x % vec_size_x; + const bool multi_access_x = width_x >= static_cast<int>(vec_size_x); -
build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); - build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(width_x - vec_size_x, 0))); - } - else + if(multi_access_x) { - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); + _output_window.set(Window::DimX, Window::Dimension(output_window->x().start(), ceil_to_multiple(output_window->x().end(), vec_size_x), vec_size_x)); } - // Build kernel - _kernel = create_kernel(compile_context, "copy_tensor", build_opts.options()); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftover)); } else { - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); - - // Add compile time options - add_padding_as_build_options(padding, build_opts); + const int width_x = input->info()->tensor_shape().x(); + const int vec_size_x_leftover = width_x % vec_size_x; - // If we are padding in the fourth dimension the kernel needs to know the depth of the - // different cubes - if(padding.size() == 4) - { - const size_t depth = input->info()->tensor_shape()[2]; - build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(depth)); - } + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftover)); + } - // Build kernel - _kernel = create_kernel(compile_context, "copy_pad_tensor", build_opts.options()); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); - // Configure window - win_config = validate_and_configure_window_with_padding(input->info(), output->info(), padding); - } + // Build kernel + _kernel = create_kernel(compile_context, "copy_tensor", build_opts.options()); // Validate and set the window - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); + ICLKernel::configure_internal(win_config); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status CLCopyKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, const PaddingList &padding, Window *output_window) +Status CLCopyKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, Window *output_window) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, output_window)); - - if(padding.empty()) - { - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), output_window).first); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_with_padding(input->clone().get(), output->clone().get(), padding).first); - } + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, output_window)); return Status{}; } diff --git a/arm_compute/core/CL/kernels/CLCopyKernel.h b/src/core/CL/kernels/CLCopyKernel.h similarity index 84% rename from arm_compute/core/CL/kernels/CLCopyKernel.h rename to src/core/CL/kernels/CLCopyKernel.h index 11a6d54e90..9a20b88884 100644 --- a/arm_compute/core/CL/kernels/CLCopyKernel.h +++ b/src/core/CL/kernels/CLCopyKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLCOPYKERNEL_H #define ARM_COMPUTE_CLCOPYKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { @@ -49,29 +49,26 @@ class CLCopyKernel : public ICLKernel * * @param[in] input Source tensor. Data types supported: All. * @param[out] output Destination tensor. 
Data types supported: same as @p input. - * @param[in] padding (Optional) Padding to be applied to the input tensor * @param[in] output_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr. */ - void configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding = PaddingList(), Window *output_window = nullptr); + void configure(const ICLTensor *input, ICLTensor *output, Window *output_window = nullptr); /** Initialize the kernel's input, output. * * @param[in] compile_context The compile context to be used. * @param[in] input Source tensor. Data types supported: All. * @param[out] output Destination tensor. Data types supported: same as @p input. - * @param[in] padding (Optional) Padding to be applied to the input tensor * @param[in] output_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding = PaddingList(), Window *output_window = nullptr); + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, Window *output_window = nullptr); /** Static function to check if given info will lead to a valid configuration of @ref CLCopyKernel * * @param[in] input Source tensor info. Data types supported: All. * @param[in] output Destination tensor info. Data types supported: same as @p input. - * @param[in] padding (Optional) Padding to be applied to the input tensor * @param[in] output_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr. * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding = PaddingList(), Window *output_window = nullptr); + static Status validate(const ITensorInfo *input, const ITensorInfo *output, Window *output_window = nullptr); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/src/core/CL/kernels/CLCropKernel.cpp b/src/core/CL/kernels/CLCropKernel.cpp index f828162177..9cf15ff93b 100644 --- a/src/core/CL/kernels/CLCropKernel.cpp +++ b/src/core/CL/kernels/CLCropKernel.cpp @@ -21,21 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
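The CLCopyKernel rewrite above is the clearest instance of the pattern used throughout this patch: adjust_vec_size(16 / element_size, dim0) picks the vector width, and the x-dimension remainder is passed to the kernel as -DVEC_SIZE_LEFTOVER instead of being absorbed by tensor padding. A sketch of the assumed arithmetic:

#include <cstddef>

// Assumed behaviour: never use a vector wider than the innermost dimension,
// halving the preferred (power-of-two) width until it fits.
unsigned int adjust_vec_size(unsigned int preferred, std::size_t dim0)
{
    unsigned int vec_size = preferred;
    while(vec_size > dim0 && vec_size > 1)
    {
        vec_size /= 2;
    }
    return vec_size;
}

// Worked example for an FP32 tensor of width 17 (element_size == 4):
//   vec_size = adjust_vec_size(16 / 4, 17) == 4
//   leftover = 17 % 4 == 1
// so the kernel is built with -DVEC_SIZE=4 -DVEC_SIZE_LEFTOVER=1 and the final
// partial vector is handled with a guarded store inside the kernel.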
*/ -#include "arm_compute/core/CL/kernels/CLCropKernel.h" +#include "src/core/CL/kernels/CLCropKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Window.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/WindowHelpers.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/helpers/bit_ops.h" -#include "arm_compute/core/utils/helpers/tensor_transform.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "support/StringSupport.h" #include diff --git a/arm_compute/core/CL/kernels/CLCropKernel.h b/src/core/CL/kernels/CLCropKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLCropKernel.h rename to src/core/CL/kernels/CLCropKernel.h index 91d70e6c1b..cbfada58ab 100644 --- a/arm_compute/core/CL/kernels/CLCropKernel.h +++ b/src/core/CL/kernels/CLCropKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLCROPKERNEL_H #define ARM_COMPUTE_CLCROPKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp index e8f12d5d9d..d01a00d61e 100644 --- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp +++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp @@ -21,16 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" +#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" namespace arm_compute { diff --git a/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h rename to src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h index 84265a2329..e0d1322341 100644 --- a/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h +++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLEKERNEL_H #define ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLEKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp index 69730346fe..ea22ec0067 100644 --- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp +++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h" +#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" @@ -30,6 +30,8 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute diff --git a/arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h rename to src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h index 688c943593..ce354fa86f 100644 --- a/arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h +++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLDECONVOLUTIONLAYERRESHAPEOUTPUTKERNEL_H #define ARM_COMPUTE_CLDECONVOLUTIONLAYERRESHAPEOUTPUTKERNEL_H -#include "arm_compute/core/CL/ICLSimpleKernel.h" +#include "src/core/CL/ICLSimpleKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp index 5978a0223f..eb5bfc2d86 100644 --- a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp +++ b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp @@ -21,16 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h" +#include "src/core/CL/kernels/CLDepthConcatenateLayerKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/Cast.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" #include "support/StringSupport.h" @@ -38,24 +38,6 @@ namespace arm_compute { namespace { -std::pair validate_and_configure_window(ITensorInfo *input, unsigned int depth_offset, ITensorInfo *output) -{ - ARM_COMPUTE_UNUSED(depth_offset); - - const unsigned int num_elems_processed_per_iteration = 16 / input->element_size(); - - // The window needs to be based on input as we copy all the depths of input - Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - win.set(Window::DimZ, Window::Dimension(0, input->tensor_shape().z(), 1)); - - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - bool window_changed = update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} Status validate_arguments(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); @@ -82,14 +64,17 @@ void CLDepthConcatenateLayerKernel::configure(const CLCompileContext &compile_co ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, depth_offset, output)); + auto padding_info = get_padding_info({ input, output }); + _depth_offset = depth_offset; - const unsigned int num_elems_processed_per_iteration = 16 / input->element_size(); + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->element_size(), input->dimension(0)); // Add build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->dimension(0) % num_elems_processed_per_iteration)); if(is_data_type_quantized_asymmetric(input->data_type()) && input->quantization_info() != output->quantization_info()) { const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); @@ -105,13 +90,14 @@ void CLDepthConcatenateLayerKernel::configure(const CLCompileContext &compile_co _kernel = create_kernel(compile_context, "concatenate", build_opts.options()); // Configure kernel window - auto win_config = validate_and_configure_window(input, depth_offset, output); - ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); - - ICLKernel::configure_internal(std::get<1>(win_config)); + auto win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + win.set(Window::DimZ, Window::Dimension(0, input->tensor_shape().z(), 1)); + ICLKernel::configure_internal(win); // Set output valid region output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLDepthConcatenateLayerKernel::validate(const arm_compute::ITensorInfo *input, @@ -119,7 +105,6 @@ Status CLDepthConcatenateLayerKernel::validate(const arm_compute::ITensorInfo *i const arm_compute::ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, depth_offset, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), depth_offset, output->clone().get()).first); return Status{}; } diff --git a/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h rename to src/core/CL/kernels/CLDepthConcatenateLayerKernel.h index d8493bc5d8..6c73bd4bf4 100644 --- a/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h +++ b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.h @@ -25,8 +25,8 @@ #ifndef ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H #define ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp b/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp index 11297e7901..c98d66f390 100644 --- a/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp +++ 
b/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp @@ -21,17 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include @@ -81,21 +82,27 @@ void CLDepthConvertLayerKernel::configure(const CLCompileContext &compile_contex { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + _input = input; + _output = output; + // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given) set_shape_if_empty(*output->info(), input->info()->tensor_shape()); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy, shift)); + auto padding_info = get_padding_info({ input, output }); + // Get data sizes const size_t input_size = data_size_from_type(input->info()->data_type()); const size_t output_size = data_size_from_type(output->info()->data_type()); // Get number of elements to process per iterations - constexpr unsigned int num_elems_processed_per_iteration = 16; + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0)); // Set build options CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type())); build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); // Conversions from float always SATURATE as out-of-bounds conversion from float->integer is implementation defined @@ -111,14 +118,25 @@ void CLDepthConvertLayerKernel::configure(const CLCompileContext &compile_contex unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters _kernel.setArg(idx++, shift); + // Since we have a leftover vector size calculated using the input tensor shape, it is required to + // have the input region equal to the tensor shape + ValidRegion input_valid_region = input->info()->valid_region(); + input->info()->set_valid_region(ValidRegion(Coordinates(0, 0), input->info()->tensor_shape())); + // Configure kernel - ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration); + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win); // Collapse window const Window &full_window = window(); Window collapsed_window = full_window.collapse_if_possible(full_window, Window::DimZ); ICLKernel::configure_internal(collapsed_window); + // Restore the valid region + input->info()->set_valid_region(input_valid_region); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); + // Set config_id for enabling 
LWS tuning _config_id = kernel_name; _config_id += "_"; diff --git a/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h b/src/core/CL/kernels/CLDepthConvertLayerKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h rename to src/core/CL/kernels/CLDepthConvertLayerKernel.h index 7f9696d835..8b511c6707 100644 --- a/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h +++ b/src/core/CL/kernels/CLDepthConvertLayerKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H #define ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H -#include "arm_compute/core/CL/ICLSimple3DKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLSimple3DKernel.h" #include diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp index b16c961547..8946f2a713 100644 --- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp +++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp @@ -21,12 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h" +#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" using namespace arm_compute::misc::shape_calculator; @@ -74,9 +76,11 @@ void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_contex { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape = compute_depth_to_space_shape(input->info(), block_shape); + TensorShape output_shape = compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape); auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); + auto padding_info = get_padding_info({ input, output }); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape)); _input = input; @@ -97,6 +101,8 @@ void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_contex // Configure kernel window Window win = calculate_max_window(*input->info(), Steps()); ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLDepthToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) diff --git a/arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h rename to src/core/CL/kernels/CLDepthToSpaceLayerKernel.h index 1bd1e8e763..1f7f77b569 100644 --- a/arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h +++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLDEPTHTOSPACELAYERKERNEL_H #define ARM_COMPUTE_CLDEPTHTOSPACELAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp index 066e9a5a40..25d0d2799b 100644 --- 
a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp @@ -21,21 +21,21 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/CL/ICLKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute @@ -316,9 +316,11 @@ void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const CLCompileContext if(act_info.enabled()) { - const int a_val = quantize_qasymm8(act_info.a(), oq_info); - const int b_val = quantize_qasymm8(act_info.b(), oq_info); - const int o1 = oq_info.offset; + int a_val{}; + int b_val{}; + std::tie(b_val, a_val) = get_quantized_activation_min_max(act_info, input->info()->data_type(), oq_info); + + const int o1 = oq_info.offset; build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val)); build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val)); diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h rename to src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h index 93e7e374b0..45b5869676 100644 --- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNCHWKERNEL3x3_H #define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNCHWKERNEL3x3_H -#include "arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h" +#include "src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp index 0930fee712..f553fd1849 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp @@ -21,21 +21,21 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
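In the NCHW depthwise hunk above, two hand-rolled quantize_qasymm8 calls are replaced by a single get_quantized_activation_min_max, whose (min, max) result is unpacked into (b_val, a_val). A hedged sketch of the assumed mapping for a bounded ReLU on QASYMM8 output (the real helper also covers other quantized types; bounded_relu_min_max below is an illustrative name, not the library function):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <utility>

struct UniformQuantizationInfo
{
    float   scale;
    int32_t offset;
};

// Standard asymmetric 8-bit quantization: q = round(x / scale) + offset,
// clamped to the uint8 range.
int32_t quantize_qasymm8(float value, const UniformQuantizationInfo &qinfo)
{
    const auto q = static_cast<int32_t>(std::lround(value / qinfo.scale)) + qinfo.offset;
    return std::min(std::max(q, 0), 255);
}

// For a lower/upper bounded ReLU the float clamp window [b, a] maps to the
// quantized pair (min, max) = (Q(b), Q(a)), which is exactly what
// std::tie(b_val, a_val) unpacks in the hunk above.
std::pair<int32_t, int32_t> bounded_relu_min_max(float a, float b, const UniformQuantizationInfo &oq_info)
{
    return { quantize_qasymm8(b, oq_info), quantize_qasymm8(a, oq_info) };
}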
*/ -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/CL/ICLKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute @@ -124,37 +124,33 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation, ITensorInfo *output_multipliers, ITensorInfo *output_shifts) { - const size_t weights_width = 3; - const size_t weights_height = 3; - - // Get convolved dimensions - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape( - *input, TensorInfo(TensorShape(weights_width, weights_height), 1, weights->data_type()).set_data_layout(DataLayout::NCHW), conv_info, depth_multiplier, dilation); + ARM_COMPUTE_UNUSED(weights); + ARM_COMPUTE_UNUSED(depth_multiplier); - auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_quantization_info(output->quantization_info())); + const bool is_stride_1_dilation_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1) && dilation.x() == 1 && dilation.y() == 1); + unsigned int num_rows_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1; - const bool is_qasymm = is_data_type_quantized_asymmetric(input->data_type()); - const bool is_stride_1_dilation_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1) && dilation.x() == 1 && dilation.y() == 1); + Window win{}; + Status err{}; - const unsigned int num_rows_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1; - const unsigned int num_elems_accessed_per_iteration = is_qasymm ? 
4 : (8 / input->element_size()); - const unsigned int num_rows_read_per_iteration = num_rows_processed_per_iteration + 2; - const unsigned int num_rows_written_per_iteration = std::ceil(num_rows_processed_per_iteration / static_cast(conv_info.stride().first)); + if(is_data_type_quantized_asymmetric(input->data_type())) + { + const unsigned int num_elems_accessed_per_iteration = 4; + const unsigned int num_rows_read_per_iteration = num_rows_processed_per_iteration + 2; + const unsigned int num_rows_written_per_iteration = std::ceil(num_rows_processed_per_iteration / static_cast(conv_info.stride().first)); - BorderSize border_size; - border_size = BorderSize(conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0); + BorderSize border_size; + border_size = BorderSize(conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0); - // Configure kernel window - Window win = calculate_max_window(*output, Steps(num_elems_accessed_per_iteration, num_rows_written_per_iteration)); + // Configure kernel window + win = calculate_max_window(*output, Steps(num_elems_accessed_per_iteration, num_rows_written_per_iteration)); - AccessWindowStatic input_access(input, 0, -border_size.top, ceil_to_multiple(input->dimension(0), num_elems_accessed_per_iteration), - ceil_to_multiple(input->dimension(1) + border_size.bottom, num_rows_read_per_iteration)); - AccessWindowRectangle output_access(output, 0, 0, num_elems_accessed_per_iteration, num_rows_written_per_iteration); + AccessWindowStatic input_access(input, 0, -border_size.top, ceil_to_multiple(input->dimension(0), num_elems_accessed_per_iteration), + ceil_to_multiple(input->dimension(1) + border_size.bottom, num_rows_read_per_iteration)); + AccessWindowRectangle output_access(output, 0, 0, num_elems_accessed_per_iteration, num_rows_written_per_iteration); - bool window_changed = false; + bool window_changed = false; - if(is_qasymm) - { if((output_multipliers != nullptr) && (output_shifts != nullptr)) { AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_accessed_per_iteration); @@ -166,27 +162,28 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen Status err = ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "output_multipliers and output_shifts must be non-nullptr for quantized input"); return std::make_pair(err, win); } + + if(bias != nullptr) + { + AccessWindowHorizontal bias_access(bias, 0, num_elems_accessed_per_iteration); + window_changed = window_changed || update_window_and_padding(win, bias_access); + } + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + + err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; } else { - AccessWindowStatic weights_access(weights, 0, 0, ceil_to_multiple(weights->dimension(0), num_elems_accessed_per_iteration), weights->dimension(1)); - window_changed = update_window_and_padding(win, input_access, weights_access, output_access); - } - - if(bias != nullptr) - { - AccessWindowHorizontal bias_access(bias, 0, num_elems_accessed_per_iteration); - window_changed = window_changed || update_window_and_padding(win, bias_access); + unsigned int num_elems_accessed_per_iteration = adjust_vec_size(4 / input->element_size(), input->dimension(0)); + win = calculate_max_window(*output, Steps(num_elems_accessed_per_iteration, num_rows_processed_per_iteration)); } - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace CLDepthwiseConvolutionLayer3x3NHWCKernel::CLDepthwiseConvolutionLayer3x3NHWCKernel() - : _num_rows_processed_per_iteration(1), _num_planes_processed_per_iteration(1) + : _num_planes_processed_per_iteration(1) { } @@ -211,15 +208,16 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const CLCompileContext conv_info, depth_multiplier, act_info, dilation, (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr)); + + auto padding_info = get_padding_info({ input, weights, biases, output }); + auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, dilation, (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - - const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1)); - const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1); + const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1)); + const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1); const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type()); const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()) && !is_quantized_per_channel; @@ -228,31 +226,37 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const CLCompileContext _weights = weights; _biases = biases; _conv_stride_y = conv_info.stride().second; - _num_rows_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1; _num_planes_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1; _output_multipliers = output_multipliers; _output_shifts = output_shifts; _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); - // If QASYMM8 and the 8 bit dot product is available, force _num_planes_processed_per_iteration to 1 - if(is_dot8_supported && _is_quantized) + if(_is_quantized) { - _num_planes_processed_per_iteration = 1; - } + _border_size = BorderSize(is_stride_1 ? 0 : conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0); - _border_size = BorderSize(_is_quantized && is_stride_1 ? 
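The rewritten validate_and_configure_window drops the AccessWindow bookkeeping for the non-quantized path entirely: the window step on dimension 0 is clamped to the tensor with adjust_vec_size, and the tail is handled inside the OpenCL kernel via PARTIAL_STORE_N0 / VEC_SIZE_LEFTOVER rather than by padding the tensor. The arithmetic, on assumed example numbers:

    // Illustrative arithmetic only: how a maximal window splits dimension 0
    // into vector-sized steps, with the leftover masked in-kernel.
    #include <cstdio>

    int main()
    {
        const unsigned int dim0     = 21; // e.g. 21 channels in NHWC (assumed)
        const unsigned int vec_size = 4;  // elements processed per work-item
        const unsigned int leftover = dim0 % vec_size;                           // -> 1
        const unsigned int end      = ((dim0 + vec_size - 1) / vec_size) * vec_size; // ceil_to_multiple -> 24
        std::printf("window x: [0, %u) step %u, leftover %u\n", end, vec_size, leftover);
        return 0;
    }

The last iteration covers elements 20..23, and the kernel's partial store writes only the one valid element instead of relying on implicit output padding.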
 
 CLDepthwiseConvolutionLayer3x3NHWCKernel::CLDepthwiseConvolutionLayer3x3NHWCKernel()
-    : _num_rows_processed_per_iteration(1), _num_planes_processed_per_iteration(1)
+    : _num_planes_processed_per_iteration(1)
 {
 }
 
@@ -211,15 +208,16 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const CLCompileContext
                                                               conv_info, depth_multiplier, act_info, dilation,
                                                               (output_multipliers != nullptr) ? output_multipliers->info() : nullptr,
                                                               (output_shifts != nullptr) ? output_shifts->info() : nullptr));
+
+    auto padding_info = get_padding_info({ input, weights, biases, output });
+
     auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, dilation,
                                                     (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
-    const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
-    const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
+    const bool is_stride_1            = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+    const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
 
     const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type());
     const bool is_dot8_supported        = dot8_supported(CLKernelLibrary::get().get_device()) && !is_quantized_per_channel;
@@ -228,31 +226,37 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const CLCompileContext
     _weights                            = weights;
     _biases                             = biases;
     _conv_stride_y                      = conv_info.stride().second;
-    _num_rows_processed_per_iteration   = is_stride_1_dilation_1 ? 2 : 1;
     _num_planes_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1;
     _output_multipliers                 = output_multipliers;
     _output_shifts                      = output_shifts;
     _is_quantized                       = is_data_type_quantized_asymmetric(input->info()->data_type());
 
-    // If QASYMM8 and the 8 bit dot product is available, force _num_planes_processed_per_iteration to 1
-    if(is_dot8_supported && _is_quantized)
+    if(_is_quantized)
     {
-        _num_planes_processed_per_iteration = 1;
-    }
+        _border_size = BorderSize(is_stride_1 ? 0 : conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0);
 
-    _border_size = BorderSize(_is_quantized && is_stride_1 ? 0 : conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0);
+        // If QASYMM8 and the 8 bit dot product is available, force _num_planes_processed_per_iteration to 1
+        if(is_dot8_supported)
+        {
+            _num_planes_processed_per_iteration = 1;
+        }
+    }
 
-    const unsigned int num_elems_accessed_per_iteration = _is_quantized ? 4 : (8 / input->info()->element_size());
+    unsigned int num_elems_accessed_per_iteration = _is_quantized ? 4 : adjust_vec_size(4 / input->info()->element_size(), input->info()->dimension(0));
+    unsigned int num_rows_processed_per_iteration = is_stride_1_dilation_1 ? 2 : 1;
 
     CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
     build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
-    build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_accessed_per_iteration));
+    build_opts.add_option("-DSRC_DIM_1=" + support::cpp11::to_string(_input->info()->dimension(1)));
     build_opts.add_option("-DSRC_DIM_2=" + support::cpp11::to_string(_input->info()->dimension(2)));
     build_opts.add_option("-DCONV_PAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
     build_opts.add_option("-DCONV_PAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
-    build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
-    build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
+    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_accessed_per_iteration));
+    build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
+    build_opts.add_option_if(_input->info()->tensor_shape().total_size_upper(3) > 1,
+                             "-DDST_DEPTH=" + support::cpp11::to_string(static_cast<int>(std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)))));
 
     if(_is_quantized)
     {
@@ -278,9 +282,11 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const CLCompileContext
 
         if(act_info.enabled())
         {
-            const int a_val = quantize_qasymm8(act_info.a(), oq_info);
-            const int b_val = quantize_qasymm8(act_info.b(), oq_info);
-            const int o1    = oq_info.offset;
+            int a_val{};
+            int b_val{};
+            std::tie(b_val, a_val) = get_quantized_activation_min_max(act_info, input->info()->data_type(), oq_info);
+
+            const int o1 = oq_info.offset;
 
             build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
             build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
@@ -291,7 +297,6 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const CLCompileContext
             build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
         }
 
-        build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
         build_opts.add_option("-DWEIGHTS_TYPE=" + get_cl_type_from_data_type(weights->info()->data_type()));
         build_opts.add_option("-DWEIGHTS_PROMOTED_TYPE=" + get_cl_promoted_type_from_data_type(weights->info()->data_type()));
     }
@@ -299,22 +304,23 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const CLCompileContext
     {
         build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
         build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
-        build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
     }
 
     if(is_stride_1_dilation_1)
     {
-        build_opts.add_option("-DNUM_ROWS_PROCESSED=" + support::cpp11::to_string(_num_rows_processed_per_iteration));
+        build_opts.add_option("-DNUM_ROWS_PROCESSED=" + support::cpp11::to_string(num_rows_processed_per_iteration));
         build_opts.add_option("-DNUM_PLANES_PROCESSED=" + support::cpp11::to_string(_num_planes_processed_per_iteration));
+        build_opts.add_option("-DDST_DIM_1=" + support::cpp11::to_string(_output->info()->dimension(1)));
         build_opts.add_option("-DDST_DIM_2=" + support::cpp11::to_string(_output->info()->dimension(2)));
+        build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string((input->info()->dimension(1) + conv_info.pad_left() + conv_info.pad_right()) % num_rows_processed_per_iteration));
     }
     else
     {
         build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
         build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(_conv_stride_y));
+        build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
+        build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
     }
-    build_opts.add_option_if(_input->info()->tensor_shape().total_size_upper(3) > 1,
-                             "-DDST_DEPTH=" + support::cpp11::to_string(static_cast<int>(std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)))));
 
     std::string kernel_name;
     // Create kernel
@@ -331,12 +337,11 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const CLCompileContext
         kernel_name += (is_stride_1_dilation_1 ? "_stride1" : "");
     }
 
-    build_opts.add_option_if(input->info()->data_type() == DataType::F16, "-DIS_F16");
-    build_opts.add_option_if(input->info()->data_type() == DataType::F32, "-DIS_F32");
-
     ICLKernel::configure_internal(win_config.second);
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
 
+    ARM_COMPUTE_ERROR_ON(!_is_quantized && has_padding_changed(padding_info));
+
     // Set config_id for enabling LWS tuning
     _config_id = kernel_name;
     _config_id += "_";
@@ -364,7 +369,6 @@ Status CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(const ITensorInfo *inp
                                                               (output_multipliers != nullptr) ? output_multipliers->clone().get() : nullptr,
                                                               (output_shifts != nullptr) ? output_shifts->clone().get() : nullptr)
                                 .first);
-
     return Status{};
 }
 
@@ -373,23 +377,11 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::run(const Window &window, cl::Com
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-    // Collapse window
-    Window       window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-    const size_t total_batches    = _input->info()->tensor_shape().total_size_upper(3);
+    const size_t total_batches = _input->info()->tensor_shape().total_size_upper(3);
 
-    Window win = window_collapsed;
+    Window win = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
     win.set(Window::DimZ, Window::Dimension(0, std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)) * total_batches, 1));
 
-    // Create input window and adjust
-    Window win_in = win;
-    win_in.set_dimension_step(Window::DimY, _num_rows_processed_per_iteration);
-    win_in.set_dimension_step(Window::DimZ, _conv_stride_y);
-
-    ARM_COMPUTE_ERROR_ON((win_in.y().step() < window.y().step()) || (win_in.z().step() < window.z().step()));
-
-    Window slice_in  = win_in.first_slice_window_4D();
-    Window slice_out = win.first_slice_window_4D();
-
     unsigned int idx = 2 * num_arguments_per_4D_tensor() + (_is_quantized ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor());
 
     if(_is_quantized)
@@ -409,60 +401,64 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::run(const Window &window, cl::Com
         add_1D_tensor_argument(idx, _biases, win_biases);
     }
 
-    // Calculate the max_offset.
-    // max_offset is the offset for the last NOT valid value in the Z dimension (spatial dimension Y for NHWC)
-    // |******************|
-    // |     pad_top      |
-    // |******************|
-    // |                  |
-    // |      plane0      |
-    // |      batch0      |
-    // |__________________|
-    // |******************|       Batch 0
-    // |    pad_bottom    |
-    // |     pad_top      |
-    // |******************|
-    // |                  |
-    // |      plane1      |
-    // |      batch0      |
-    // |__________________|-----> max_offset
-    // |******************|
-    // |    pad_bottom    |
-    // |     pad_top      |
-    // |******************|
-    // |                  |
-    // |      plane0      |
-    // |      batch1      |
-    // |__________________|
-    // |******************|       Batch 1
-    // |    pad_bottom    |
-    // |     pad_top      |
-    // |******************|
-    // |                  |
-    // |      plane1      |
-    // |      batch1      |
-    // |__________________|
-    // |    pad_bottom    |
-    // |******************|
-    const int max_offset = _input->info()->strides_in_bytes().z() * _input->info()->dimension(2) - (_input->info()->padding().bottom + _input->info()->padding().top) *
-                           _input->info()->strides_in_bytes().y();
-    _kernel.setArg(idx, max_offset);
+    if(_is_quantized)
+    {
+        // Calculate the max_offset.
+        // max_offset is the offset for the last NOT valid value in the Z dimension (spatial dimension Y for NHWC)
+        // |******************|
+        // |     pad_top      |
+        // |******************|
+        // |                  |
+        // |      plane0      |
+        // |      batch0      |
+        // |__________________|
+        // |******************|       Batch 0
+        // |    pad_bottom    |
+        // |     pad_top      |
+        // |******************|
+        // |                  |
+        // |      plane1      |
+        // |      batch0      |
+        // |__________________|-----> max_offset
+        // |******************|
+        // |    pad_bottom    |
+        // |     pad_top      |
+        // |******************|
+        // |                  |
+        // |      plane0      |
+        // |      batch1      |
+        // |__________________|
+        // |******************|       Batch 1
+        // |    pad_bottom    |
+        // |     pad_top      |
+        // |******************|
+        // |                  |
+        // |      plane1      |
+        // |      batch1      |
+        // |__________________|
+        // |    pad_bottom    |
+        // |******************|
+        const int max_offset = _input->info()->strides_in_bytes().z() * _input->info()->dimension(2) - (_input->info()->padding().bottom + _input->info()->padding().top) *
+                               _input->info()->strides_in_bytes().y();
+        _kernel.setArg(idx, max_offset);
+    }
 
+    Window slice = win.first_slice_window_4D();
     do
     {
        unsigned int idx = 0;
-        add_4D_tensor_argument(idx, _input, slice_in);
-        add_4D_tensor_argument(idx, _output, slice_out);
+        add_4D_tensor_argument(idx, _input, slice);
+        add_4D_tensor_argument(idx, _output, slice);
        if(_is_quantized)
        {
-            add_2D_tensor_argument(idx, _weights, slice_out);
+            add_2D_tensor_argument(idx, _weights, slice);
        }
        else
        {
-            add_3D_tensor_argument(idx, _weights, slice_out);
+            add_3D_tensor_argument(idx, _weights, slice);
        }
-        enqueue(queue, *this, slice_out, lws_hint());
+        enqueue(queue, *this, slice, lws_hint());
     }
-    while(win.slide_window_slice_4D(slice_out) && win_in.slide_window_slice_4D(slice_in));
+    while(win.slide_window_slice_4D(slice));
 }
 } // namespace arm_compute
diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h
similarity index 98%
rename from arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h
rename to src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h
index 4ca6c0bf4a..ce0bf5ceb3 100644
--- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h
@@ -24,7 +24,7 @@
 #ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNHWCKERNEL3x3_H
 #define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNHWCKERNEL3x3_H
 
-#include "arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h"
+#include "src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h"
 
 namespace arm_compute
 {
@@ -107,7 +107,6 @@ class CLDepthwiseConvolutionLayer3x3NHWCKernel : public ICLDepthwiseConvolutionL
     BorderSize border_size() const override;
 
 private:
-    unsigned int _num_rows_processed_per_iteration;
     unsigned int _num_planes_processed_per_iteration;
 };
 } // namespace arm_compute
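adjust_vec_size shows up in both depthwise kernels and again below in the element-wise kernels: its job is to pick the largest usable vector width that is no bigger than the innermost dimension, so the window step never overruns the tensor. A plausible reimplementation, hedged; consult the real helper under src/core/helpers for the authoritative behaviour:

    // Sketch: shrink the requested OpenCL vector width until it fits dim0.
    // (Assumed semantics; the library's helper may special-case widths such as 3.)
    #include <cstddef>

    unsigned int adjust_vec_size(unsigned int vec_size, std::size_t dim0)
    {
        // Halve the width until it no longer exceeds the innermost dimension,
        // keeping at least one element per iteration.
        while(vec_size > dim0 && vec_size > 1)
        {
            vec_size /= 2;
        }
        return vec_size;
    }

For example, with a requested n0 of 4 and a 3-channel tensor this yields 2, and the matching -DVEC_SIZE_LEFTOVER / -DPARTIAL_STORE_N0 build option becomes 3 % 2 = 1.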
*/ -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/CL/ICLKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute @@ -124,60 +123,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, return Status{}; } - -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *output, const DWCWeightsKernelInfo &dwc_weights_info, - const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation, - ITensorInfo *output_multipliers, ITensorInfo *output_shifts) -{ - ARM_COMPUTE_UNUSED(dwc_info); - - // Get convolved dimensions - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); - - auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_quantization_info(output->quantization_info())); - - const unsigned int n0 = dwc_weights_info.n0; - - // Configure kernel window - Window win = calculate_max_window(*output, Steps(n0)); - - // The following access windows are only valid in case of NHWC and because n0 must unit in case depth_multiplier > 1 - AccessWindowHorizontal input_access(input, 0, n0); - AccessWindowHorizontal weights_access(weights, 0, n0); - AccessWindowHorizontal output_access(output, 0, n0); - - bool window_changed = false; - - if(bias != nullptr) - { - AccessWindowHorizontal bias_access(bias, 0, n0); - window_changed = update_window_and_padding(win, input_access, weights_access, bias_access, output_access); - } - else - { - window_changed = update_window_and_padding(win, input_access, weights_access, output_access); - } - - if(is_data_type_quantized(input->data_type())) - { - if((output_multipliers != nullptr) && (output_shifts != nullptr)) - { - AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, n0); - AccessWindowHorizontal output_shifts_access(output_shifts, 0, n0); - window_changed = window_changed || update_window_and_padding(win, output_multipliers_access, output_shifts_access); - } - else - { - Status err = ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "output_multipliers and output_shifts must be non-nullptr for quantized input"); - return std::make_pair(err, win); - } - } - - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} } // namespace CLDepthwiseConvolutionLayerNativeKernel::CLDepthwiseConvolutionLayerNativeKernel() @@ -209,10 +154,10 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr)); - auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), - dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, - (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + auto padding_info = get_padding_info({ input, output }); + + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*(input->info()), *(weights->info()), conv_info, depth_multiplier, dilation); + auto_init_if_empty(*(output->info()), input->info()->clone()->set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info())); _input = input; _output = output; @@ -223,10 +168,7 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & _output_shifts = output_shifts; _is_quantized = is_data_type_quantized(input->info()->data_type()); - const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); - const size_t weights_width = weights->info()->dimension(idx_w); - const size_t weights_height = weights->info()->dimension(idx_h); + const unsigned int n0 = adjust_vec_size(dwc_weights_info.n0, input->info()->dimension(0)); CLBuildOptions build_opts; build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS"); @@ -234,17 +176,18 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type())); build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(dwc_info.activation_info.activation()))); build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(depth_multiplier)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(dwc_weights_info.n0)); + build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); build_opts.add_option("-DSRC_DIM1=" + support::cpp11::to_string(_input->info()->dimension(1))); build_opts.add_option("-DSRC_DIM2=" + support::cpp11::to_string(_input->info()->dimension(2))); - build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(weights_width)); - build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(weights_height)); + build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(weights->info()->dimension(1))); + build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(weights->info()->dimension(2))); build_opts.add_option("-DCONV_PAD_TOP=" + support::cpp11::to_string(conv_info.pad_top())); build_opts.add_option("-DCONV_PAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left())); build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(conv_info.stride().first)); 
build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second)); build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x())); build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y())); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(_input->info()->dimension(0) % n0)); std::string kernel_name = (_is_quantized) ? "dwc_MxN_native_quantized8_nhwc" : "dwc_MxN_native_fp_nhwc"; @@ -269,9 +212,11 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & if(dwc_info.activation_info.enabled()) { - const int a_val = quantize_qasymm8(dwc_info.activation_info.a(), oq_info); - const int b_val = quantize_qasymm8(dwc_info.activation_info.b(), oq_info); - const int o1 = oq_info.offset; + int a_val{}; + int b_val{}; + std::tie(b_val, a_val) = get_quantized_activation_min_max(dwc_info.activation_info, input->info()->data_type(), oq_info); + + const int o1 = oq_info.offset; build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val)); build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val)); @@ -291,9 +236,13 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & build_opts.add_option_if(dwc_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(dwc_info.activation_info.b())); } - ICLKernel::configure_internal(win_config.second); + Window win = calculate_max_window(*(output->info()), Steps(n0)); + ICLKernel::configure_internal(win); + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); + // Set config_id for enabling LWS tuning _config_id = kernel_name; _config_id += "_"; @@ -317,13 +266,6 @@ Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *inpu unsigned int depth_multiplier, const Size2D &dilation, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, output_multipliers, output_shifts)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), - biases != nullptr ? biases->clone().get() : nullptr, - output->clone().get(), dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, - output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, - output_shifts != nullptr ? 
output_shifts->clone().get() : nullptr) - .first); - return Status{}; } diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h rename to src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h index 03a0106cc9..325f4e7067 100644 --- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H #define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include "arm_compute/core/KernelDescriptors.h" diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp index 07f25a80cf..b10c23bde9 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp @@ -21,21 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/CL/ICLKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h rename to src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h index 51aaf17600..650fe9a11b 100644 --- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERRESHAPEWEIGHTSKERNEL_H #define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERRESHAPEWEIGHTSKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp index 72eac858ad..3723c651fe 100644 --- a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp +++ b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp @@ -21,17 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
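With the window helper gone, the native kernel's configure() above derives the output shape and auto-initialises the output descriptor itself. The auto_init_if_empty idiom only fills in a TensorInfo whose total size is still zero, so a shape the caller set explicitly is never clobbered. A stripped-down sketch of the idea, with a simplified stand-in type:

    // Sketch: initialise the output descriptor only if the caller left it
    // empty. Simplified type, not the library's TensorInfo.
    #include <cstddef>
    #include <vector>

    struct SimpleTensorInfo
    {
        std::vector<std::size_t> shape; // empty => uninitialised

        std::size_t total_size() const
        {
            if(shape.empty())
            {
                return 0;
            }
            std::size_t n = 1;
            for(std::size_t d : shape)
            {
                n *= d;
            }
            return n;
        }
    };

    void auto_init_if_empty(SimpleTensorInfo &info, const std::vector<std::size_t> &computed_shape)
    {
        if(info.total_size() == 0)
        {
            info.shape = computed_shape; // adopt the convolved shape
        }
    }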
*/ -#include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h" +#include "src/core/CL/kernels/CLDequantizationLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute @@ -52,22 +53,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) return Status{}; } - -std::tuple validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) -{ - // Configure kernel window - Window win = calculate_max_window(*input, Steps()); - - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32); - - // CLDequantizationLayerKernel doesn't need padding so update_window_and_padding() can be skipped - Coordinates coord; - coord.set_num_dimensions(output->num_dimensions()); - output->set_valid_region(ValidRegion(coord, output->tensor_shape())); - - return std::make_tuple(Status{}, win); -} } // namespace CLDequantizationLayerKernel::CLDequantizationLayerKernel() @@ -83,6 +68,12 @@ void CLDequantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *o void CLDequantizationLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, DataType::F32); + + auto padding_info = get_padding_info({ input, output }); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); _input = input; @@ -92,15 +83,6 @@ void CLDequantizationLayerKernel::configure(const CLCompileContext &compile_cont const int output_width_x = output->info()->tensor_shape().x(); const bool multi_access_x = (output_width_x / vec_size_x > 0); - // Create and update the window (if needed) - Window win = calculate_max_window(*output->info()); - if(multi_access_x) - { - win.set(Window::DimX, - Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); - } - ICLKernel::configure_internal(win); - const bool is_quantized_per_channel = is_data_type_quantized_per_channel(input->info()->data_type()); std::string kernel_name = "dequantization_layer"; @@ -126,12 +108,25 @@ void CLDequantizationLayerKernel::configure(const CLCompileContext &compile_cont // Create kernel name _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Configure kernel window + Window win = calculate_max_window(*output->info()); + if(multi_access_x) + { + win.set(Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); + } + ICLKernel::configure_internal(win); + + // Set output valid region + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLDequantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo 
diff --git a/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h b/src/core/CL/kernels/CLDequantizationLayerKernel.h
similarity index 98%
rename from arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h
rename to src/core/CL/kernels/CLDequantizationLayerKernel.h
index 7a582da132..5579b5bc71 100644
--- a/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h
+++ b/src/core/CL/kernels/CLDequantizationLayerKernel.h
@@ -24,7 +24,7 @@
 #ifndef ARM_COMPUTE_CLDEQUANTIZATIONLAYERKERNEL_H
 #define ARM_COMPUTE_CLDEQUANTIZATIONLAYERKERNEL_H
 
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
 {
diff --git a/src/core/CL/kernels/CLDerivativeKernel.cpp b/src/core/CL/kernels/CLDerivativeKernel.cpp
index ab5f9dab76..5ff11362cc 100644
--- a/src/core/CL/kernels/CLDerivativeKernel.cpp
+++ b/src/core/CL/kernels/CLDerivativeKernel.cpp
@@ -21,16 +21,14 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h"
+#include "src/core/CL/kernels/CLDerivativeKernel.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 #include <set>
diff --git a/arm_compute/core/CL/kernels/CLDerivativeKernel.h b/src/core/CL/kernels/CLDerivativeKernel.h
similarity index 98%
rename from arm_compute/core/CL/kernels/CLDerivativeKernel.h
rename to src/core/CL/kernels/CLDerivativeKernel.h
index b49905a5e6..14dd05d084 100644
--- a/arm_compute/core/CL/kernels/CLDerivativeKernel.h
+++ b/src/core/CL/kernels/CLDerivativeKernel.h
@@ -24,7 +24,7 @@
 #ifndef ARM_COMPUTE_CLDERIVATIVEKERNEL_H
 #define ARM_COMPUTE_CLDERIVATIVEKERNEL_H
 
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
 {
*/ -#include "arm_compute/core/CL/kernels/CLDilateKernel.h" +#include "src/core/CL/kernels/CLDilateKernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/WindowHelpers.h" using namespace arm_compute; diff --git a/arm_compute/core/CL/kernels/CLDilateKernel.h b/src/core/CL/kernels/CLDilateKernel.h similarity index 97% rename from arm_compute/core/CL/kernels/CLDilateKernel.h rename to src/core/CL/kernels/CLDilateKernel.h index 747f8fa5ca..591ec8ccfc 100644 --- a/arm_compute/core/CL/kernels/CLDilateKernel.h +++ b/src/core/CL/kernels/CLDilateKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLDILATEKERNEL_H #define ARM_COMPUTE_CLDILATEKERNEL_H -#include "arm_compute/core/CL/ICLSimple2DKernel.h" +#include "src/core/CL/ICLSimple2DKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp index d5d808a80f..a642eabc4e 100644 --- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp +++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp @@ -21,21 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h" +#include "src/core/CL/kernels/CLDirectConvolutionLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute diff --git a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h rename to src/core/CL/kernels/CLDirectConvolutionLayerKernel.h index 5281a0c306..5cd674f631 100644 --- a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h +++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYERKERNEL_H #define ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp b/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp index c8c7fb03b8..0a641adcd2 100644 --- a/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp +++ b/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp @@ -21,28 +21,43 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h" +#include "src/core/CL/kernels/CLElementWiseUnaryLayerKernel.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/utils/misc/Cast.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" #include "support/StringSupport.h" namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo &input, const ITensorInfo &output) +Status validate_arguments(const ITensorInfo &input, const ITensorInfo &output, const ElementWiseUnary op) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32); + if(op == ElementWiseUnary::LOGICAL_NOT) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::U8); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32); + } // Validate in case of configured output if(output.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::F16, DataType::F32); + if(op == ElementWiseUnary::LOGICAL_NOT) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::U8); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::F16, DataType::F32); + } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input, &output); } @@ -59,21 +74,16 @@ void CLElementWiseUnaryLayerKernel::configure(const ITensorInfo *input, ITensorI void CLElementWiseUnaryLayerKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const ElementWiseUnary &op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input, *output)); + + auto padding_info = get_padding_info({ input, output }); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input, *output, op)); const std::string kernel_name = "elementwise_unary"; const int vec_size_x = 16 / output->element_size(); const int output_width_x = output->tensor_shape().x(); const bool multi_access_x = (output_width_x / vec_size_x > 0); - Window win = calculate_max_window(*output); - if(multi_access_x) - { - win.set(Window::DimX, - Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); - } - ICLKernel::configure_internal(win); - // Set kernel build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->data_type())); @@ -102,19 +112,33 @@ void CLElementWiseUnaryLayerKernel::configure(const CLCompileContext &compile_co case ElementWiseUnary::ROUND: build_opts.add_option("-DOPERATION=round_op"); break; + case ElementWiseUnary::LOGICAL_NOT: + build_opts.add_option("-DOPERATION=logical_not_op"); + break; default: ARM_COMPUTE_ERROR("Not implemented"); } // Create kernel _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Configure kernel window + Window win = calculate_max_window(*output); + if(multi_access_x) + { + win.set(Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); + } + ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } 
Status CLElementWiseUnaryLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ElementWiseUnary &op) { ARM_COMPUTE_UNUSED(op); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output, op)); return Status{}; } diff --git a/arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h b/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.h similarity index 97% rename from arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h rename to src/core/CL/kernels/CLElementWiseUnaryLayerKernel.h index 82cd953b68..95b5872796 100644 --- a/arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h +++ b/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.h @@ -24,9 +24,9 @@ #ifndef ARM_COMPUTE_CLELEMENTWISEUNARYLAYERKERNEL_H #define ARM_COMPUTE_CLELEMENTWISEUNARYLAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/CL/ICLSimpleKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" +#include "src/core/CL/ICLSimpleKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp index ec33500f20..47439e15ab 100644 --- a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp +++ b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp @@ -21,12 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h" +#include "src/core/CL/kernels/CLElementwiseOperationKernel.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/utils/misc/Cast.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/common/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" #include "support/StringSupport.h" #include @@ -34,7 +37,7 @@ namespace arm_compute { namespace { -constexpr unsigned int num_elems_processed_per_iteration = 16; +constexpr unsigned int vector_size_byte_opencl = 16; std::map supported_arithmetic_ops = { @@ -150,10 +153,15 @@ CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &i { CLBuildOptions build_opts; + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / output.element_size(), output.dimension(0)); + build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1.data_type())); build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2.data_type())); build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output.data_type())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_IN1=" + support::cpp11::to_string(input1.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_IN2=" + support::cpp11::to_string(input2.dimension(0) == 1 ? 
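The new LOGICAL_NOT branch is what the CLLogicalNot runtime function added by this patch sits on top of. A hedged usage sketch; the names follow the public runtime API this patch adds, but argument details should be checked against the released headers:

    // Hypothetical usage sketch of the new logical-not function on U8 tensors.
    #include "arm_compute/runtime/CL/CLFunctions.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"

    using namespace arm_compute;

    void logical_not_example()
    {
        CLScheduler::get().default_init();

        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::U8));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::U8));

        CLLogicalNot logical_not;
        logical_not.configure(&src, &dst); // validate() mirrors this signature

        src.allocator()->allocate();
        dst.allocator()->allocate();

        logical_not.run();
        CLScheduler::get().sync();
    }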
diff --git a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
index ec33500f20..47439e15ab 100644
--- a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
+++ b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
@@ -21,12 +21,15 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
+#include "src/core/CL/kernels/CLElementwiseOperationKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/utils/misc/Cast.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/common/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
 #include "support/StringSupport.h"
 
 #include <map>
@@ -34,7 +37,7 @@
 namespace arm_compute
 {
 namespace
 {
-constexpr unsigned int num_elems_processed_per_iteration = 16;
+constexpr unsigned int vector_size_byte_opencl = 16;
 
 std::map<std::string, std::string> supported_arithmetic_ops =
 {
@@ -150,10 +153,15 @@ CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &i
 {
     CLBuildOptions build_opts;
 
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / output.element_size(), output.dimension(0));
+
     build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1.data_type()));
     build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2.data_type()));
     build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output.data_type()));
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_IN1=" + support::cpp11::to_string(input1.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_IN2=" + support::cpp11::to_string(input2.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(output.dimension(0) % num_elems_processed_per_iteration));
     build_opts.add_option("-DOP=" + operation_string);
     if(is_data_type_quantized(input1.data_type()))
     {
@@ -171,31 +179,17 @@ CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &i
     return build_opts;
 }
 
-std::pair<Status, Window> configure_window_arithmetic_common(const ValidRegion &valid_region, ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
+std::pair<Status, Window> configure_window_arithmetic_common(ITensorInfo &output)
 {
-    Window win        = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
-    Window win_input1 = win.broadcast_if_dimension_le_one(input1);
-    Window win_input2 = win.broadcast_if_dimension_le_one(input2);
-
-    AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
-
-    bool window_changed = update_window_and_padding(win_input1, input1_access)
-                          || update_window_and_padding(win_input2, input2_access)
-                          || update_window_and_padding(win, output_access);
-
-    output_access.set_valid_region(win, valid_region);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / output.element_size(), output.dimension(0));
+    Window             win                               = calculate_max_window(output, Steps(num_elems_processed_per_iteration));
+    return std::make_pair(Status{}, win);
 }
 
 std::pair<Status, Window> validate_and_configure_window_for_arithmetic_operators(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
 {
     const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
-    const TensorShape &out_shape    = broadcast_pair.first;
-    const ValidRegion &valid_region = broadcast_pair.second;
+    const TensorShape &out_shape = broadcast_pair.first;
 
     set_shape_if_empty(output, out_shape);
 
@@ -224,16 +218,27 @@ std::pair<Status, Window> validate_and_configure_window_for_arithmetic_operators
         set_data_type_if_unknown(output, DataType::QSYMM16);
     }
 
-    return configure_window_arithmetic_common(valid_region, input1, input2, output);
+    return configure_window_arithmetic_common(output);
+}
+
+std::pair<Status, Window> validate_and_configure_window_for_logical_binary_operators(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
+{
+    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
+    const TensorShape &out_shape = broadcast_pair.first;
+
+    set_shape_if_empty(output, out_shape);
+    set_data_type_if_unknown(output, DataType::U8);
+
+    // The arithmetic utility functions can be shared
+    return configure_window_arithmetic_common(output);
 }
 
 std::pair<Status, Window> validate_and_configure_window_for_division(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
 {
     const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
-    const TensorShape &out_shape    = broadcast_pair.first;
-    const ValidRegion &valid_region = broadcast_pair.second;
+    const TensorShape &out_shape = broadcast_pair.first;
 
     auto_init_if_empty(output, out_shape, 1, input1.data_type());
-    return configure_window_arithmetic_common(valid_region, input1, input2, output);
+    return configure_window_arithmetic_common(output);
 }
 } // namespace
@@ -313,28 +318,74 @@ void CLElementwiseOperationKernel::run_op(ITensorPack &tensors, const Window &wi
 
     Window slice        = collapsed.first_slice_window_3D();
     Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
     Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
     do
     {
         unsigned int idx = 0;
-
         add_3D_tensor_argument(idx, src_0, slice_input1);
         add_3D_tensor_argument(idx, src_1, slice_input2);
         add_3D_tensor_argument(idx, dst, slice);
 
         enqueue(queue, *this, slice, lws_hint());
-
         ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
         ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
     }
     while(collapsed.slide_window_slice_3D(slice));
 }
 
-BorderSize CLElementwiseOperationKernel::border_size() const
+/** Logical binary */
+void CLLogicalBinaryKernel::configure(const CLCompileContext &compile_context, kernels::LogicalOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_ERROR_THROW_ON(CLLogicalBinaryKernel::validate(op, input1, input2, output));
+    _op = op;
+    configure_common(compile_context, input1, input2, output);
+}
+
+Status CLLogicalBinaryKernel::validate(kernels::LogicalOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    ARM_COMPUTE_UNUSED(op);
+    ARM_COMPUTE_ASSERT(op != kernels::LogicalOperation::Unknown && op != kernels::LogicalOperation::Not);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*input1, *input2, *output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_logical_binary_operators(*input1->clone(), *input2->clone(), *output->clone()).first);
+
+    return Status{};
+}
+
+std::string CLLogicalBinaryKernel::name()
 {
-    const unsigned int replicateSize = _output->dimension(0) - std::min(_input1->dimension(0), _input2->dimension(0));
-    const unsigned int border        = std::min(num_elems_processed_per_iteration - 1U, replicateSize);
-    return BorderSize{ 0, border, 0, 0 };
+    switch(_op)
+    {
+        case kernels::LogicalOperation::And:
+            return "AND";
+        case kernels::LogicalOperation::Or:
+            return "OR";
+        case kernels::LogicalOperation::Not:
+        /* fall through */
+        default:
+            ARM_COMPUTE_ASSERT(false);
+    }
+    return "";
+}
+
+std::pair<Status, Window> CLLogicalBinaryKernel::validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
+{
+    return validate_and_configure_window_for_logical_binary_operators(input1, input2, output);
+}
+
+CLBuildOptions CLLogicalBinaryKernel::generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+{
+    // The arithmetic utility functions can be shared
+    return generate_build_options_with_arithmetic_rules(input1, input2, output, name());
+}
+
+std::string CLLogicalBinaryKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output)
+{
+    return generate_id_for_tuning_common(kernel_name, input1, output);
 }
 
 /** Arithmetic operations with saturation*/
@@ -351,11 +402,13 @@ void CLSaturatedArithmeticOperationKernel::configure(const CLCompileContext &com
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLSaturatedArithmeticOperationKernel::validate(op, input1, input2, output, policy, act_info));
+    auto padding_info = get_padding_info({ input1, input2, output });
 
     _policy   = policy;
     _op       = op;
     _act_info = act_info;
     configure_common(compile_context, input1, input2, output);
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
 Status CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy,
@@ -407,10 +460,12 @@ void CLArithmeticOperationKernel::configure(const CLCompileContext &compile_cont
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLArithmeticOperationKernel::validate(op, input1, input2, output, act_info));
+    auto padding_info = get_padding_info({ input1, input2, output });
 
     _op       = op;
     _act_info = act_info;
     configure_common(compile_context, input1, input2, output);
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
 Status CLArithmeticOperationKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
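The CLLogicalBinaryKernel declared in the header diff below is configured on ITensorInfo objects and driven through an ITensorPack, matching the run_op interface above. A hedged configuration sketch; the pack plumbing is elided and the compile-context accessor is an assumption to be checked against the release:

    // Hypothetical sketch: validate, then configure the logical AND kernel.
    TensorInfo a(TensorShape(8U, 4U), 1, DataType::U8);
    TensorInfo b(TensorShape(8U, 4U), 1, DataType::U8);
    TensorInfo out(TensorShape(8U, 4U), 1, DataType::U8);

    CLLogicalBinaryKernel kernel;
    const Status status = CLLogicalBinaryKernel::validate(kernels::LogicalOperation::And, &a, &b, &out);
    if(status.error_code() == ErrorCode::OK)
    {
        // get_compile_context() is assumed here; adapt to the actual accessor.
        kernel.configure(CLKernelLibrary::get().get_compile_context(), kernels::LogicalOperation::And, &a, &b, &out);
    }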
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_ERROR_THROW_ON(CLSaturatedArithmeticOperationKernel::validate(op, input1, input2, output, policy, act_info)); + auto padding_info = get_padding_info({ input1, input2, output }); _policy = policy; _op = op; _act_info = act_info; configure_common(compile_context, input1, input2, output); + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy, @@ -407,10 +460,12 @@ void CLArithmeticOperationKernel::configure(const CLCompileContext &compile_cont { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_ERROR_THROW_ON(CLArithmeticOperationKernel::validate(op, input1, input2, output, act_info)); + auto padding_info = get_padding_info({ input1, input2, output }); _op = op; _act_info = act_info; configure_common(compile_context, input1, input2, output); + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLArithmeticOperationKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) diff --git a/arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h b/src/core/CL/kernels/CLElementwiseOperationKernel.h similarity index 81% rename from arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h rename to src/core/CL/kernels/CLElementwiseOperationKernel.h index b459292161..e24d1564a8 100644 --- a/arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h +++ b/src/core/CL/kernels/CLElementwiseOperationKernel.h @@ -24,8 +24,9 @@ #ifndef ARM_COMPUTE_CLELEMENTWISEOPERATIONKERNEL_H #define ARM_COMPUTE_CLELEMENTWISEOPERATIONKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" +#include "src/core/KernelTypes.h" namespace arm_compute { @@ -55,7 +56,6 @@ class CLElementwiseOperationKernel : public ICLKernel // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; protected: /** The name of the operation */ @@ -100,6 +100,49 @@ class CLElementwiseOperationKernel : public ICLKernel ITensorInfo *_output; /**< Destination tensor info */ }; +class CLLogicalBinaryKernel : public CLElementwiseOperationKernel +{ +public: + /** Default constructor */ + CLLogicalBinaryKernel() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLLogicalBinaryKernel(const CLLogicalBinaryKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLLogicalBinaryKernel &operator=(const CLLogicalBinaryKernel &) = delete; + /** Allow instances of this class to be moved */ + CLLogicalBinaryKernel(CLLogicalBinaryKernel &&) = default; + /** Allow instances of this class to be moved */ + CLLogicalBinaryKernel &operator=(CLLogicalBinaryKernel &&) = default; + /** Default destructor */ + ~CLLogicalBinaryKernel() = default; + /** Function to configure kernel + * + * @param[in] compile_context The compile context to be used. + * @param[in] op Logical binary operation to be executed. + * @param[in] input1 First tensor input info. Data types supported: U8. + * @param[in] input2 Second tensor input info. Data types supported: U8. + * @param[in] output Output tensor info. 
Data types supported: U8. + */ + void configure(const CLCompileContext &compile_context, kernels::LogicalOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output); + /** Static function to check if the given configuration is valid for this kernel + * + * @param[in] op Logical binary operation to be executed. + * @param[in] input1 First tensor input info. Data types supported: U8. + * @param[in] input2 Second tensor input info. Data types supported: U8. + * @param[in] output Output tensor info. Data types supported: U8. + */ + static Status validate(kernels::LogicalOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output); + +private: + // Inherited methods overridden: + std::string name() override; + std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override; + CLBuildOptions generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override; + std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) override; + + kernels::LogicalOperation _op{ kernels::LogicalOperation::Unknown }; +}; + /** Addition operation */ class CLSaturatedArithmeticOperationKernel : public CLElementwiseOperationKernel { diff --git a/src/core/CL/kernels/CLErodeKernel.cpp b/src/core/CL/kernels/CLErodeKernel.cpp index a5eb79f73b..f6d98a5488 100644 --- a/src/core/CL/kernels/CLErodeKernel.cpp +++ b/src/core/CL/kernels/CLErodeKernel.cpp @@ -21,13 +21,13 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLErodeKernel.h" +#include "src/core/CL/kernels/CLErodeKernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/WindowHelpers.h" using namespace arm_compute; diff --git a/arm_compute/core/CL/kernels/CLErodeKernel.h b/src/core/CL/kernels/CLErodeKernel.h similarity index 97% rename from arm_compute/core/CL/kernels/CLErodeKernel.h rename to src/core/CL/kernels/CLErodeKernel.h index 620201d625..4da97ae358 100644 --- a/arm_compute/core/CL/kernels/CLErodeKernel.h +++ b/src/core/CL/kernels/CLErodeKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLERODEKERNEL_H #define ARM_COMPUTE_CLERODEKERNEL_H -#include "arm_compute/core/CL/ICLSimple2DKernel.h" +#include "src/core/CL/ICLSimple2DKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp index 30bca2f0f9..922e50aa73 100644 --- a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp +++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp @@ -21,14 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/ -#include "arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h" +#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Window.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute diff --git a/arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h b/src/core/CL/kernels/CLFFTDigitReverseKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h rename to src/core/CL/kernels/CLFFTDigitReverseKernel.h index a196c8c64f..2e2f1bdff4 100644 --- a/arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h +++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H #define ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include "arm_compute/core/KernelDescriptors.h" diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp index 6c36338dae..0f06640b64 100644 --- a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp +++ b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp @@ -21,16 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h" +#include "src/core/CL/kernels/CLFFTRadixStageKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h b/src/core/CL/kernels/CLFFTRadixStageKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h rename to src/core/CL/kernels/CLFFTRadixStageKernel.h index d6d6067bc4..c3cc510bdd 100644 --- a/arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h +++ b/src/core/CL/kernels/CLFFTRadixStageKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H #define ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include "arm_compute/core/KernelDescriptors.h" diff --git a/src/core/CL/kernels/CLFFTScaleKernel.cpp b/src/core/CL/kernels/CLFFTScaleKernel.cpp index ac5f2b38c3..4dbe8d2e86 100644 --- a/src/core/CL/kernels/CLFFTScaleKernel.cpp +++ b/src/core/CL/kernels/CLFFTScaleKernel.cpp @@ -21,14 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLFFTScaleKernel.h" +#include "src/core/CL/kernels/CLFFTScaleKernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Window.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute diff --git a/arm_compute/core/CL/kernels/CLFFTScaleKernel.h b/src/core/CL/kernels/CLFFTScaleKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLFFTScaleKernel.h rename to src/core/CL/kernels/CLFFTScaleKernel.h index c6dd176f58..cb007e5307 100644 --- a/arm_compute/core/CL/kernels/CLFFTScaleKernel.h +++ b/src/core/CL/kernels/CLFFTScaleKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLFFTSCALEKERNEL_H #define ARM_COMPUTE_CLFFTSCALEKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include "arm_compute/core/KernelDescriptors.h" diff --git a/src/core/CL/kernels/CLFastCornersKernel.cpp b/src/core/CL/kernels/CLFastCornersKernel.cpp index e71b47228e..7481fd1c27 100644 --- a/src/core/CL/kernels/CLFastCornersKernel.cpp +++ b/src/core/CL/kernels/CLFastCornersKernel.cpp @@ -21,16 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h" +#include "src/core/CL/kernels/CLFastCornersKernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/arm_compute/core/CL/kernels/CLFastCornersKernel.h b/src/core/CL/kernels/CLFastCornersKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLFastCornersKernel.h rename to src/core/CL/kernels/CLFastCornersKernel.h index 5d0da7d5d5..0c1b564c2f 100644 --- a/arm_compute/core/CL/kernels/CLFastCornersKernel.h +++ b/src/core/CL/kernels/CLFastCornersKernel.h @@ -25,8 +25,8 @@ #define ARM_COMPUTE_CLFASTCORNERSKERNEL_H #include "arm_compute/core/CL/ICLArray.h" -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" #include diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp index 1ea654b5cc..5d77c291d7 100644 --- a/src/core/CL/kernels/CLFillBorderKernel.cpp +++ b/src/core/CL/kernels/CLFillBorderKernel.cpp @@ -21,19 +21,17 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/Cast.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" #include "support/StringSupport.h" namespace arm_compute diff --git a/arm_compute/core/CL/kernels/CLFillBorderKernel.h b/src/core/CL/kernels/CLFillBorderKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLFillBorderKernel.h rename to src/core/CL/kernels/CLFillBorderKernel.h index 5323af4c0e..7951f48171 100644 --- a/arm_compute/core/CL/kernels/CLFillBorderKernel.h +++ b/src/core/CL/kernels/CLFillBorderKernel.h @@ -24,9 +24,9 @@ #ifndef ARM_COMPUTE_CLFILLBORDERKERNEL_H #define ARM_COMPUTE_CLFILLBORDERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLFlattenLayerKernel.cpp b/src/core/CL/kernels/CLFlattenLayerKernel.cpp index 6bd1149612..b3f84b6928 100644 --- a/src/core/CL/kernels/CLFlattenLayerKernel.cpp +++ b/src/core/CL/kernels/CLFlattenLayerKernel.cpp @@ -21,11 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h" +#include "src/core/CL/kernels/CLFlattenLayerKernel.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" using namespace arm_compute::misc::shape_calculator; @@ -51,18 +52,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) return Status{}; } - -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) -{ - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_flatten_shape(input))); - - Window win = calculate_max_window(*input, Steps()); // Flatten does not need paddings - - output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); - - return std::make_pair(Status{}, win); -} } // namespace CLFlattenLayerKernel::CLFlattenLayerKernel() @@ -78,16 +67,17 @@ void CLFlattenLayerKernel::configure(const ICLTensor *input, ICLTensor *output) void CLFlattenLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_flatten_shape(input->info()))); + + auto padding_info = get_padding_info({ input, output }); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); _input = input; _output = output; - // Configure kernel window - auto win_config = validate_and_configure_window(input->info(),
output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0))); @@ -98,6 +88,14 @@ void CLFlattenLayerKernel::configure(const CLCompileContext &compile_context, co // Create kernel _kernel = create_kernel(compile_context, "flatten", build_opts.options()); + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps()); + ICLKernel::configure_internal(win); + + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); + // Set config_id for enabling LWS tuning _config_id = "flatten"; _config_id += "_"; @@ -117,7 +115,6 @@ void CLFlattenLayerKernel::configure(const CLCompileContext &compile_context, co Status CLFlattenLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first); return Status{}; } diff --git a/arm_compute/core/CL/kernels/CLFlattenLayerKernel.h b/src/core/CL/kernels/CLFlattenLayerKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLFlattenLayerKernel.h rename to src/core/CL/kernels/CLFlattenLayerKernel.h index 4df0b33c8e..2471cf2e4a 100644 --- a/arm_compute/core/CL/kernels/CLFlattenLayerKernel.h +++ b/src/core/CL/kernels/CLFlattenLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLFLATTENLAYERKERNEL_H #define ARM_COMPUTE_CLFLATTENLAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLFloorKernel.cpp b/src/core/CL/kernels/CLFloorKernel.cpp index 09f5f61a50..2af0089bf0 100644 --- a/src/core/CL/kernels/CLFloorKernel.cpp +++ b/src/core/CL/kernels/CLFloorKernel.cpp @@ -21,18 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLFloorKernel.h" +#include "src/core/CL/kernels/CLFloorKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute diff --git a/arm_compute/core/CL/kernels/CLFloorKernel.h b/src/core/CL/kernels/CLFloorKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLFloorKernel.h rename to src/core/CL/kernels/CLFloorKernel.h index 3b1d3f10cc..f5635141e4 100644 --- a/arm_compute/core/CL/kernels/CLFloorKernel.h +++ b/src/core/CL/kernels/CLFloorKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLFLOORKERNEL_H #define ARM_COMPUTE_CLFLOORKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp index b582295f44..2116239080 100644 --- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp +++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp @@ -21,16 +21,17 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h" +#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -119,6 +120,8 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c { ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var); + auto padding_info = get_padding_info({ input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma }); + _input_weights = input_weights; _input_bias = input_bias; _bn_mean = bn_mean; @@ -171,6 +174,8 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c // Create kernel _kernel = create_kernel(compile_context, "fuse_batchnormalization_layer", build_opts.options()); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, diff --git a/arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h rename to src/core/CL/kernels/CLFuseBatchNormalizationKernel.h index 3ec251c858..78b1e74cab 100644 --- a/arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h +++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h @@ -24,7 +24,7 @@ #ifndef 
ARM_COMPUTE_CLFUSEBATCHNORMALIZATIONKERNEL_H #define ARM_COMPUTE_CLFUSEBATCHNORMALIZATIONKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp index 9a2918d12f..1f89865908 100644 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp @@ -21,27 +21,22 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" -#include -#include -#include - namespace arm_compute { using namespace misc::shape_calculator; @@ -111,7 +106,6 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); Window win{}; - Window win_out{}; bool window_changed = false; // In case both input and output have to be reinterpreted as 3D tensors, @@ -139,28 +133,14 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe num_elems_processed_per_iteration_x = rhs_info.n0; num_elems_processed_per_iteration_y = lhs_info.m0; - // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor - // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic - const int m = reinterpret_output_as_3d ?
gemm_info.m() : input0->dimension(1); - const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y; - - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowStatic input0_access(input0, 0, 0, - input0->dimension(0), - input0->dimension(1) + bottom_pad); + // RHS matrix still needs padding on the X AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1)); - AccessWindowStatic output_access(output, 0, 0, - ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x), - output->dimension(1) + bottom_pad); - window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor - - output_access.set_valid_region(win_out, ValidRegion(Coordinates(), output->tensor_shape())); + window_changed = update_window_and_padding(win, input1_access); // window used by the execute_window_loop // Collapse along the Z direction // This collapse needs to be here in order to tune the Z dimension of LWS @@ -199,6 +179,9 @@ void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); + // We still need padding on the X dimension for the RHS matrix + auto padding_info = get_padding_info({ input0, output }); + // In case both input and output have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) @@ -218,6 +201,18 @@ void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); + // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, + // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. + // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m + const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : output->info()->dimension(1); + // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. + const unsigned int partial_store_m0 = internal_m % lhs_info.m0; + const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0; + + // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. 
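+ // Illustrative example (values assumed for this note, not taken from the patch): with internal_m = 2 and lhs_info.m0 = 4, internal_m0 = std::min(2u, 4u) = 2, so a single block never spans more rows than the output actually has.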
+ // NOTE: This might have implications on heuristics and performance + const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); + // Create build options CLBuildOptions build_opts; build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); @@ -229,12 +224,13 @@ void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com build_opts.add_option("-DM=" + support::cpp11::to_string(input0->info()->dimension(1))); build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n())); build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k())); - build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0)); + build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type())); build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type())); - + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); std::string kernel_name("gemmlowp_mm_native"); // Create kernel @@ -260,6 +256,8 @@ void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com _config_id += support::cpp11::to_string(rhs_info.n0); _config_id += "_"; _config_id += support::cpp11::to_string(lhs_info.k0); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLGEMMLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h rename to src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h index 2d5e4a3346..125f0c6948 100644 --- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h +++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H #define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp index 56b92a3d41..ded4b29ae7 100644 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp @@ -21,27 +21,21 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" -#include -#include -#include - namespace arm_compute { using namespace misc::shape_calculator; @@ -105,15 +99,10 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); - Window win{}; - Window win_out{}; - bool window_changed = false; - // Output tensor auto initialization if not yet initialized auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)).set_data_type(DataType::S32)); TensorInfo tmp_info(*output); - if(reinterpret_output_as_3d) { // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, @@ -126,29 +115,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe // Configure kernel window num_elems_processed_per_iteration_x = rhs_info.n0; num_elems_processed_per_iteration_y = lhs_info.m0; - - // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor - // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic - const int m = gemm_info.m(); - const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y; - - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - AccessWindowStatic input0_access(input0, 0, 0, - ceil_to_multiple(input0->dimension(0), num_elems_processed_per_iteration_y), - input0->dimension(1)); - AccessWindowStatic input1_access(input1, 0, 0, - ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), - input1->dimension(1)); - AccessWindowStatic output_access(output, 0, 0, - ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x), - output->dimension(1) + bottom_pad); - - window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor - - output_access.set_valid_region(win_out, ValidRegion(Coordinates(0, 0), output->tensor_shape())); + Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); // Collapse along the Z direction // This collapse needs to be here in order to tune the Z dimension of LWS @@ -156,8 +123,7 @@ std::pair<Status, Window>
validate_and_configure_window(ITensorInfo *input0, ITe const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u); collapsed = win.collapse(win, dimension_to_collapse); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, collapsed); + return std::make_pair(Status{}, collapsed); } } // namespace @@ -191,6 +157,7 @@ void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &c const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions(); _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0); + auto padding_info = get_padding_info({ input0, input1, output }); ElementsProcessed num_elements_processed{}; // Configure kernel window @@ -198,6 +165,12 @@ void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &c ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); + // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. + const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : output->info()->dimension(1); + + const unsigned int partial_store_m0 = internal_m % lhs_info.m0; + const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0; + // Create build options CLBuildOptions build_opts; build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); @@ -216,6 +189,8 @@ void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &c build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type())); build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type())); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); std::string kernel_name("gemmlowp_mm_reshaped_"); kernel_name += lhs_info.transpose ?
"lhs_t_" : "lhs_nt_"; @@ -251,6 +226,8 @@ void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &c _config_id += support::cpp11::to_string(lhs_info.interleave); _config_id += "_"; _config_id += support::cpp11::to_string(rhs_info.interleave); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLGEMMLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h rename to src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h index f2eb447834..100100b1b1 100644 --- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h +++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H #define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp index 4770329b7d..77cea24829 100644 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp @@ -21,21 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include @@ -238,27 +237,9 @@ std::pair validate_and_configure_window(ITensorInfo *input0, ITe num_elems_processed_per_iteration_x = gemm_info.rhs_info.n0; num_elems_processed_per_iteration_y = gemm_info.lhs_info.m0; - // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor - // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic - const int m = reinterpret_output_as_3d ? 
gemm_info.m : input0->dimension(1); - const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y; win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowStatic input0_access(input0, 0, 0, - ceil_to_multiple(input0->dimension(0), gemm_info.lhs_info.k0), - input0->dimension(1) + bottom_pad); - AccessWindowStatic input1_access(input1, 0, 0, - input1->dimension(0), - input1->dimension(1)); - AccessWindowStatic output_access(output, 0, 0, - ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x), - output->dimension(1) + bottom_pad); - - window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { if(gemm_info.a_offset != 0) @@ -283,8 +264,6 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe } } - output_access.set_valid_region(win_out, ValidRegion(Coordinates(), output->tensor_shape())); - // Collapse along the Z direction // This collapse needs to be here in order to tune the Z dimension of LWS Window collapsed = win; @@ -337,6 +316,7 @@ void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileCon output_multipliers != nullptr ? output_multipliers->info() : nullptr, output_shifts != nullptr ? output_shifts->info() : nullptr)); + auto padding_info = get_padding_info({ input0, input1, output, vector_sum_row }); const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; @@ -384,6 +364,19 @@ void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileCon ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); + // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, + // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. + // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m + const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : output->info()->dimension(1); + + // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. + // NOTE: This might have implications on heuristics and performance + const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); + + // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
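+ // Illustrative example (values assumed for this note, not taken from the patch): internal_m = 30 with internal_m0 = 8 gives partial_store_m0 = 6, and gemm_info.n = 20 with rhs_info.n0 = 16 gives partial_store_n0 = 4, so the trailing block stores 6 x 4 elements exactly instead of forcing the output to be padded up to 32 x 32.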
+ const unsigned int partial_store_m0 = internal_m % internal_m0; + const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0; + // Create build options CLBuildOptions build_opts; build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); @@ -393,13 +386,15 @@ void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileCon build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2))); build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); - build_opts.add_option("-DM=" + support::cpp11::to_string(input0->info()->dimension(1))); + build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m)); build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n)); build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k)); - build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0)); + build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type())); build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type())); @@ -462,6 +457,7 @@ void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileCon _config_id += support::cpp11::to_string(rhs_info.h0); _config_id += "_"; _config_id += support::cpp11::to_string(rhs_info.interleave); + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info, diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h rename to src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h index a2295143de..222a8615e4 100644 --- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h +++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H #define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/KernelDescriptors.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp index 6ef9fd2565..c7844b9c28 100644 --- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp @@ -21,29 +21,19 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" -#include -#include - -using namespace arm_compute; - namespace arm_compute { -class Coordinates; -} // namespace arm_compute - namespace { Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, @@ -102,39 +92,6 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, ITensorInfo *bias, - int32_t a_offset, int32_t b_offset) -{ - constexpr unsigned int num_elems_processed_per_iteration = 4; - bool window_changed = false; - - // Configure kernel window - Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal mm_result_access(mm_result, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win, mm_result_access); - - if(a_offset != 0) - { - AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win, vector_sum_col_access); - } - if(b_offset != 0) - { - AccessWindowStatic vector_sum_row_access(vector_sum_row, 0, 0, vector_sum_row->dimension(0), 0); // NOLINT - window_changed = window_changed || update_window_and_padding(win, vector_sum_row_access); - } - - if(bias != nullptr) - { - AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]); - window_changed = window_changed || update_window_and_padding(win, bias_access); - } - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} } // namespace CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel() @@ -148,7 +105,8 @@ void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const I configure(CLKernelLibrary::get().get_compile_context(), mm_result, vector_sum_col, vector_sum_row, bias, k, a_offset, b_offset); } -void CLGEMMLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context, ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, +void CLGEMMLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context, ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, + const ICLTensor *bias, int32_t k, int32_t a_offset, int32_t b_offset) { @@ -160,6 +118,8 @@ void CLGEMMLowpOffsetContributionKernel::configure(const CLCompileContext &compi bias != nullptr ?
bias->info() : nullptr, a_offset, b_offset)); // NOLINT + auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias }); + _vector_sum_col = vector_sum_col; _vector_sum_row = vector_sum_row; _mm_result = mm_result; @@ -170,8 +130,12 @@ void CLGEMMLowpOffsetContributionKernel::configure(const CLCompileContext &compi && mm_result->info()->num_dimensions() > 1 && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->info()->dimension(0)); + // Set the arguments to pass at compile time CLBuildOptions build_opts; + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->info()->dimension(0) % num_elems_processed_per_iteration)); // If a_offset == 0, vector_sum_col can be a nullptr if(a_offset != 0) @@ -192,13 +156,8 @@ void CLGEMMLowpOffsetContributionKernel::configure(const CLCompileContext &compi _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Configure kernel window - auto win_config = validate_and_configure_window(mm_result->info(), - vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, - bias != nullptr ? bias->info() : nullptr, - a_offset, b_offset); // NOLINT - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); + Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win); // Set config_id for enabling LWS tuning _config_id = kernel_name + "_"; @@ -207,19 +166,14 @@ void CLGEMMLowpOffsetContributionKernel::configure(const CLCompileContext &compi _config_id += support::cpp11::to_string(mm_result->info()->dimension(1)); _config_id += "_"; _config_id += support::cpp11::to_string(mm_result->info()->dimension(2)); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(), - vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, - bias != nullptr ? 
bias->clone().get() : nullptr, - a_offset, b_offset) - .first); // NOLINT - return Status{}; } @@ -258,3 +212,4 @@ void CLGEMMLowpOffsetContributionKernel::run(const Window &window, cl::CommandQu } while(collapsed.slide_window_slice_3D(slice)); } +} // namespace arm_compute diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h rename to src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h index 1d3b3110b3..f8705595a0 100644 --- a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h +++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H #define ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp index 6d3aa6fbf6..b41d8704bd 100644 --- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp @@ -21,23 +21,19 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" -#include -#include - namespace arm_compute { namespace @@ -120,52 +116,6 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, ITensorInfo *bias, ITensorInfo *output, - int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, ITensorInfo *output_multipliers, ITensorInfo *output_shifts) -{ - constexpr unsigned int num_elems_processed_per_iteration = 4; - bool window_changed = false; - - // Auto initialize the output - auto_init_if_empty(*output, mm_result->clone()->set_data_type(output_stage.output_data_type)); - - // Configure kernel window - Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal mm_result_access(mm_result, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win, mm_result_access); - - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win, output_access); - - if(a_offset != 0) - { - AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration); - window_changed = window_changed ||
update_window_and_padding(win, vector_sum_col_access); - } - if(b_offset != 0) - { - AccessWindowStatic vector_sum_row_access(vector_sum_row, 0, 0, vector_sum_row->dimension(0), 0); // NOLINT - window_changed = window_changed || update_window_and_padding(win, vector_sum_row_access); - } - - if(bias != nullptr) - { - AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]); - window_changed = window_changed || update_window_and_padding(win, bias_access); - } - - if(output_multipliers->dimension(0) > 1) - { - AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win, output_multipliers_access, output_shifts_access); - } - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} } // namespace CLGEMMLowpOffsetContributionOutputStageKernel::CLGEMMLowpOffsetContributionOutputStageKernel() @@ -202,6 +152,8 @@ void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon a_offset, b_offset, output_stage, output_multipliers->info(), output_shifts->info())); // NOLINT + auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias, output, output_multipliers, output_shifts }); + const int min = output_stage.gemmlowp_min_bound; const int max = output_stage.gemmlowp_max_bound; @@ -219,8 +171,15 @@ void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon && mm_result->info()->num_dimensions() > 1 && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); + // Auto initialize the output + auto_init_if_empty(*output->info(), mm_result->info()->clone()->set_data_type(output_stage.output_data_type)); + + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->info()->dimension(0)); + // Set the arguments to pass at compile time CLBuildOptions build_opts; + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->info()->dimension(0) % num_elems_processed_per_iteration)); // If a_offset == 0, vector_sum_col can be a nullptr if(a_offset != 0) @@ -253,15 +212,8 @@ void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Configure kernel window - auto win_config = validate_and_configure_window(mm_result->info(), - vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, - bias != nullptr ? 
bias->info() : nullptr, - output->info(), - a_offset, b_offset, output_stage, - output_multipliers->info(), output_shifts->info()); // NOLINT - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); + Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win); // Set config_id for enabling LWS tuning _config_id = kernel_name + "_"; @@ -270,6 +222,8 @@ void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon _config_id += support::cpp11::to_string(mm_result->info()->dimension(1)); _config_id += "_"; _config_id += support::cpp11::to_string(mm_result->info()->dimension(2)); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, @@ -277,15 +231,6 @@ Status CLGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage, output_multipliers, output_shifts)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(), - vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, - bias != nullptr ? bias->clone().get() : nullptr, - output->clone().get(), - a_offset, b_offset, output_stage, - output_multipliers->clone().get(), output_shifts->clone().get()) - .first); // NOLINT - return Status{}; } diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h rename to src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h index e3f88c11e6..15f54d17a5 100644 --- a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h +++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H #define ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { @@ -91,7 +91,8 @@ class CLGEMMLowpOffsetContributionOutputStageKernel : public ICLKernel * @param[in] output_shifts Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). 
* Supported data types: S32 */ - void configure(const CLCompileContext &compile_context, const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output, int32_t k, + void configure(const CLCompileContext &compile_context, const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output, + int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ICLTensor *output_multipliers, const ICLTensor *output_shifts); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionKernel diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp new file mode 100644 index 0000000000..6a58d5e202 --- /dev/null +++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); + + // Check biases if exist + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); + } + + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != info->output_data_type, "Mismatching output data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + } + + return Status{}; +} +} // namespace + +CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel() + : _input(nullptr), _bias(nullptr), _output(nullptr) +{ +} + +Status CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, + const GEMMLowpOutputStageInfo *info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, info)); + + return Status{}; +} + +void CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, + const GEMMLowpOutputStageInfo *info) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? 
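/*
 * validate() is the side-effect-free twin of configure(): it lets callers vet a
 * configuration before allocating anything. A hypothetical usage sketch (tensor
 * and variable names are illustrative):
 *
 *   GEMMLowpOutputStageInfo info{};
 *   info.output_data_type = DataType::QASYMM8;    // requested quantized output type
 *   const Status s = CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(
 *       src.info(), nullptr, dst.info(), &info);  // nullptr: no bias
 *   ARM_COMPUTE_ERROR_THROW_ON(s);                // or branch on bool(s)
 */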
bias->info() : nullptr, output->info(), info)); + + auto padding_info = get_padding_info({ input, bias, output }); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(info->output_data_type)); + + _input = input; + _bias = bias; + _output = output; + + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0)); + + // Set the arguments to pass at compile time + auto min = info->gemmlowp_min_bound; + auto max = info->gemmlowp_max_bound; + CLBuildOptions build_opts; + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(info->gemmlowp_offset)); + build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(info->gemmlowp_multiplier)); + build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(info->gemmlowp_shift)); + build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); + build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), + "-DMIN_BOUND=" + support::cpp11::to_string(min)); + build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), + "-DMAX_BOUND=" + support::cpp11::to_string(max)); + build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); + + // Create kernel + const std::string kernel_name = (info->output_data_type == DataType::QSYMM16) ?
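/*
 * How the two new build options cooperate, with worked numbers (a behavioural
 * sketch; adjust_vec_size owns the exact rounding rules): for a row length of
 * 10 and a preferred vector width of 4,
 *
 *   const unsigned int vec_size = adjust_vec_size(4, 10);  // 4: preferred width fits
 *   const unsigned int leftover = 10 % vec_size;           // 2: tail past the last full vector
 *
 * so the OpenCL kernel is compiled with -DVEC_SIZE=4 -DVEC_SIZE_LEFTOVER=2 and
 * predicates its final store instead of relying on right-hand padding.
 */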
"gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16" : "gemmlowp_output_stage_quantize_down_fixedpoint"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Configure kernel window + auto win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +void CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + // Create input window + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + // Setup bias slice + unsigned int idx1 = num_arguments_per_3D_tensor(); + if(_bias != nullptr) + { + Window biases_slice(slice); + biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); + biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); + add_1D_tensor_argument(idx1, _bias, biases_slice); + } + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx1, _output, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(collapsed.slide_window_slice_3D(slice)); +} +} // namespace arm_compute diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h new file mode 100644 index 0000000000..8653102cd8 --- /dev/null +++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H +#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H + +#include "src/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED/QSYMM16 + * + * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final quantized value. 
+ * The following computations will be performed by the kernel: + * + * -# Compute fixed point multiplication between each entry of the input and gemmlowp_multiplier + * -# Add bias to the final result if the bias tensor is not a nullptr + * -# Divide by a power of two given by result_shift, rounding to nearest + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values to the proper quantized range and cast to QASYMM8/QASYMM8_SIGNED/QSYMM16. + */ +class CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel : public ICLKernel +{ +public: + /** Constructor */ + CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel(const CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &) = delete; + /** Allow instances of this class to be moved */ + CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel(CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &&) = default; + /** Allow instances of this class to be moved */ + CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel &&) = default; + /** Initialise the kernel's input and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16. + * @param[in] info Output stage info. Used to pass the quantized output data type + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info); + /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel + * + * @param[in] input Input tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[in] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16. + * @param[in] info Output stage info.
Used to pass the quantized output data type + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + const ICLTensor *_bias; + ICLTensor *_output; +}; +} // namespace arm_compute +#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFIXEDPOINTKERNEL_H */ diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp index 242d151272..a5888a5ded 100644 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp @@ -21,19 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute @@ -65,38 +64,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con return Status{}; } - -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, DataType output_data_type) -{ - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output, input->clone()->set_data_type(output_data_type)); - - constexpr unsigned int num_elems_processed_per_iteration = 4; - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8)); - - // Configure kernel window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win, - input_access); - - AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win, output_result_access); - output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - - if(bias != nullptr) - { - AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]); - window_changed = window_changed || update_window_and_padding(win, bias_access); - } - - Status err = (window_changed) ? 
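/*
 * The float variant below replaces the fixed-point multiplier/shift pair with a
 * single real rescale factor. Per element it computes, roughly (the rounding
 * detail follows the OpenCL kernel, which is authoritative):
 *
 *   out = clamp((int32_t)std::lround((acc + bias) * real_multiplier) + output_offset,
 *               min_bound, max_bound);   // REAL_MULTIPLIER / OUTPUT_OFFSET build options
 */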
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} } // namespace class Coordinates; @@ -127,15 +94,24 @@ void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileCon ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), info)); + auto padding_info = get_padding_info({ input, bias, output }); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(info->output_data_type)); + _input = input; _bias = bias; _output = output; + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0)); + auto min = info->gemmlowp_min_bound; auto max = info->gemmlowp_max_bound; // Set the arguments to pass at compile time CLBuildOptions build_opts; + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(info->gemmlowp_real_multiplier)); build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(info->gemmlowp_offset)); build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); @@ -147,9 +123,10 @@ void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileCon _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down_float", build_opts.options()); // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), info->output_data_type); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } void CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::run(const Window &window, cl::CommandQueue &queue) diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h rename to src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h index 0b3f23dab3..0a8d5e1942 100644 --- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h +++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H #define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp index 55e4ed2bd9..7d4352479c 100644 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp @@ -21,17 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/ -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute @@ -62,41 +61,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con return Status{}; } - -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, DataType output_data_type) -{ - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output, input->clone()->set_data_type(output_data_type)); - - constexpr unsigned int num_elems_processed_per_iteration = 4; - - // Configure kernel window - Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win, - input_access); - - AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win, output_result_access); - output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - - if(bias != nullptr) - { - AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]); - window_changed = window_changed || update_window_and_padding(win, bias_access); - } - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} } //namespace CLGEMMLowpQuantizeDownInt32ScaleKernel::CLGEMMLowpQuantizeDownInt32ScaleKernel() - : _input(nullptr), _bias(nullptr), _output(nullptr), _output_stage(nullptr) + : _input(nullptr), _bias(nullptr), _output(nullptr) { } + Status CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -110,7 +81,8 @@ void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const ICLTensor *input, c configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, output_stage); } -void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage) +void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, + const GEMMLowpOutputStageInfo *output_stage) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -120,15 +92,23 @@ void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &c output->info(), output_stage)); - _input = input; - _bias = bias; - _output = output; - _output_stage = output_stage; + auto padding_info = get_padding_info({ input, bias, output }); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_stage->output_data_type)); + + _input = input; + _bias = bias; + _output = output; + + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->info()->dimension(0)); // Set the arguments to pass at compile time auto min = output_stage->gemmlowp_min_bound; auto max = output_stage->gemmlowp_max_bound; CLBuildOptions build_opts; + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage->gemmlowp_offset)); build_opts.add_option("-DRESULT_MULT_INT=" + support::cpp11::to_string(output_stage->gemmlowp_multiplier)); build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage->gemmlowp_shift)); @@ -143,9 +123,10 @@ void CLGEMMLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &c _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down", build_opts.options()); // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ?
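/*
 * The integer scale path above (RESULT_OFFSET, RESULT_MULT_INT, RESULT_SHIFT)
 * is the plain-integer cousin of the fixed-point output stage; per element,
 * roughly:
 *
 *   acc = (acc + bias + result_offset) * result_mult_int;   // widened as needed
 *   acc = acc >> result_shift;                              // arithmetic shift right
 *   out = clamp(acc, min_bound, max_bound);                 // cast to the output type
 */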
bias->info() : nullptr, output->info(), output_stage->output_data_type); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } void CLGEMMLowpQuantizeDownInt32ScaleKernel::run(const Window &window, cl::CommandQueue &queue) diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h similarity index 95% rename from arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h rename to src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h index 767d7927b4..abdf33ea43 100644 --- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h +++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H #define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { @@ -42,7 +42,7 @@ class ICLTensor; * -# Clamp the value between the specified min and max bounds * -# Clamp the resulting int32 values: * -# -to the [0..255] range and cast to QASYMM8. - * -# -to the [-128..127] range and cast to QASYMM8/SIGNED. + * -# -to the [-128..127] range and cast to QASYMM8_SIGNED. * */ class CLGEMMLowpQuantizeDownInt32ScaleKernel : public ICLKernel @@ -93,10 +93,9 @@ class CLGEMMLowpQuantizeDownInt32ScaleKernel : public ICLKernel void run(const Window &window, cl::CommandQueue &queue) override; private: - const ICLTensor *_input; - const ICLTensor *_bias; - ICLTensor *_output; - const GEMMLowpOutputStageInfo *_output_stage; + const ICLTensor *_input; + const ICLTensor *_bias; + ICLTensor *_output; }; } // namespace arm_compute diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp deleted file mode 100644 index c98f5bf3eb..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(min > max); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); - } - - if(output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QSYMM16); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, input); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output) -{ - constexpr unsigned int num_elems_processed_per_iteration = 4; - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QSYMM16)); - - // Configure kernel window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win, input_access); - - if(output->total_size() != 0) - { - Window win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win_out, output_result_access); - - output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - } - - if(bias != nullptr) - { - AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]); - window_changed = window_changed || update_window_and_padding(win, bias_access); - } - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} -} // namespace - -CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel() - : _input(nullptr), _bias(nullptr), _output(nullptr) -{ -} - -Status CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), - (bias != nullptr) ? 
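/*
 * Note the clone().get() calls in this old validate() path: window validation
 * could auto-initialise the output and adjust paddings, so it was dry-run on
 * clones to leave the caller's ITensorInfo objects untouched. With padding
 * negotiation gone, the replacement validate() reduces to argument checks:
 *
 *   ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
 *   return Status{};
 */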
bias->clone().get() : nullptr, - output->clone().get()) - .first); - - return Status{}; -} - -void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, - int min, int max) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, min, max); -} - -void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, - int min, int max) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), - min, max)); - - _input = input; - _bias = bias; - _output = output; - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(result_fixedpoint_multiplier)); - build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(result_shift)); - build_opts.add_option_if((min > -32768), "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < 32767), "-DMAX_BOUND=" + support::cpp11::to_string(max)); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - - // Create kernel - _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", build_opts.options()); - - // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); -} - -void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - // Create input window - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - // Setup bias slice - unsigned int idx1 = num_arguments_per_3D_tensor(); - if(_bias != nullptr) - { - Window biases_slice(slice); - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - add_1D_tensor_argument(idx1, _bias, biases_slice); - } - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx1, _output, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp deleted file mode 100644 index fa78410440..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(min > max); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); - } - - if(output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output) -{ - constexpr unsigned int num_elems_processed_per_iteration = 4; - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8_SIGNED)); - - // Configure kernel window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win, input_access); - - if(output->total_size() != 0) - { - Window win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win_out, output_result_access); - output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - } - - if(bias != nullptr) - { - AccessWindowStatic 
bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]); - window_changed = window_changed || update_window_and_padding(win, bias_access); - } - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} -} // namespace - -CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel() - : _input(nullptr), _bias(nullptr), _output(nullptr) -{ -} - -Status CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), - (bias != nullptr) ? bias->clone().get() : nullptr, - output->clone().get()) - .first); - - return Status{}; -} - -void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); -} - -void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), min, max)); - // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? 
bias->info() : nullptr, output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - - _input = input; - _bias = bias; - _output = output; - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(result_offset_after_shift)); - build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(result_fixedpoint_multiplier)); - build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(result_shift)); - build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); - build_opts.add_option_if((min > -128), "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < 127), "-DMAX_BOUND=" + support::cpp11::to_string(max)); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - - // Create kernel - _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down_fixedpoint", build_opts.options()); - - ICLKernel::configure_internal(win_config.second); -} - -void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - // Create input window - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - // Setup bias slice - unsigned int idx1 = num_arguments_per_3D_tensor(); - if(_bias != nullptr) - { - Window biases_slice(slice); - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - add_1D_tensor_argument(idx1, _bias, biases_slice); - } - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx1, _output, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp deleted file mode 100644 index 92335747be..0000000000 --- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(min > max); - - // Check biases if exist - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); - } - - if(output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output) -{ - constexpr unsigned int num_elems_processed_per_iteration = 4; - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8)); - - // Configure kernel window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win, input_access); - - if(output->total_size() != 0) - { - Window win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win_out, output_result_access); - output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - } - - if(bias != nullptr) - { - AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]); - window_changed = window_changed || update_window_and_padding(win, bias_access); - } - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} -} // namespace - -CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel() - : _input(nullptr), _bias(nullptr), _output(nullptr) -{ -} - -Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, - int min, int max) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), - (bias != nullptr) ? bias->clone().get() : nullptr, - output->clone().get()) - .first); - - return Status{}; -} - -void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); -} - -void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min, int max) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), min, max)); - // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? 
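/*
 * The three deleted type-specific kernels (the ToInt16, ToInt8 and ToUint8
 * fixed-point quantize-down variants) differ only in output data type and
 * default clamp bounds, so they collapse into the single
 * CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel added earlier in this
 * patch, parameterised by GEMMLowpOutputStageInfo. A sketch of the replacement
 * call for the uint8 case (field values illustrative):
 *
 *   GEMMLowpOutputStageInfo info{};
 *   info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
 *   info.output_data_type    = DataType::QASYMM8;           // was encoded in the class name
 *   info.gemmlowp_multiplier = result_fixedpoint_multiplier;
 *   info.gemmlowp_shift      = result_shift;
 *   info.gemmlowp_offset     = result_offset_after_shift;
 *   info.gemmlowp_min_bound  = min;
 *   info.gemmlowp_max_bound  = max;
 *   kernel.configure(compile_context, input, bias, output, &info);
 */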
bias->info() : nullptr, output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - - _input = input; - _bias = bias; - _output = output; - - // Set the arguments to pass at compile time - CLBuildOptions build_opts; - build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(result_offset_after_shift)); - build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(result_fixedpoint_multiplier)); - build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(result_shift)); - build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); - build_opts.add_option_if((min > 0), "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < 255), "-DMAX_BOUND=" + support::cpp11::to_string(max)); - build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); - - // Create kernel - _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down_fixedpoint", build_opts.options()); - - ICLKernel::configure_internal(win_config.second); -} - -void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - // Create input window - Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - Window slice = collapsed.first_slice_window_3D(); - - // Setup bias slice - unsigned int idx1 = num_arguments_per_3D_tensor(); - if(_bias != nullptr) - { - Window biases_slice(slice); - biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - add_1D_tensor_argument(idx1, _bias, biases_slice); - } - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx1, _output, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace arm_compute diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp index 31a97ca32b..d508bf6f21 100644 --- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp @@ -21,12 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/KernelDescriptors.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute @@ -58,27 +60,6 @@ Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITe } return Status{}; } - -std::pair<Status, Window> validate_and_configure_window_matrix_b_reduction(ITensorInfo *input, ITensorInfo *output) -{ - constexpr unsigned int num_elems_processed_per_iteration = 16; - - // Output auto initialization if not yet initialized - auto_init_if_empty(*output, TensorShape(input->dimension(0)), 1, DataType::S32); - - // Configure kernel window - Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - - AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration), input->dimension(1)); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win, input_access, output_access); - - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape())); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} } // namespace ICLGEMMLowpReductionKernel::ICLGEMMLowpReductionKernel() @@ -100,6 +81,8 @@ void CLGEMMLowpMatrixAReductionKernel::configure(const CLCompileContext &compile // Output auto initialization if not yet initialized auto_init_if_empty(*vector_sum_row->info(), TensorShape(mtx_a->info()->dimension(1)), 1, DataType::S32); + auto padding_info = get_padding_info({ mtx_a, vector_sum_row }); + _input = mtx_a; _output = vector_sum_row; @@ -129,6 +112,8 @@ void CLGEMMLowpMatrixAReductionKernel::configure(const CLCompileContext &compile _config_id += support::cpp11::to_string(_input->info()->dimension(1)); _config_id += "_"; _config_id += support::cpp11::to_string(_input->info()->dimension(2)); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) @@ -176,8 +161,17 @@ void CLGEMMLowpMatrixBReductionKernel::configure(const CLCompileContext &compile _input = mtx_b; _output = vector_sum_col; + // Output auto initialization if not yet initialized + auto_init_if_empty(*_output->info(), TensorShape(mtx_b->info()->dimension(0)), 1, DataType::S32); + + auto padding_info = get_padding_info({ mtx_b, vector_sum_col }); + + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, mtx_b->info()->dimension(0)); + + // Set the arguments to pass at compile time CLBuildOptions build_opts; + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mtx_b->info()->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->info()->dimension(0))); build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->info()->dimension(1)));
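/*
 * Background for the reduction kernels: with asymmetric quantization the
 * offset-corrected product expands as
 *
 *   sum_k (A[m,k] - a_off) * (B[k,n] - b_off)
 *     =  sum_k A[m,k] * B[k,n]
 *      - a_off * sum_k B[k,n]         // vector_sum_col: one value per column of B
 *      - b_off * sum_k A[m,k]         // vector_sum_row: one value per row of A
 *      + K * a_off * b_off
 *
 * so the raw S32 GEMM result is fixed up with these precomputed row/column sums
 * by the offset-contribution kernels (sign conventions as in the kernels above).
 */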
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_b->info()->data_type())); @@ -188,16 +182,16 @@ void CLGEMMLowpMatrixBReductionKernel::configure(const CLCompileContext &compile _kernel = create_kernel(compile_context, "gemmlowp_matrix_b_reduction", build_opts.options()); // Configure kernel window - auto win_config = validate_and_configure_window_matrix_b_reduction(_input->info(), _output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); + Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_b_reduction(mtx_b->clone().get(), vector_sum_col->clone().get()).first); return Status{}; } diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h b/src/core/CL/kernels/CLGEMMLowpReductionKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h rename to src/core/CL/kernels/CLGEMMLowpReductionKernel.h index 6066e2a815..237d8099b7 100644 --- a/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h +++ b/src/core/CL/kernels/CLGEMMLowpReductionKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H #define ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp index c2dd92c0fd..2419104fba 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp @@ -21,22 +21,21 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/helpers/float_ops.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/float_ops.h" #include "support/StringSupport.h" #include @@ -97,7 +96,7 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i const int k = reshape_info.k(); const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width(); const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); - rhs_info.n0 = 16 / input1->element_size(); + rhs_info.n0 = max_cl_vector_width / input1->element_size(); rhs_info.k0 = 1; rhs_info.h0 = mult_transpose1xW_width; rhs_info.interleave = false; @@ -198,22 +197,7 @@ inline std::pair validate_and_configure_window(ITensorInfo *inpu num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type); num_elems_processed_per_iteration_y = 4; - // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor - // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic - const int m = reshape_info.m(); - const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y; - - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), input0->dimension(1)); - AccessWindowStatic input1_access(input1, 0, 0, - ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), - ceil_to_multiple(input1->dimension(1), num_elems_processed_per_iteration_y)); - AccessWindowStatic output_access(output, 0, 0, - ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x), - output->dimension(1) + bottom_pad); - + win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); if(input2 != nullptr) { const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; @@ -224,16 +208,8 @@ inline std::pair validate_and_configure_window(ITensorInfo *inpu ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x), ceil_to_multiple(input2->dimension(1), bias_processed_per_iteration_y)); - window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor - } - else - { - 
window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor + window_changed = update_window_and_padding(win, input2_access); // window used by the execute_window_loop } - - output_access.set_valid_region(win_out, ValidRegion(Coordinates(0, 0), output->tensor_shape())); } else // The input tensors have not been reshaped { @@ -241,11 +217,6 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type); num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4); - // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor - // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic - const int m = reinterpret_input_as_3d ? input0->tensor_shape()[1] * input0->tensor_shape()[2] : input0->tensor_shape()[1]; - const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y; - // Create kernels according to the architecture, data type and input size. GPUTarget arch_target = get_arch_from_target(gpu_target); if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32) @@ -256,22 +227,19 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu // Configure window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), input0->dimension(1) + bottom_pad); + AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), input0->dimension(1)); AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1)); AccessWindowStatic output_access(output, 0, 0, - ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x), - output->dimension(1) + bottom_pad); + output->dimension(0), + output->dimension(1)); if(input2 != nullptr) { const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; - const int bias_processed_per_iteration_y = reshape_info.broadcast_bias() ?
1 : num_elems_processed_per_iteration_y; - AccessWindowStatic input2_access(input2, 0, 0, ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x), - ceil_to_multiple(input2->dimension(1), bias_processed_per_iteration_y)); + input2->dimension(1)); window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor @@ -310,7 +278,8 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen configure(CLKernelLibrary::get().get_compile_context(), input0, input1, input2, output, alpha, beta, is_interleaved_transposed, reshape_info, fp_mixed_precision, activation_info); } -void CLGEMMMatrixMultiplyKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta, +void CLGEMMMatrixMultiplyKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, + float beta, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision, const ActivationLayerInfo &activation_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); @@ -319,6 +288,8 @@ void CLGEMMMatrixMultiplyKernel::configure(const CLCompileContext &compile_conte ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), (input2 != nullptr) ? input2->info() : nullptr, output->info(), beta, is_interleaved_transposed, reshape_info, fp_mixed_precision)); + auto padding_info = is_interleaved_transposed ? get_padding_info({ input0, input1, output }) : get_padding_info({ input0, output }); + _input0 = input0; _input1 = input1; _input2 = helpers::float_ops::is_zero(beta) ? nullptr : input2; @@ -354,6 +325,22 @@ void CLGEMMMatrixMultiplyKernel::configure(const CLCompileContext &compile_conte ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); + // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, both will be turned off (false) + // in which case we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. + // This means that the actual m used by the kernel is given by output->info()->dimension(1) + const unsigned int internal_m = _reinterpret_output_as_3d ? output->info()->dimension(1) * output->info()->dimension(2) : output->info()->dimension(1); + const unsigned int n = output->info()->dimension(0); + + const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(1) : input0->info()->dimension(1); + const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(2) : input0->info()->dimension(2); + + const unsigned int m0 = num_elements_processed.y(); + const unsigned int n0 = num_elements_processed.x(); + + // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. 
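// Illustrative arithmetic (example values assumed): with internal_m = 30, m0 = 4,
// n = 20 and n0 = 16, the lines below give partial_store_m0 = 30 % 4 = 2 and
// partial_store_n0 = 20 % 16 = 4, so the last block row stores 2 rows and the last
// block column stores 4 columns instead of forcing the output to be padded up to
// the next multiples (32 x 32).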
+ const unsigned int partial_store_m0 = internal_m % m0; + const unsigned int partial_store_n0 = n % n0; + // Create build options CLBuildOptions build_opts; @@ -363,8 +350,8 @@ void CLGEMMMatrixMultiplyKernel::configure(const CLCompileContext &compile_conte build_opts.add_option_if(reshape_info.broadcast_bias(), "-DBROADCAST_BIAS"); build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1))); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2))); build_opts.add_option_if(activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(activation_info.activation()))); build_opts.add_option_if(activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(activation_info.a())); @@ -378,9 +365,13 @@ void CLGEMMMatrixMultiplyKernel::configure(const CLCompileContext &compile_conte const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width(); const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); - build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0))); - build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width)); - build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height)); + build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m)); + build_opts.add_option("-DN=" + support::cpp11::to_string(n)); + build_opts.add_option("-DK=" + support::cpp11::to_string(input1->info()->dimension(0) / (n0 * mult_transpose1xW_width))); + build_opts.add_option("-DH0=" + support::cpp11::to_string(mult_transpose1xW_width)); + build_opts.add_option("-DV0=" + support::cpp11::to_string(mult_interleave4x4_height)); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); if(is_data_type_float(data_type) && is_bifrost) { @@ -398,8 +389,13 @@ void CLGEMMMatrixMultiplyKernel::configure(const CLCompileContext &compile_conte } else // The input tensors have not been reshaped { - build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0))); + build_opts.add_option("-DN=" + support::cpp11::to_string(n)); + build_opts.add_option("-DK=" + support::cpp11::to_string(input0->info()->dimension(0))); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); + build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); + build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); 
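// Sketch of the resulting compile options (values assumed, continuing the example
// above): for a non-reshaped F32 case with internal_m = 30, n = 20 and K = 64 the
// kernel would be built with
//   -DN=20 -DK=64 -DDATA_TYPE=float -DM0=4 -DN0=16 -DPARTIAL_STORE_M0=2 -DPARTIAL_STORE_N0=4
// so boundary handling is resolved at kernel compile time rather than through
// tensor padding.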
// Create kernels according to the architecture, data type and input size. if(is_data_type_float(data_type) && is_bifrost) @@ -431,8 +427,6 @@ void CLGEMMMatrixMultiplyKernel::configure(const CLCompileContext &compile_conte { kernel_name = "gemm_mm_floating_point"; } - build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y())); - build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elements_processed.x())); } // Create kernel @@ -457,6 +451,8 @@ void CLGEMMMatrixMultiplyKernel::configure(const CLCompileContext &compile_conte _config_id += support::cpp11::to_string(output->info()->dimension(3)); _config_id += "_"; _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1))); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h rename to src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h index 4abd60c202..71d223b8ac 100644 --- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H #define ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp index da57aa447f..387f1a4ebc 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp @@ -21,22 +21,21 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/helpers/float_ops.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/float_ops.h" #include "support/StringSupport.h" #include @@ -220,6 +219,7 @@ void CLGEMMMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), (input2 != nullptr ? 
input2->info() : nullptr), output->info(), alpha, beta, lhs_info, rhs_info, gemm_info)); + auto padding_info = get_padding_info({ input0, output }); _input0 = input0; _input1 = input1; _input2 = helpers::float_ops::is_zero(beta) ? nullptr : input2; @@ -318,6 +318,8 @@ void CLGEMMMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile _config_id += support::cpp11::to_string(rhs_info.n0); _config_id += "_"; _config_id += support::cpp11::to_string(rhs_info.k0); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLGEMMMatrixMultiplyNativeKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h b/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h rename to src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h index 006b2bf91f..6b6004b464 100644 --- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H #define ARM_COMPUTE_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include "arm_compute/core/KernelDescriptors.h" diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp index 8f20de1ea1..23e18bac92 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp @@ -21,25 +21,24 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/helpers/float_ops.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLUtils.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/CL/gemm/CLGEMMHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/float_ops.h" #include "support/StringSupport.h" #include @@ -226,6 +225,7 @@ void CLGEMMMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), (input2 != nullptr ? input2->info() : nullptr), output->info(), alpha, beta, lhs_info, rhs_info, gemm_info)); + auto padding_info = get_padding_info({ input0, output }); _input0 = input0; _input1 = input1; _input2 = helpers::float_ops::is_zero(beta) ? 
nullptr : input2; @@ -329,6 +329,8 @@ void CLGEMMMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi _config_id += support::cpp11::to_string(lhs_info.interleave); _config_id += "_"; _config_id += support::cpp11::to_string(rhs_info.interleave); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLGEMMMatrixMultiplyReshapedKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, @@ -376,7 +378,7 @@ void CLGEMMMatrixMultiplyReshapedKernel::run(const Window &window, cl::CommandQu const TensorShape shape2d(_input1->info()->dimension(0) / 4, _input1->info()->dimension(1) * _input1->info()->dimension(2)); const size_t image_row_pitch = _input1->info()->strides_in_bytes()[1]; - input1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input1->cl_buffer(), shape2d, CL_FLOAT, image_row_pitch); + input1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input1->cl_buffer(), shape2d, _input1->info()->data_type(), image_row_pitch); } do diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h rename to src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h index 962645749e..2ffc322def 100644 --- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H #define ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include "arm_compute/core/KernelDescriptors.h" diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp index cf77c70bfa..1f296f8e26 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp @@ -21,19 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/utils/helpers/float_ops.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLUtils.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/CL/gemm/CLGEMMHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/float_ops.h" #include "support/StringSupport.h" #include @@ -132,7 +133,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe // In case both input and output have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
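// Context note (inferred from this function and from run() further down): when
// gemm_info.has_pad_y is true the shortcut below still clears both flags and
// dispatches a batched GEMM; when it is false the flags are kept and run() reads
// the batch stride from index 3 instead, so no REINTERPRET_*_AS_3D compile-time
// defines are needed.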
- if(reinterpret_input_as_3d == reinterpret_output_as_3d) + // This approach should only be used when the input/output tensors have padding in the y direction + if((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y) { reinterpret_output_as_3d = false; } @@ -158,16 +160,6 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowStatic input0_access(input0, 0, 0, - input0->dimension(0), - input0->dimension(1)); - AccessWindowStatic input1_access(input1, 0, 0, - input1->dimension(0), - input1->dimension(1)); - AccessWindowStatic output_access(output, 0, 0, - output->dimension(0), - output->dimension(1)); - if(input2 != nullptr) { const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; @@ -176,16 +168,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x), input2->dimension(1)); - window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor + window_changed = update_window_and_padding(win, input2_access); } - else - { - window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor - } - - output_access.set_valid_region(win_out, ValidRegion(Coordinates(), output->tensor_shape())); // Collapse along the Z direction // This collapse needs to be here in order to tune the Z dimension of LWS @@ -200,7 +184,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::CLGEMMMatrixMultiplyReshapedOnlyRHSKernel() : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false), - _add_bias(false), _broadcast_bias(false), _export_to_cl_image(false) + _add_bias(false), _broadcast_bias(false), _export_to_cl_image(false), _has_pad_y(false) { } @@ -231,10 +215,13 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext _add_bias = _input2 != nullptr; _broadcast_bias = gemm_info.broadcast_bias; _export_to_cl_image = rhs_info.export_to_cl_image; + _has_pad_y = gemm_info.has_pad_y; + + auto padding_info = get_padding_info({ input0, input1, output }); // In case both input and output have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) + if((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y) { _reinterpret_input_as_3d = false; _reinterpret_output_as_3d = false; @@ -256,28 +243,25 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m const unsigned int internal_m = _reinterpret_output_as_3d ?
gemm_info.m : output->info()->dimension(1); + // These variables are used only if gemm_info.has_pad_y == true const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(1) : input0->info()->dimension(1); const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(2) : input0->info()->dimension(2); - // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. - const unsigned int partial_store_m0 = internal_m % lhs_info.m0; - const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0; - // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. // NOTE: This might have implications on heuristics and performance const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); + // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. + const unsigned int partial_store_m0 = internal_m % internal_m0; + const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0; + // Create build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type())); build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); build_opts.add_option_if(_input2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); - build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2))); build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); @@ -295,6 +279,13 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); + if(_has_pad_y) + { + build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); + } std::string kernel_name("gemm_mm_reshaped_only_rhs_"); kernel_name += rhs_info.transpose ? 
"t" : "nt"; @@ -306,6 +297,7 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext // Set config_id for enabling LWS tuning _config_id = kernel_name; _config_id += "_"; + _config_id += (_has_pad_y ? "" : "no_pad_y_"); _config_id += (_add_bias ? "add_bias_" : ""); _config_id += (_broadcast_bias ? "broadcast_bias_" : ""); _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); @@ -330,6 +322,8 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext _config_id += support::cpp11::to_string(rhs_info.h0); _config_id += "_"; _config_id += support::cpp11::to_string(rhs_info.interleave); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, @@ -362,15 +356,24 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::Co ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0); } + const size_t lhs_idx_batch_size = _reinterpret_input_as_3d && !_has_pad_y ? 3u : 2u; + const size_t rhs_idx_batch_size = 2u; + const size_t bia_idx_batch_size = 2u; + const size_t out_idx_batch_size = _reinterpret_output_as_3d && !_has_pad_y ? 3u : 2u; + Window slice = window.first_slice_window_3D(); Window slice_matrix_b = slice; slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + // Get cross plane pads const unsigned int total_cross_plane_pad_lhs = _input0->info()->padding().top + _input0->info()->padding().bottom; const unsigned int total_cross_plane_pad_out = _output->info()->padding().top + _output->info()->padding().bottom; + // The execution should fail if we try to run with has_pad_y = false but we have padding in either the LHS or DST tensor + ARM_COMPUTE_ERROR_ON(!_has_pad_y && ((total_cross_plane_pad_lhs != 0) || (total_cross_plane_pad_out != 0))); + cl::Image2D input1_image2d; if(_export_to_cl_image) @@ -378,7 +381,7 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::Co const TensorShape shape2d(_input1->info()->dimension(0) / 4, _input1->info()->dimension(1) * _input1->info()->dimension(2)); const size_t image_row_pitch = _input1->info()->strides_in_bytes()[1]; - input1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input1->cl_buffer(), shape2d, CL_FLOAT, image_row_pitch); + input1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input1->cl_buffer(), shape2d, _input1->info()->data_type(), image_row_pitch); } do @@ -413,28 +416,28 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::Co add_2D_tensor_argument(idx, _output, slice); // LHS stride_z - _kernel.setArg(idx++, static_cast(_input0->info()->strides_in_bytes()[2])); + _kernel.setArg(idx++, static_cast(_input0->info()->strides_in_bytes()[lhs_idx_batch_size])); // RHS stride_z (not used if _export_to_cl_image == true) - _kernel.setArg(idx++, static_cast(_input1->info()->strides_in_bytes()[2])); + _kernel.setArg(idx++, static_cast(_input1->info()->strides_in_bytes()[rhs_idx_batch_size])); // Bias stride_z (if _add_bias == true) if(_add_bias) { - _kernel.setArg(idx++, static_cast(_input2->info()->strides_in_bytes()[2])); + _kernel.setArg(idx++, static_cast(_input2->info()->strides_in_bytes()[bia_idx_batch_size])); } // Output stride_z - _kernel.setArg(idx++, 
static_cast<cl_uint>(_output->info()->strides_in_bytes()[2])); + _kernel.setArg(idx++, static_cast<cl_uint>(_output->info()->strides_in_bytes()[out_idx_batch_size])); // Cross-plane padding (if _reinterpret_input_as_3d = true) - if(_reinterpret_input_as_3d) + if(_reinterpret_input_as_3d && _has_pad_y) { _kernel.setArg(idx++, static_cast<cl_uint>(total_cross_plane_pad_lhs)); } // Cross-plane padding (if _reinterpret_output_as_3d = true) - if(_reinterpret_output_as_3d) + if(_reinterpret_output_as_3d && _has_pad_y) { _kernel.setArg(idx++, static_cast<cl_uint>(total_cross_plane_pad_out)); } diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h rename to src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h index fc21f2a0f6..5b96679a46 100644 --- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H #define ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include "arm_compute/core/KernelDescriptors.h" @@ -162,6 +162,7 @@ class CLGEMMMatrixMultiplyReshapedOnlyRHSKernel : public ICLKernel bool _add_bias; bool _broadcast_bias; bool _export_to_cl_image; + bool _has_pad_y; }; } // namespace arm_compute #endif /*ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H*/ diff --git a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp index f52384593b..ee0abc56d3 100644 --- a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp @@ -21,17 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/ -#include "arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h rename to src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h index 95ed87d95b..bef8c231ac 100644 --- a/arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h +++ b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGEMMMATRIXVECTORMULTIPLYKERNEL_H #define ARM_COMPUTE_CLGEMMMATRIXVECTORMULTIPLYKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp index 156a657f28..52510075b7 100644 --- a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp +++ b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp @@ -21,21 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute @@ -126,6 +125,8 @@ void CLGEMMReshapeLHSMatrixKernel::configure(const CLCompileContext &compile_con // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), lhs_info, reinterpret_input_as_3d)); + auto padding_info = get_padding_info({ input }); + _input = input; _output = output; _reinterpret_input_as_3d = reinterpret_input_as_3d; @@ -181,6 +182,8 @@ void CLGEMMReshapeLHSMatrixKernel::configure(const CLCompileContext &compile_con _config_id += support::cpp11::to_string(lhs_info.interleave); _config_id += "_"; _config_id += support::cpp11::to_string(lhs_info.transpose); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLGEMMReshapeLHSMatrixKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) diff --git a/arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h rename to src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h index 0f74cb85e4..92202a26fc 100644 --- a/arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h +++ b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGEMMRESHAPELHSMATRIXKERNEL_H #define ARM_COMPUTE_CLGEMMRESHAPELHSMATRIXKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp index c1993b72b9..33de61ed01 100644 --- a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp +++ b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp @@ -21,22 +21,21 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/CL/gemm/CLGEMMHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute @@ -56,15 +55,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON((rhs_info.k0 == 1) && (rhs_info.transpose)); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); + if(rhs_info.export_to_cl_image) { - const TensorInfo tensor_reshaped_info(compute_rhs_reshaped_shape(*input, rhs_info), 1, DataType::F32); + const TensorInfo tensor_reshaped_info(compute_rhs_reshaped_shape(*input, rhs_info), 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(cl_gemm::validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info)); } - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); - if(output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_rhs_reshaped_shape(*input, rhs_info)); diff --git a/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h similarity index 96% rename from arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h rename to src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h index 5f953ddf8d..911484ea76 100644 --- a/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h +++ b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGEMMRESHAPERHSMATRIXKERNEL_H #define ARM_COMPUTE_CLGEMMRESHAPERHSMATRIXKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { @@ -46,6 +46,8 @@ class CLGEMMReshapeRHSMatrixKernel : public ICLKernel CLGEMMReshapeRHSMatrixKernel(CLGEMMReshapeRHSMatrixKernel &&) = default; /** Allow instances of this class to be moved */ CLGEMMReshapeRHSMatrixKernel &operator=(CLGEMMReshapeRHSMatrixKernel &&) = default; + /** Default destructor */ + ~CLGEMMReshapeRHSMatrixKernel() = default; /** Initialise the kernel's input and output. 
* * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor, @@ -53,7 +55,7 @@ class CLGEMMReshapeRHSMatrixKernel : public ICLKernel * Since the OpenCL image object is created importing the OpenCL buffer, the following conditions are required: * -# rhs_info.n0 can only be 4, 8 and 16 * -# rhs_info.k0 can only be 4, 8 and 16 - * -# Data type can only be F32 + * -# Data type can only be F32, F16 * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension * -# output width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) * -# output (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT @@ -77,7 +79,7 @@ class CLGEMMReshapeRHSMatrixKernel : public ICLKernel * Since the OpenCL image object is created importing the OpenCL buffer, the following conditions are required: * -# rhs_info.n0 can only be 4, 8 and 16 * -# rhs_info.k0 can only be 4, 8 and 16 - * -# Data type can only be F32 + * -# Data type can only be F32, F16 * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension * -# output width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) * -# output (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT @@ -102,7 +104,7 @@ class CLGEMMReshapeRHSMatrixKernel : public ICLKernel * Since the OpenCL image object is created importing the OpenCL buffer, the following conditions are required: * -# rhs_info.n0 can only be 4, 8 and 16 * -# rhs_info.k0 can only be 4, 8 and 16 - * -# Data type can only be F32 + * -# Data type can only be F32, F16 * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension * -# output width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) * -# output (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT diff --git a/src/core/CL/kernels/CLGatherKernel.cpp b/src/core/CL/kernels/CLGatherKernel.cpp index 57759fc1c1..9e802c20fb 100644 --- a/src/core/CL/kernels/CLGatherKernel.cpp +++ b/src/core/CL/kernels/CLGatherKernel.cpp @@ -21,10 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLGatherKernel.h" +#include "src/core/CL/kernels/CLGatherKernel.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/arm_compute/core/CL/kernels/CLGatherKernel.h b/src/core/CL/kernels/CLGatherKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLGatherKernel.h rename to src/core/CL/kernels/CLGatherKernel.h index c8a96327b6..8f472a4696 100644 --- a/arm_compute/core/CL/kernels/CLGatherKernel.h +++ b/src/core/CL/kernels/CLGatherKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLGATHERKERNEL_H #define ARM_COMPUTE_CLGATHERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLGaussian3x3Kernel.cpp b/src/core/CL/kernels/CLGaussian3x3Kernel.cpp index 08e7e27b3c..40e9658ab4 100644 --- a/src/core/CL/kernels/CLGaussian3x3Kernel.cpp +++ b/src/core/CL/kernels/CLGaussian3x3Kernel.cpp @@ -21,13 +21,13 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h" +#include "src/core/CL/kernels/CLGaussian3x3Kernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h b/src/core/CL/kernels/CLGaussian3x3Kernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h rename to src/core/CL/kernels/CLGaussian3x3Kernel.h index a783527de4..139b05d44c 100644 --- a/arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h +++ b/src/core/CL/kernels/CLGaussian3x3Kernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H #define ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H -#include "arm_compute/core/CL/ICLSimple2DKernel.h" +#include "src/core/CL/ICLSimple2DKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLGaussian5x5Kernel.cpp b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp index 5b3639f025..46a7576154 100644 --- a/src/core/CL/kernels/CLGaussian5x5Kernel.cpp +++ b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h" +#include "src/core/CL/kernels/CLGaussian5x5Kernel.h" #include diff --git a/arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h b/src/core/CL/kernels/CLGaussian5x5Kernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h rename to src/core/CL/kernels/CLGaussian5x5Kernel.h index e8c2268e26..711710b3b3 100644 --- a/arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h +++ b/src/core/CL/kernels/CLGaussian5x5Kernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H #define ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H -#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h" +#include "src/core/CL/kernels/CLConvolutionKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp index 0e20187d1c..065f7f7e92 100644 --- a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp +++ b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp @@ -21,13 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h" +#include "src/core/CL/kernels/CLGaussianPyramidKernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" using namespace arm_compute; diff --git a/arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h b/src/core/CL/kernels/CLGaussianPyramidKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h rename to src/core/CL/kernels/CLGaussianPyramidKernel.h index 36e095d4d1..a6595440f6 100644 --- a/arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h +++ b/src/core/CL/kernels/CLGaussianPyramidKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H #define ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H -#include "arm_compute/core/CL/ICLSimpleKernel.h" +#include "src/core/CL/ICLSimpleKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp index 3108ad87d0..dd3faf50a2 100644 --- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp +++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp @@ -21,19 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h" +#include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLArray.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute diff --git a/arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h rename to src/core/CL/kernels/CLGenerateProposalsLayerKernel.h index 9dfe4a42ce..d26795ac7d 100644 --- a/arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h +++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLGENERATEPROPOSALSLAYERKERNEL_H #define ARM_COMPUTE_CLGENERATEPROPOSALSLAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { class ICLTensor; diff --git a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp index 7f618b294b..cd3f1ee216 100644 --- a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp +++ b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp @@ -21,18 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h" +#include "src/core/CL/kernels/CLHOGDescriptorKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h b/src/core/CL/kernels/CLHOGDescriptorKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h rename to src/core/CL/kernels/CLHOGDescriptorKernel.h index c001aa2c9f..eee2fa36bc 100644 --- a/arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h +++ b/src/core/CL/kernels/CLHOGDescriptorKernel.h @@ -24,9 +24,9 @@ #ifndef ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H #define ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/IHOG.h" #include "arm_compute/core/Size2D.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLHOGDetectorKernel.cpp b/src/core/CL/kernels/CLHOGDetectorKernel.cpp index fbd2208894..861155b9a2 100644 --- a/src/core/CL/kernels/CLHOGDetectorKernel.cpp +++ b/src/core/CL/kernels/CLHOGDetectorKernel.cpp @@ -21,18 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h" +#include "src/core/CL/kernels/CLHOGDetectorKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLHOG.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" using namespace arm_compute; diff --git a/arm_compute/core/CL/kernels/CLHOGDetectorKernel.h b/src/core/CL/kernels/CLHOGDetectorKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLHOGDetectorKernel.h rename to src/core/CL/kernels/CLHOGDetectorKernel.h index dc9bba8f20..c28e6ebe74 100644 --- a/arm_compute/core/CL/kernels/CLHOGDetectorKernel.h +++ b/src/core/CL/kernels/CLHOGDetectorKernel.h @@ -26,8 +26,8 @@ #include "arm_compute/core/CL/ICLArray.h" #include "arm_compute/core/CL/ICLHOG.h" -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/CL/OpenCL.h" +#include "src/core/CL/ICLKernel.h" namespace cl { diff --git a/src/core/CL/kernels/CLHarrisCornersKernel.cpp b/src/core/CL/kernels/CLHarrisCornersKernel.cpp index 08e670f5d2..cbc056fb77 100644 --- a/src/core/CL/kernels/CLHarrisCornersKernel.cpp +++ b/src/core/CL/kernels/CLHarrisCornersKernel.cpp @@ -21,19 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h" +#include "src/core/CL/kernels/CLHarrisCornersKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/arm_compute/core/CL/kernels/CLHarrisCornersKernel.h b/src/core/CL/kernels/CLHarrisCornersKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLHarrisCornersKernel.h rename to src/core/CL/kernels/CLHarrisCornersKernel.h index 38a2f04adf..6482b0aa4e 100644 --- a/arm_compute/core/CL/kernels/CLHarrisCornersKernel.h +++ b/src/core/CL/kernels/CLHarrisCornersKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLHARRISCORNERSKERNEL_H #define ARM_COMPUTE_CLHARRISCORNERSKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include diff --git a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp index 22b2cfcbc5..8aa7366d50 100644 --- a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp +++ b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp @@ -21,17 +21,17 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h" +#include "src/core/CL/kernels/CLHeightConcatenateLayerKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/Cast.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" #include "support/StringSupport.h" @@ -39,20 +39,6 @@ namespace arm_compute { namespace { -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration) -{ - num_elems_processed_per_iteration = 4; - // The window needs to be based on input as we copy all the heights of input - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - bool window_changed = update_window_and_padding(win, input_access, output_access); - - Window win_collapsed = win.collapse(win, Window::DimZ); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win_collapsed); -} Status validate_arguments(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); @@ -72,15 +58,13 @@ Status validate_arguments(const ITensorInfo *input, unsigned int height_offset, } // namespace CLHeightConcatenateLayerKernel::CLHeightConcatenateLayerKernel() - : _height_offset(0), _num_elems_processed_per_iteration() + : _height_offset(0) { } Status CLHeightConcatenateLayerKernel::validate(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output) { - unsigned int num_elems_processed_per_iteration; ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, height_offset, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration).first); return Status{}; } @@ -89,16 +73,19 @@ void CLHeightConcatenateLayerKernel::configure(const CLCompileContext &compile_c ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, height_offset, output)); - _height_offset = height_offset; + auto padding_info = get_padding_info({ input, output }); - auto win_config = validate_and_configure_window(input, output, _num_elems_processed_per_iteration); + _height_offset = height_offset; // Add build options + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, input->dimension(0)); + CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->element_size())); - build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset)); build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->dimension(2))); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->dimension(0) % num_elems_processed_per_iteration)); if(is_data_type_quantized_asymmetric(input->data_type()) && input->quantization_info() != output->quantization_info()) { @@ -115,12 +102,14 @@ void CLHeightConcatenateLayerKernel::configure(const CLCompileContext &compile_c _kernel = create_kernel(compile_context, "concatenate_height", build_opts.options()); // Configure kernel window - ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); - - ICLKernel::configure_internal(std::get<1>(win_config)); + // The window needs to be based on input as we copy all the heights of input + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win.collapse(win, Window::DimZ)); // Set output valid region output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } void CLHeightConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) diff --git a/arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.h similarity index 97% rename from arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h rename to src/core/CL/kernels/CLHeightConcatenateLayerKernel.h index 4fa2b40881..f4cb627052 100644 --- 
a/arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h +++ b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.h @@ -25,8 +25,8 @@ #ifndef ARM_COMPUTE_CLHEIGHTCONCATENATELAYERKERNEL_H #define ARM_COMPUTE_CLHEIGHTCONCATENATELAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { @@ -72,7 +72,6 @@ class CLHeightConcatenateLayerKernel : public ICLKernel private: unsigned int _height_offset; - unsigned int _num_elems_processed_per_iteration; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLHEIGHTCONCATENATELAYERKERNEL_H */ diff --git a/src/core/CL/kernels/CLHistogramKernel.cpp b/src/core/CL/kernels/CLHistogramKernel.cpp index b8a4e8619d..ca5322aa51 100644 --- a/src/core/CL/kernels/CLHistogramKernel.cpp +++ b/src/core/CL/kernels/CLHistogramKernel.cpp @@ -21,18 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLHistogramKernel.h" +#include "src/core/CL/kernels/CLHistogramKernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLDistribution1D.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/arm_compute/core/CL/kernels/CLHistogramKernel.h b/src/core/CL/kernels/CLHistogramKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLHistogramKernel.h rename to src/core/CL/kernels/CLHistogramKernel.h index 7cb79db6e9..9c97c6590d 100644 --- a/arm_compute/core/CL/kernels/CLHistogramKernel.h +++ b/src/core/CL/kernels/CLHistogramKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLHISTOGRAMKERNEL_H #define ARM_COMPUTE_CLHISTOGRAMKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp index c94e313b9a..07309de83c 100644 --- a/src/core/CL/kernels/CLIm2ColKernel.cpp +++ b/src/core/CL/kernels/CLIm2ColKernel.cpp @@ -21,20 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h" +#include "src/core/CL/kernels/CLIm2ColKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include @@ -317,7 +317,8 @@ void CLIm2ColKernel::configure(const CLCompileContext &compile_context, const IC ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups)); - _data_layout = input->info()->data_layout(); + auto padding_info = get_padding_info({ input, output }); + _data_layout = input->info()->data_layout(); const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); @@ -358,6 +359,8 @@ void CLIm2ColKernel::configure(const CLCompileContext &compile_context, const IC _config_id += support::cpp11::to_string(output->info()->dimension(1)); _config_id += "_"; _config_id += lower_string(string_from_data_layout(_data_layout)); + + ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info)); } Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, diff --git a/arm_compute/core/CL/kernels/CLIm2ColKernel.h b/src/core/CL/kernels/CLIm2ColKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLIm2ColKernel.h rename to src/core/CL/kernels/CLIm2ColKernel.h index 7b7bd03108..2920c7d138 100644 --- a/arm_compute/core/CL/kernels/CLIm2ColKernel.h +++ b/src/core/CL/kernels/CLIm2ColKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLIM2COLKERNEL_H #define ARM_COMPUTE_CLIM2COLKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Size2D.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp index 2ad5233de8..4c3b404be7 100644 --- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp +++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp @@ -21,16 +21,17 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h" +#include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" diff --git a/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h rename to src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h index a3fdd3c4e7..d4444f0b20 100644 --- a/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h +++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H #define ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include "arm_compute/core/KernelDescriptors.h" diff --git a/src/core/CL/kernels/CLIntegralImageKernel.cpp b/src/core/CL/kernels/CLIntegralImageKernel.cpp index aff4bd9cea..5e5683d231 100644 --- a/src/core/CL/kernels/CLIntegralImageKernel.cpp +++ b/src/core/CL/kernels/CLIntegralImageKernel.cpp @@ -21,16 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h" +#include "src/core/CL/kernels/CLIntegralImageKernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/arm_compute/core/CL/kernels/CLIntegralImageKernel.h b/src/core/CL/kernels/CLIntegralImageKernel.h similarity index 97% rename from arm_compute/core/CL/kernels/CLIntegralImageKernel.h rename to src/core/CL/kernels/CLIntegralImageKernel.h index cef699ab54..0e40e3afbc 100644 --- a/arm_compute/core/CL/kernels/CLIntegralImageKernel.h +++ b/src/core/CL/kernels/CLIntegralImageKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H #define ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/CL/ICLSimple2DKernel.h" +#include "src/core/CL/ICLKernel.h" +#include "src/core/CL/ICLSimple2DKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp index a68d8db3c0..9e91d98f7c 100644 --- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp +++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp @@ -21,17 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h" +#include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" diff --git a/arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h rename to src/core/CL/kernels/CLL2NormalizeLayerKernel.h index 55fe563954..edc0585217 100644 --- a/arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h +++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLL2NORMALIZELAYERKERNEL_H #define ARM_COMPUTE_CLL2NORMALIZELAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLLKTrackerKernel.cpp b/src/core/CL/kernels/CLLKTrackerKernel.cpp index fae5fe2c8e..a439c2448e 100644 --- a/src/core/CL/kernels/CLLKTrackerKernel.cpp +++ b/src/core/CL/kernels/CLLKTrackerKernel.cpp @@ -21,18 +21,17 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h" +#include "src/core/CL/kernels/CLLKTrackerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLArray.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Coordinates.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/WindowHelpers.h" #include diff --git a/arm_compute/core/CL/kernels/CLLKTrackerKernel.h b/src/core/CL/kernels/CLLKTrackerKernel.h similarity index 89% rename from arm_compute/core/CL/kernels/CLLKTrackerKernel.h rename to src/core/CL/kernels/CLLKTrackerKernel.h index fdc2ef8333..2d2966854a 100644 --- a/arm_compute/core/CL/kernels/CLLKTrackerKernel.h +++ b/src/core/CL/kernels/CLLKTrackerKernel.h @@ -25,8 +25,8 @@ #define ARM_COMPUTE_CLLKTRACKERKERNEL_H #include "arm_compute/core/CL/ICLArray.h" -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" #include #include @@ -35,40 +35,6 @@ namespace arm_compute { class ICLTensor; -/** Internal keypoint structure for Lucas-Kanade Optical Flow */ -struct CLLKInternalKeypoint -{ - float x{ 0.f }; /**< x coordinate of the keypoint */ - float y{ 0.f }; /**< y coordinate of the keypoint */ - float tracking_status{ 0.f }; /**< the tracking status of the keypoint */ - float dummy{ 0.f }; /**< Dummy field, to make sure the data structure 128-bit align, so that GPU can use vload4 */ -}; - -/** Structure for storing Spatial Gradient Matrix and the minimum eigenvalue for each keypoint */ -struct CLCoefficientTable -{ - float A11; /**< iA11 * FLT_SCALE */ - float A12; 
/**< iA11 * FLT_SCALE */ - float A22; /**< iA11 * FLT_SCALE */ - float min_eig; /**< Minimum eigenvalue */ -}; - -/** Structure for storing ival, ixval and iyval for each point inside the window */ -struct CLOldValue -{ - int16_t ival; /**< ival extracts from old image */ - int16_t ixval; /**< ixval extracts from scharr Gx image */ - int16_t iyval; /**< iyval extracts from scharr Gy image */ - int16_t dummy; /**< Dummy field, to make sure the data structure 128-bit align, so that GPU can use vload4 */ -}; - -/** Interface for OpenCL Array of Internal Key Points. */ -using ICLLKInternalKeypointArray = ICLArray<CLLKInternalKeypoint>; -/** Interface for OpenCL Array of Coefficient Tables. */ -using ICLCoefficientTableArray = ICLArray<CLCoefficientTable>; -/** Interface for OpenCL Array of Old Values. */ -using ICLOldValArray = ICLArray<CLOldValue>; - /** Interface to run the initialization step of LKTracker */ class CLLKTrackerInitKernel : public ICLKernel { diff --git a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp index 0da0d4ca1f..49e04c32c2 100644 --- a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp @@ -21,17 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" namespace arm_compute { diff --git a/arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h rename to src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h index d5653f83ea..5d0a22afa5 100644 --- a/arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h +++ b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLLOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H #define ARM_COMPUTE_CLLOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp b/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp index ef8ebd52e5..9845dd6169 100644 --- a/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp +++ b/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp @@ -21,17 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
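For context on the structures being moved out of the public header above: each deleted record is deliberately padded to a full 4-lane vector so the OpenCL kernels can fetch one record with a single vload4 rather than several scalar loads. A self-contained illustration with hypothetical mirror types (not part of the patch):

#include <cstdint>

// Mirrors CLLKInternalKeypoint: four floats, 16 bytes, one float4 / vload4.
struct KeypointRecord
{
    float x{ 0.f }, y{ 0.f }, tracking_status{ 0.f }, dummy{ 0.f };
};

// Mirrors CLOldValue: four int16_t, 8 bytes, one short4 / vload4.
struct OldValueRecord
{
    int16_t ival{ 0 }, ixval{ 0 }, iyval{ 0 }, dummy{ 0 };
};

static_assert(sizeof(KeypointRecord) == 4 * sizeof(float), "padded for a single vload4");
static_assert(sizeof(OldValueRecord) == 4 * sizeof(int16_t), "padded for a single vload4");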
*/ -#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h" +#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h b/src/core/CL/kernels/CLMagnitudePhaseKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h rename to src/core/CL/kernels/CLMagnitudePhaseKernel.h index a741b1745a..514036b2ff 100644 --- a/arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h +++ b/src/core/CL/kernels/CLMagnitudePhaseKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H #define ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp index 08c74642f4..2a1312af94 100644 --- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp +++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp @@ -21,21 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLMaxUnpoolingLayerKernel.h" +#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute diff --git a/arm_compute/core/CL/kernels/CLMaxUnpoolingLayerKernel.h b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLMaxUnpoolingLayerKernel.h rename to src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h index 9d51f6b59c..86267ec0f7 100644 --- a/arm_compute/core/CL/kernels/CLMaxUnpoolingLayerKernel.h +++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLMAXUNPOOLINGLAYERKERNEL_H #define ARM_COMPUTE_CLMAXUNPOOLINGLAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLMeanStdDevKernel.cpp b/src/core/CL/kernels/CLMeanStdDevKernel.cpp index 33099c928d..aed6e6eaf7 100644 --- a/src/core/CL/kernels/CLMeanStdDevKernel.cpp +++ b/src/core/CL/kernels/CLMeanStdDevKernel.cpp @@ -21,18 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h" +#include "src/core/CL/kernels/CLMeanStdDevKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Window.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/CL/kernels/CLMeanStdDevKernel.h b/src/core/CL/kernels/CLMeanStdDevKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLMeanStdDevKernel.h rename to src/core/CL/kernels/CLMeanStdDevKernel.h index 2a5a5f2e33..179a2025b7 100644 --- a/arm_compute/core/CL/kernels/CLMeanStdDevKernel.h +++ b/src/core/CL/kernels/CLMeanStdDevKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLMEANSTDDEVKERNEL_H #define ARM_COMPUTE_CLMEANSTDDEVKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace cl { diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp index 5ecbb4b2a6..a889df7930 100644 --- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp +++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp @@ -21,18 +21,17 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h" +#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Window.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute diff --git a/arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h rename to src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h index ff5e9ab0f7..a1ba2b905e 100644 --- a/arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h +++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLMEANSTDDEVNORMALIZATIONKERNEL_H #define ARM_COMPUTE_CLMEANSTDDEVNORMALIZATIONKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLMedian3x3Kernel.cpp b/src/core/CL/kernels/CLMedian3x3Kernel.cpp index 5f8c9e5a93..23a21d6b19 100644 --- a/src/core/CL/kernels/CLMedian3x3Kernel.cpp +++ b/src/core/CL/kernels/CLMedian3x3Kernel.cpp @@ -21,13 +21,13 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h" +#include "src/core/CL/kernels/CLMedian3x3Kernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" using namespace arm_compute; diff --git a/arm_compute/core/CL/kernels/CLMedian3x3Kernel.h b/src/core/CL/kernels/CLMedian3x3Kernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLMedian3x3Kernel.h rename to src/core/CL/kernels/CLMedian3x3Kernel.h index ccb475360f..8cc5ed7279 100644 --- a/arm_compute/core/CL/kernels/CLMedian3x3Kernel.h +++ b/src/core/CL/kernels/CLMedian3x3Kernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLMEDIAN3X3KERNEL_H #define ARM_COMPUTE_CLMEDIAN3X3KERNEL_H -#include "arm_compute/core/CL/ICLSimple2DKernel.h" +#include "src/core/CL/ICLSimple2DKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLMemsetKernel.cpp b/src/core/CL/kernels/CLMemsetKernel.cpp index f591c2f6d5..2543b07a1a 100644 --- a/src/core/CL/kernels/CLMemsetKernel.cpp +++ b/src/core/CL/kernels/CLMemsetKernel.cpp @@ -21,10 +21,10 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLMemsetKernel.h" +#include "src/core/CL/kernels/CLMemsetKernel.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute diff --git a/arm_compute/core/CL/kernels/CLMemsetKernel.h b/src/core/CL/kernels/CLMemsetKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLMemsetKernel.h rename to src/core/CL/kernels/CLMemsetKernel.h index 5bda480306..dc103f580f 100644 --- a/arm_compute/core/CL/kernels/CLMemsetKernel.h +++ b/src/core/CL/kernels/CLMemsetKernel.h @@ -24,9 +24,9 @@ #ifndef ARM_COMPUTE_CLMEMSETKERNEL_H #define ARM_COMPUTE_CLMEMSETKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp index 5f0e48dbb9..7017efa3c2 100644 --- a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp +++ b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp @@ -21,16 +21,17 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h" +#include "src/core/CL/kernels/CLMinMaxLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h b/src/core/CL/kernels/CLMinMaxLayerKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h rename to src/core/CL/kernels/CLMinMaxLayerKernel.h index a693cfdb27..aa2ff3f375 100644 --- a/arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h +++ b/src/core/CL/kernels/CLMinMaxLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLMINMAXLAYERKERNEL_H #define ARM_COMPUTE_CLMINMAXLAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp index 9bbda40782..675cfc19a9 100644 --- a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp +++ b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp @@ -21,14 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h" +#include "src/core/CL/kernels/CLMinMaxLocationKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h b/src/core/CL/kernels/CLMinMaxLocationKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h rename to src/core/CL/kernels/CLMinMaxLocationKernel.h index fbcf69752c..2196abe033 100644 --- a/arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h +++ b/src/core/CL/kernels/CLMinMaxLocationKernel.h @@ -25,7 +25,7 @@ #define ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H #include "arm_compute/core/CL/ICLArray.h" -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include diff --git a/src/core/CL/kernels/CLNonLinearFilterKernel.cpp b/src/core/CL/kernels/CLNonLinearFilterKernel.cpp index 16e5113c62..c73acaf1d8 100644 --- a/src/core/CL/kernels/CLNonLinearFilterKernel.cpp +++ b/src/core/CL/kernels/CLNonLinearFilterKernel.cpp @@ -21,18 +21,17 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h" +#include "src/core/CL/kernels/CLNonLinearFilterKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h b/src/core/CL/kernels/CLNonLinearFilterKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h rename to src/core/CL/kernels/CLNonLinearFilterKernel.h index cee64480b6..ed42063d2b 100644 --- a/arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h +++ b/src/core/CL/kernels/CLNonLinearFilterKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H #define ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H -#include "arm_compute/core/CL/ICLSimple2DKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLSimple2DKernel.h" #include diff --git a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp index 958d94ce11..7d5c5ba7e1 100644 --- a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp +++ b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h" +#include "src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" @@ -30,6 +30,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h rename to src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h index d1bba4f480..d9ed60ce6b 100644 --- a/arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h +++ b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H #define ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H -#include "arm_compute/core/CL/ICLSimple2DKernel.h" +#include "src/core/CL/ICLSimple2DKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp index 7d8e5db2b4..d1982e77b9 100644 --- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp +++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp @@ -21,17 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h" +#include "src/core/CL/kernels/CLNormalizationLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/NormalizationHelpers.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" using namespace arm_compute; diff --git a/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h b/src/core/CL/kernels/CLNormalizationLayerKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h rename to src/core/CL/kernels/CLNormalizationLayerKernel.h index 6233d28b0a..739a2ae9f1 100644 --- a/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h +++ b/src/core/CL/kernels/CLNormalizationLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLNORMALIZATIONLAYERKERNEL_H #define ARM_COMPUTE_CLNORMALIZATIONLAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp index 00bdac3441..18cbe217be 100644 --- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp +++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp @@ -21,17 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" +#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" diff --git a/arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h rename to src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h index 2e2e60df0b..6db4433e78 100644 --- a/arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h +++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYERKERNEL_H #define ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLPadLayerKernel.cpp b/src/core/CL/kernels/CLPadLayerKernel.cpp index c05df61edf..485676667c 100644 --- a/src/core/CL/kernels/CLPadLayerKernel.cpp +++ b/src/core/CL/kernels/CLPadLayerKernel.cpp @@ -21,11 +21,13 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h" +#include "src/core/CL/kernels/CLPadLayerKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute @@ -135,7 +137,6 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); - build_opts.add_option("-DSELECT_DT=" + get_cl_select_type_from_data_type(data_type)); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size)); build_opts.add_option("-DPAD_X_BEFORE=" + support::cpp11::to_string(pad_x_before)); build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width)); diff --git a/arm_compute/core/CL/kernels/CLPadLayerKernel.h b/src/core/CL/kernels/CLPadLayerKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLPadLayerKernel.h rename to src/core/CL/kernels/CLPadLayerKernel.h index 5bf5841803..2b0abb18df 100644 --- a/arm_compute/core/CL/kernels/CLPadLayerKernel.h +++ b/src/core/CL/kernels/CLPadLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLPADLAYERKERNEL_H #define ARM_COMPUTE_CLPADLAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLPermuteKernel.cpp b/src/core/CL/kernels/CLPermuteKernel.cpp index 1636e5a1bc..4d289f28e6 100644 --- a/src/core/CL/kernels/CLPermuteKernel.cpp +++ b/src/core/CL/kernels/CLPermuteKernel.cpp @@ -21,10 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR 
THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLPermuteKernel.h" +#include "src/core/CL/kernels/CLPermuteKernel.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" using namespace arm_compute; @@ -75,16 +76,16 @@ void CLPermuteKernel::configure(const ICLTensor *input, ICLTensor *output, const void CLPermuteKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + const TensorShape output_shape = get_output_shape(input->info(), perm); + // Output auto initialization if not yet initialized + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), perm)); _input = input; _output = output; _perm = perm; - const TensorShape output_shape = get_output_shape(input->info(), perm); - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); - // Create kernel CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type()))); diff --git a/arm_compute/core/CL/kernels/CLPermuteKernel.h b/src/core/CL/kernels/CLPermuteKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLPermuteKernel.h rename to src/core/CL/kernels/CLPermuteKernel.h index bb841b1c83..d1bb875d7a 100644 --- a/arm_compute/core/CL/kernels/CLPermuteKernel.h +++ b/src/core/CL/kernels/CLPermuteKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLPERMUTEKERNEL_H #define ARM_COMPUTE_CLPERMUTEKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp index 229937ef31..a6255f8018 100644 --- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp +++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp @@ -21,15 +21,17 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
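The CLPermuteKernel hunk above swaps the order of output auto-initialisation and validation: the output TensorInfo is shaped from the permuted input before validate_arguments() runs, so validation no longer has to special-case an empty output. Condensed into a sketch (get_output_shape and validate_arguments stand for the file-local helpers named in the diff; this is illustrative, not library code):

void configure_ordering_sketch(arm_compute::ITensorInfo *input, arm_compute::ITensorInfo *output,
                               const arm_compute::PermutationVector &perm)
{
    using namespace arm_compute;

    // 1) Derive the permuted shape and fill in the output if it is empty ...
    const TensorShape output_shape = get_output_shape(input, perm);
    auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));

    // 2) ... then validate against an output that is guaranteed to be shaped.
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, perm));

    // ... build options and kernel creation follow, as in the diff ...
}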
*/ -#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h" +#include "src/core/CL/kernels/CLPixelWiseMultiplicationKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/utils/misc/Cast.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" #include "support/StringSupport.h" namespace arm_compute diff --git a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h rename to src/core/CL/kernels/CLPixelWiseMultiplicationKernel.h index 6b5bd11bde..0cc4005875 100644 --- a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h +++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H #define ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp index d60e196b7f..79843cd299 100644 --- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp +++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp @@ -21,20 +21,21 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h" +#include "src/core/CL/kernels/CLPoolingLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/CL/ICLKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include @@ -50,10 +51,14 @@ namespace // Internal window config info using CLPoolingConfig = std::pair; //num_elems_processed_per_iteration, border_size -void auto_init(const ITensorInfo *input, ITensorInfo *output, PoolingLayerInfo pool_info) +void auto_init(const ITensorInfo *input, ITensorInfo *output, ITensorInfo *indices, PoolingLayerInfo pool_info) { TensorShape out_shape = compute_pool_shape(*input, pool_info); auto_init_if_empty(*output, input->clone()->set_tensor_shape(out_shape)); + if(indices) + { + auto_init_if_empty(*indices, input->clone()->set_tensor_shape(out_shape).set_data_type(DataType::U32)); + } } Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) @@ -63,16 +68,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type == PoolingType::L2), "Unsupported combination of parameters!"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(input->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding() - && (input->data_layout() == DataLayout::NHWC), - "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types"); + // Check indices if(indices) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); + + if(indices->total_size() != 0) + { + TensorInfo idx_info(TensorInfo(compute_pool_shape(*input, pool_info), 1, DataType::U32)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &idx_info); + } } // Checks performed when output is configured @@ -108,9 +116,9 @@ std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITenso const int pool_pad_top = pad_stride_info.pad_top(); const int pool_pad_left = pad_stride_info.pad_left(); const int pool_pad_bottom = pad_stride_info.pad_bottom(); - BorderSize border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left); + BorderSize border_size = BorderSize(); - auto_init(input, output, pool_info); + auto_init(input, output, indices, pool_info); pooled_w = output->tensor_shape()[idx_width]; pooled_h = output->tensor_shape()[idx_height]; @@ -126,6 +134,8 @@ std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITenso { case DataLayout::NCHW: { + // Initialize border size + border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left); // Change the number of elements processed per iteration // for pooling 3x3 with stride less equal than 3 const bool can_optimize = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type); @@ -165,27 +175,17 @@ std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITenso } case DataLayout::NHWC: { - num_elems_processed_per_iteration = 8; + // Initialize border size + border_size = BorderSize(); + num_elems_processed_per_iteration = adjust_vec_size(4, output->dimension(0)); win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - AccessWindowStatic input_access(input, - 0, -1, - ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration), input->dimension(1)); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - - // Update indices window - if(indices) - { - AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration); - window_changed = update_window_and_padding(win, input_access, output_access, indices_access); - indices_access.set_valid_region(win, ValidRegion(Coordinates(), indices->tensor_shape())); - } - else + if(indices != nullptr) { - window_changed = update_window_and_padding(win, input_access, output_access); + indices->set_valid_region(ValidRegion(Coordinates(), indices->tensor_shape())); } - output_access.set_valid_region(win,
ValidRegion(Coordinates(), output->tensor_shape())); + output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); break; } default: @@ -216,6 +216,8 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + auto padding_info = get_padding_info({ input, output, indices }); + // Set instance variables _input = input; _output = output; @@ -228,6 +230,7 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); + const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES); const int pool_size_x = pool_info.is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size.width; const int pool_size_y = pool_info.is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size.height; const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; @@ -246,17 +249,11 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); ICLKernel::configure_internal(std::get<1>(win_config)); - if(_data_layout == DataLayout::NCHW) - { - CLPoolingConfig pooling_config = std::get<2>(win_config); - _num_elems_processed_per_iteration = pooling_config.first; - _border_size = pooling_config.second; - } - else - { - _border_size = BorderSize(1, 0, 0, 0); - _num_elems_processed_per_iteration = 8; - } + CLPoolingConfig pooling_config = std::get<2>(win_config); + _num_elems_processed_per_iteration = pooling_config.first; + _border_size = pooling_config.second; + + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration)); // Tensor paddings are used to calculate the indicies for MAX pooling if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && _indices && is_data_type_float(data_type)) @@ -282,7 +279,8 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co } // Check output dimensions - auto_init(input->info(), output->info(), pool_info); + auto_init(input->info(), output->info(), indices ? indices->info() : nullptr, pool_info); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr)); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); @@ -314,19 +312,20 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co build_opts.add_option("-DINITIAL_VALUE=0"); } - const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision; - const auto use_wider_accumulator = use_fp_mixed_precision && (pool_type != PoolingType::MAX); - const auto acc_data_type = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : data_type); - build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type); - build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION"); + build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width) + (exclude_padding ? 
0 : pool_pad_left))); + build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top))); // Create kernel switch(_data_layout) { case DataLayout::NCHW: { - build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left))); - build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top))); + const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision; + const auto use_wider_accumulator = use_fp_mixed_precision && (pool_type != PoolingType::MAX); + const auto acc_data_type = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : data_type); + build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type); + build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION"); + if(pool_type != PoolingType::MAX) { build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING"); @@ -365,26 +364,38 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co } case DataLayout::NHWC: { + // Floating point mixed precision is supported on F16 only + const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX; + + // Wider accumulation is required to avoid accuracy loss + // Case 1: Floating point mixed precision (fp16 input data and fp32 accumulation) + // Case 2: Quantized (int8/uint8 input data and int32 accumulation) + DataType acc_data_type = data_type; + + if(use_fp_mixed_precision) + { + acc_data_type = DataType::F32; + } + else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX) + { + acc_data_type = DataType::S32; + } + + build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(acc_data_type)); + build_opts.add_option_if(use_fp_mixed_precision, "-DFP_MIXED_PRECISION"); build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING"); - build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width))); - build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height))); - build_opts.add_option_if(output->info()->tensor_shape().total_size_upper(3) > 1, - "-DDST_DEPTH=" + support::cpp11::to_string(output->info()->dimension(idx_height))); - build_opts.add_option_if(output->info()->tensor_shape().total_size_upper(3) > 1, - "-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->tensor_shape().total_size_upper(3))); - - if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && _indices && is_data_type_float(data_type)) + build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width))); + build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height))); + build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(output->info()->dimension(idx_height))); + build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(output->info()->dimension(idx_channel))); + build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch_size))); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % _num_elems_processed_per_iteration)); + if(pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type)) { - if(data_type ==
DataType::F32) - { - std::string kernel_name = "pooling_layer_2_nhwc_indices_fp32"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } - else if(data_type == DataType::F16) - { - std::string kernel_name = "pooling_layer_2_nhwc_indices_fp16"; - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - } + build_opts.add_option_if(_indices != nullptr && pool_type == PoolingType::MAX, "-DEXTRACT_MAX_INDEX"); + + std::string kernel_name = "pooling_layer_2x2_nhwc"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); } else { @@ -410,6 +421,8 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co _config_id += support::cpp11::to_string(output->info()->dimension(idx_channel)); _config_id += "_"; _config_id += lower_string(string_from_data_layout(input->info()->data_layout())); + + ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info)); } Status CLPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) @@ -452,7 +465,7 @@ void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue) unsigned int idx = 0; add_3D_tensor_argument(idx, _input, in_slice); add_3D_tensor_argument(idx, _output, slice); - if(_indices && is_data_type_float(_input->info()->data_type()) && (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2))) + if(_indices && is_data_type_float(_input->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2))) { add_3D_tensor_argument(idx, _indices, slice); } @@ -463,14 +476,14 @@ void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue) } case DataLayout::NHWC: { - const size_t total_batches = _output->info()->tensor_shape().total_size_upper(3); + const size_t batch_size = _output->info()->tensor_shape().total_size_upper(3); Window slice = window_collapsed.first_slice_window_4D(); Window in_slice = window_collapsed.first_slice_window_4D(); in_slice.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration)); in_slice.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x)); in_slice.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y)); - in_slice.set(3, Window::Dimension(0, total_batches, 1)); + in_slice.set(3, Window::Dimension(0, batch_size, 1)); do { // Set inputs diff --git a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h b/src/core/CL/kernels/CLPoolingLayerKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLPoolingLayerKernel.h rename to src/core/CL/kernels/CLPoolingLayerKernel.h index 85585e4587..d88402a792 100644 --- a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h +++ b/src/core/CL/kernels/CLPoolingLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLPOOLINGLAYERKERNEL_H #define ARM_COMPUTE_CLPOOLINGLAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include "arm_compute/core/Error.h" diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp index 3429ef75d1..7b9caf0063 100644 --- a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp +++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp @@ -21,17 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
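
Annotation: the NHWC pooling hunks above fold the per-type kernels (pooling_layer_2_nhwc_indices_fp32/fp16) into a single pooling_layer_2x2_nhwc kernel gated by -DEXTRACT_MAX_INDEX, and select a wider accumulator for non-MAX pooling. A minimal C++ sketch of that accumulator rule; the enum and function names here are illustrative, not library code:

```cpp
enum class DT { F16, F32, QASYMM8, QASYMM8_SIGNED, S32 };
enum class PoolType { MAX, AVG, L2 };

// Mirrors the ACC_DATA_TYPE selection in the NHWC branch: fp16 with
// mixed precision widens to fp32, quantized non-MAX pooling widens to
// s32, and MAX pooling always accumulates in the input type.
DT accumulator_type(DT data_type, PoolType pool_type, bool fp_mixed_precision)
{
    const bool is_quantized = (data_type == DT::QASYMM8 || data_type == DT::QASYMM8_SIGNED);
    if(data_type == DT::F16 && fp_mixed_precision && pool_type != PoolType::MAX)
    {
        return DT::F32; // Case 1: fp16 input data, fp32 accumulation
    }
    if(is_quantized && pool_type != PoolType::MAX)
    {
        return DT::S32; // Case 2: int8/uint8 input data, int32 accumulation
    }
    return data_type;
}
```
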
*/ -#include "arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h" +#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" diff --git a/arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h b/src/core/CL/kernels/CLPriorBoxLayerKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h rename to src/core/CL/kernels/CLPriorBoxLayerKernel.h index b4a69ac496..6c369a7a4e 100644 --- a/arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h +++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLPRIORBOXLAYERKERNEL_H #define ARM_COMPUTE_CLPRIORBOXLAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp index 2f676d30d1..3a66d084b9 100644 --- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp +++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp @@ -21,10 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" +#include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute diff --git a/arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h rename to src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h index 51c50bc011..31085c37ba 100644 --- a/arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h +++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLQLSTMLAYERVNORMALIZATIONKERNEL_H #define ARM_COMPUTE_CLQLSTMLAYERVNORMALIZATIONKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp index f6b08884e7..76e703f0dd 100644 --- a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp +++ b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp @@ -21,18 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
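
Annotation: the renames above repeat across every kernel in this patch — kernel headers and the window/auto-configuration helpers leave the public arm_compute/ include tree and become internal to src/. In any one translation unit the migration looks like this (paths taken directly from the hunks above):

```cpp
// Before: kernel and helper headers lived in the public include tree.
// #include "arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h"
// #include "arm_compute/core/CL/ICLKernel.h"

// After: kernels plus the window/auto-config helpers are internal-only.
#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h"
#include "src/core/CL/ICLKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
```
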
*/ -#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h" +#include "src/core/CL/kernels/CLQuantizationLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute @@ -52,26 +52,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) return Status{}; } - -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) -{ - // Configure kernel window - Window win = calculate_max_window(*input, Steps()); - - const int vec_size_x = 16 / input->element_size(); - const int input_width_x = input->tensor_shape().x(); - const bool multi_access_x = (input_width_x / vec_size_x > 0); - if(multi_access_x) - { - win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); - } - - Coordinates coord; - coord.set_num_dimensions(output->num_dimensions()); - output->set_valid_region(ValidRegion(coord, output->tensor_shape())); - - return std::make_pair(Status{}, win); -} } // namespace CLQuantizationLayerKernel::CLQuantizationLayerKernel() @@ -87,6 +67,9 @@ void CLQuantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *out void CLQuantizationLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + auto padding_info = get_padding_info({ input, output }); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); _input = input; @@ -96,11 +79,6 @@ void CLQuantizationLayerKernel::configure(const CLCompileContext &compile_contex const int input_width_x = input->info()->tensor_shape().x(); const bool multi_access_x = (input_width_x / vec_size_x > 0); - // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - const UniformQuantizationInfo qinfo = output->info()->quantization_info().uniform(); const DataType output_data_type = output->info()->data_type(); @@ -160,13 +138,23 @@ void CLQuantizationLayerKernel::configure(const CLCompileContext &compile_contex build_opts.add_option("-DMAX_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.second)); _kernel = create_kernel(compile_context, "quantization_layer", build_opts.options()); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps()); + if(multi_access_x) + { + win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); + } + ICLKernel::configure_internal(win); + + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLQuantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); - 
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first); - return Status{}; } diff --git a/arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h b/src/core/CL/kernels/CLQuantizationLayerKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h rename to src/core/CL/kernels/CLQuantizationLayerKernel.h index b0144bf8b0..e9d03decb3 100644 --- a/arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h +++ b/src/core/CL/kernels/CLQuantizationLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLQUANTIZATIONLAYERKERNEL_H #define ARM_COMPUTE_CLQUANTIZATIONLAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp index 3f2a904f58..38eafc6e97 100644 --- a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp +++ b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp @@ -21,20 +21,21 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h" +#include "src/core/CL/kernels/CLROIAlignLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLArray.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" using namespace arm_compute::misc::shape_calculator; diff --git a/arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h b/src/core/CL/kernels/CLROIAlignLayerKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h rename to src/core/CL/kernels/CLROIAlignLayerKernel.h index 6a0468d331..cbf0e00165 100644 --- a/arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h +++ b/src/core/CL/kernels/CLROIAlignLayerKernel.h @@ -25,7 +25,7 @@ #define ARM_COMPUTE_CLROIALIGNLAYERKERNEL_H #include "arm_compute/core/CL/ICLArray.h" -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp index c2ed32653a..43492a3d50 100644 --- a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp +++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp @@ -21,19 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
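
Annotation: the quantization hunk above shows the guard this patch threads through each configure(): snapshot tensor padding on entry, build a window without implicit borders, and assert on exit that nothing grew. A hedged sketch of that contract with stand-in types — get_padding_info()/has_padding_changed() are the library's real helpers; everything below is illustrative only:

```cpp
#include <initializer_list>
#include <map>

// Stand-in for ITensorInfo: only the fields the guard cares about.
struct TensorInfoStub
{
    unsigned int pad_left = 0, pad_right = 0, pad_top = 0, pad_bottom = 0;
};

using PaddingSnapshot = std::map<const TensorInfoStub *, TensorInfoStub>;

// Analogue of get_padding_info(): record each tensor's padding at
// configure() entry.
PaddingSnapshot snapshot_padding(std::initializer_list<const TensorInfoStub *> tensors)
{
    PaddingSnapshot snap;
    for(const auto *t : tensors)
    {
        if(t != nullptr)
        {
            snap[t] = *t;
        }
    }
    return snap;
}

// Analogue of has_padding_changed(): true if window configuration grew
// any tensor's padding behind the caller's back.
bool padding_changed(const PaddingSnapshot &snap)
{
    for(const auto &entry : snap)
    {
        const TensorInfoStub &before = entry.second;
        const TensorInfoStub &now    = *entry.first;
        if(now.pad_left != before.pad_left || now.pad_right != before.pad_right
           || now.pad_top != before.pad_top || now.pad_bottom != before.pad_bottom)
        {
            return true;
        }
    }
    return false;
}
```
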
*/ -#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h" +#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLArray.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h b/src/core/CL/kernels/CLROIPoolingLayerKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h rename to src/core/CL/kernels/CLROIPoolingLayerKernel.h index ee422e10ee..35f42a9676 100644 --- a/arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h +++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLROIPOOLINGLAYERKERNEL_H #define ARM_COMPUTE_CLROIPOOLINGLAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include "arm_compute/core/CL/ICLArray.h" diff --git a/src/core/CL/kernels/CLRangeKernel.cpp b/src/core/CL/kernels/CLRangeKernel.cpp index d46cdd78da..892f1c7c9f 100644 --- a/src/core/CL/kernels/CLRangeKernel.cpp +++ b/src/core/CL/kernels/CLRangeKernel.cpp @@ -21,12 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLRangeKernel.h" +#include "src/core/CL/kernels/CLRangeKernel.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Utils.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" using namespace arm_compute; diff --git a/arm_compute/core/CL/kernels/CLRangeKernel.h b/src/core/CL/kernels/CLRangeKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLRangeKernel.h rename to src/core/CL/kernels/CLRangeKernel.h index b5c64b2480..1b94a099ed 100644 --- a/arm_compute/core/CL/kernels/CLRangeKernel.h +++ b/src/core/CL/kernels/CLRangeKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLRANGEKERNEL_H #define ARM_COMPUTE_CLRANGEKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp index 0ba63cc4e0..9d49a2193a 100644 --- a/src/core/CL/kernels/CLReductionOperationKernel.cpp +++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp @@ -21,19 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h" +#include "src/core/CL/kernels/CLReductionOperationKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" diff --git a/arm_compute/core/CL/kernels/CLReductionOperationKernel.h b/src/core/CL/kernels/CLReductionOperationKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLReductionOperationKernel.h rename to src/core/CL/kernels/CLReductionOperationKernel.h index 2ecd1c9fd4..ff9fd61484 100644 --- a/arm_compute/core/CL/kernels/CLReductionOperationKernel.h +++ b/src/core/CL/kernels/CLReductionOperationKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H #define ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLRemapKernel.cpp b/src/core/CL/kernels/CLRemapKernel.cpp index fe8c81a3b9..0ebeefcc74 100644 --- a/src/core/CL/kernels/CLRemapKernel.cpp +++ b/src/core/CL/kernels/CLRemapKernel.cpp @@ -21,17 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLRemapKernel.h" +#include "src/core/CL/kernels/CLRemapKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/WindowHelpers.h" #include diff --git a/arm_compute/core/CL/kernels/CLRemapKernel.h b/src/core/CL/kernels/CLRemapKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLRemapKernel.h rename to src/core/CL/kernels/CLRemapKernel.h index fd261cd465..8efcf091ed 100644 --- a/arm_compute/core/CL/kernels/CLRemapKernel.h +++ b/src/core/CL/kernels/CLRemapKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLREMAPKERNEL_H #define ARM_COMPUTE_CLREMAPKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLReorgLayerKernel.cpp b/src/core/CL/kernels/CLReorgLayerKernel.cpp index ab81a8fca3..662c790ca2 100644 --- a/src/core/CL/kernels/CLReorgLayerKernel.cpp +++ b/src/core/CL/kernels/CLReorgLayerKernel.cpp @@ -21,17 +21,17 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLReorgLayerKernel.h" +#include "src/core/CL/kernels/CLReorgLayerKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/arm_compute/core/CL/kernels/CLReorgLayerKernel.h b/src/core/CL/kernels/CLReorgLayerKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLReorgLayerKernel.h rename to src/core/CL/kernels/CLReorgLayerKernel.h index e3edc9f724..455a6170c6 100644 --- a/arm_compute/core/CL/kernels/CLReorgLayerKernel.h +++ b/src/core/CL/kernels/CLReorgLayerKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLREORGLAYERKERNEL_H #define ARM_COMPUTE_CLREORGLAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLReshapeLayerKernel.cpp b/src/core/CL/kernels/CLReshapeLayerKernel.cpp index 3daf21a9a7..58d7843624 100644 --- a/src/core/CL/kernels/CLReshapeLayerKernel.cpp +++ b/src/core/CL/kernels/CLReshapeLayerKernel.cpp @@ -21,20 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h" +#include "src/core/CL/kernels/CLReshapeLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/Cast.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" #include @@ -62,6 +62,8 @@ void CLReshapeLayerKernel::configure(const CLCompileContext &compile_context, co ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output)); + auto padding_info = get_padding_info({ input, output }); + // Create kernel std::set build_opts = { "-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->element_size()) }; _kernel = create_kernel(compile_context, "reshape_layer", build_opts); @@ -91,6 +93,8 @@ void CLReshapeLayerKernel::configure(const CLCompileContext &compile_context, co // Set the output valid region output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLReshapeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output) diff --git a/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h b/src/core/CL/kernels/CLReshapeLayerKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLReshapeLayerKernel.h rename to src/core/CL/kernels/CLReshapeLayerKernel.h index 6e3f255c52..902c44649b 100644 --- 
a/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h +++ b/src/core/CL/kernels/CLReshapeLayerKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLRESHAPELAYERKERNEL_H #define ARM_COMPUTE_CLRESHAPELAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLReverseKernel.cpp b/src/core/CL/kernels/CLReverseKernel.cpp index 6546ced72e..9a876258e9 100644 --- a/src/core/CL/kernels/CLReverseKernel.cpp +++ b/src/core/CL/kernels/CLReverseKernel.cpp @@ -21,16 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLReverseKernel.h" +#include "src/core/CL/kernels/CLReverseKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Window.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute diff --git a/arm_compute/core/CL/kernels/CLReverseKernel.h b/src/core/CL/kernels/CLReverseKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLReverseKernel.h rename to src/core/CL/kernels/CLReverseKernel.h index 17f1a4a20f..4a21e4f802 100644 --- a/arm_compute/core/CL/kernels/CLReverseKernel.h +++ b/src/core/CL/kernels/CLReverseKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLREVERSEKERNEL_H #define ARM_COMPUTE_CLREVERSEKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp index 2e7ee36bcb..5a7d5830fd 100644 --- a/src/core/CL/kernels/CLScaleKernel.cpp +++ b/src/core/CL/kernels/CLScaleKernel.cpp @@ -21,18 +21,19 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLScaleKernel.h" +#include "src/core/CL/kernels/CLScaleKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/CL/ICLKernel.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include "src/core/utils/ScaleUtils.h" diff --git a/arm_compute/core/CL/kernels/CLScaleKernel.h b/src/core/CL/kernels/CLScaleKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLScaleKernel.h rename to src/core/CL/kernels/CLScaleKernel.h index 79f7ed181a..a72e3938d9 100644 --- a/arm_compute/core/CL/kernels/CLScaleKernel.h +++ b/src/core/CL/kernels/CLScaleKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLSCALEKERNEL_H #define ARM_COMPUTE_CLSCALEKERNEL_H -#include "arm_compute/core/CL/ICLSimple2DKernel.h" #include "arm_compute/core/KernelDescriptors.h" +#include "src/core/CL/ICLSimple2DKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLScharr3x3Kernel.cpp b/src/core/CL/kernels/CLScharr3x3Kernel.cpp index 3172966b8f..7ceddc9626 100644 --- a/src/core/CL/kernels/CLScharr3x3Kernel.cpp +++ b/src/core/CL/kernels/CLScharr3x3Kernel.cpp @@ -21,16 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h" +#include "src/core/CL/kernels/CLScharr3x3Kernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/CL/kernels/CLScharr3x3Kernel.h b/src/core/CL/kernels/CLScharr3x3Kernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLScharr3x3Kernel.h rename to src/core/CL/kernels/CLScharr3x3Kernel.h index 1af56a764e..a670da5b6f 100644 --- a/arm_compute/core/CL/kernels/CLScharr3x3Kernel.h +++ b/src/core/CL/kernels/CLScharr3x3Kernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLSCHARR3X3KERNEL_H #define ARM_COMPUTE_CLSCHARR3X3KERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLSelectKernel.cpp b/src/core/CL/kernels/CLSelectKernel.cpp index 1244068816..53e5414c88 100644 --- a/src/core/CL/kernels/CLSelectKernel.cpp +++ b/src/core/CL/kernels/CLSelectKernel.cpp @@ -21,16 +21,17 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLSelectKernel.h" +#include "src/core/CL/kernels/CLSelectKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -123,7 +124,6 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC // Set build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(x->info()->data_type())); - build_opts.add_option("-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(x->info()->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); // Create kernel diff --git a/arm_compute/core/CL/kernels/CLSelectKernel.h b/src/core/CL/kernels/CLSelectKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLSelectKernel.h rename to src/core/CL/kernels/CLSelectKernel.h index 4015a273ea..93ae27f444 100644 --- a/arm_compute/core/CL/kernels/CLSelectKernel.h +++ b/src/core/CL/kernels/CLSelectKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLSELECTKERNEL_H #define ARM_COMPUTE_CLSELECTKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLSobel3x3Kernel.cpp b/src/core/CL/kernels/CLSobel3x3Kernel.cpp index 86dcf22258..a87677a38f 100644 --- a/src/core/CL/kernels/CLSobel3x3Kernel.cpp +++ b/src/core/CL/kernels/CLSobel3x3Kernel.cpp @@ -21,16 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h" +#include "src/core/CL/kernels/CLSobel3x3Kernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/arm_compute/core/CL/kernels/CLSobel3x3Kernel.h b/src/core/CL/kernels/CLSobel3x3Kernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLSobel3x3Kernel.h rename to src/core/CL/kernels/CLSobel3x3Kernel.h index e24767852e..fed8068762 100644 --- a/arm_compute/core/CL/kernels/CLSobel3x3Kernel.h +++ b/src/core/CL/kernels/CLSobel3x3Kernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLSOBEL3X3KERNEL_H #define ARM_COMPUTE_CLSOBEL3X3KERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLSobel5x5Kernel.cpp b/src/core/CL/kernels/CLSobel5x5Kernel.cpp index e010fdda75..c450becd1d 100644 --- a/src/core/CL/kernels/CLSobel5x5Kernel.cpp +++ b/src/core/CL/kernels/CLSobel5x5Kernel.cpp @@ -21,16 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h" +#include "src/core/CL/kernels/CLSobel5x5Kernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/arm_compute/core/CL/kernels/CLSobel5x5Kernel.h b/src/core/CL/kernels/CLSobel5x5Kernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLSobel5x5Kernel.h rename to src/core/CL/kernels/CLSobel5x5Kernel.h index 82831ed14d..a163ac932a 100644 --- a/arm_compute/core/CL/kernels/CLSobel5x5Kernel.h +++ b/src/core/CL/kernels/CLSobel5x5Kernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLSOBEL5X5KERNEL_H #define ARM_COMPUTE_CLSOBEL5X5KERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLSobel7x7Kernel.cpp b/src/core/CL/kernels/CLSobel7x7Kernel.cpp index c2b4bec494..1cfa74f7b3 100644 --- a/src/core/CL/kernels/CLSobel7x7Kernel.cpp +++ b/src/core/CL/kernels/CLSobel7x7Kernel.cpp @@ -21,16 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h" +#include "src/core/CL/kernels/CLSobel7x7Kernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/arm_compute/core/CL/kernels/CLSobel7x7Kernel.h b/src/core/CL/kernels/CLSobel7x7Kernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLSobel7x7Kernel.h rename to src/core/CL/kernels/CLSobel7x7Kernel.h index d55993d1f0..c85f0aedf9 100644 --- a/arm_compute/core/CL/kernels/CLSobel7x7Kernel.h +++ b/src/core/CL/kernels/CLSobel7x7Kernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLSOBEL7X7KERNEL_H #define ARM_COMPUTE_CLSOBEL7X7KERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp index c7881b9f5f..526d9e187d 100644 --- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp +++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp @@ -21,27 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" +#include "src/core/CL/kernels/CLSoftmaxLayerKernel.h" + #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" -#include -#include - -using namespace arm_compute; - +namespace arm_compute +{ namespace { /** Calculates softmax parameters from the quantized input scale and scaling factor for the exponent and places them as build options. @@ -152,59 +141,6 @@ Status validate_arguments_1DNorm(const ITensorInfo *input, const ITensorInfo *su return Status{}; } - -// Window validation - -std::pair validate_and_configure_window_1DMaxShiftExpSum(ITensorInfo *input, ITensorInfo *max, ITensorInfo *output, ITensorInfo *sum) -{ - // Output auto initialization if not yet initialized - auto_init_if_empty(*sum, input->clone()->set_tensor_shape(max->tensor_shape())); - auto_init_if_empty(*output, *input->clone()); - - CLLogits1DMaxShiftExpSumKernel::ParallelReductionInfo parallel_reduction_info = CLLogits1DMaxShiftExpSumKernel::is_parallel_reduction(input->dimension(0)); - unsigned int vector_size = std::get<1>(parallel_reduction_info); - const unsigned int num_elems_x = ceil_to_multiple(input->tensor_shape().x(), vector_size); - Window win = calculate_max_window(*input, Steps(num_elems_x)); - - AccessWindowHorizontal input_access(input, 0, num_elems_x); - AccessWindowHorizontal max_access(max, 0, 1); - AccessWindowHorizontal output_access(output, 0, num_elems_x); - AccessWindowHorizontal sum_access(sum, 0, 1); - - bool window_changed = update_window_and_padding(win, input_access, max_access, output_access, sum_access); - - output_access.set_valid_region(win, input->valid_region()); - sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->tensor_shape())); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} - -std::pair validate_and_configure_window_1DNorm(ITensorInfo *input, ITensorInfo *output, ITensorInfo *sum, const SoftmaxKernelInfo &info) -{ - const DataType output_data_type = info.input_data_type; - const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log); - - // Output auto initialization if not yet initialized - auto_init_if_empty(*output, - input->clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info)); - - constexpr unsigned int num_elems_processed_per_iteration = 16; - - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - AccessWindowStatic sum_access(sum, 0, 0, 1, sum->dimension(1)); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win, input_access, sum_access, output_access); - - output_access.set_valid_region(win, input->valid_region()); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} - } // namespace /**< Grid size (obtained through auto-tuning) */ @@ -228,6 +164,8 @@ void CLLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_c { ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, sum, output); + auto padding_info = get_padding_info({ input, max, output, sum }); + // Output auto initialization if not yet initialized auto_init_if_empty(*sum->info(), input->info()->clone()->set_tensor_shape(max->info()->tensor_shape())); auto_init_if_empty(*output->info(), *input->info()->clone()); @@ -247,30 +185,31 @@ void CLLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_c const auto is_signed_qasymm8 = is_data_type_quantized_asymmetric_signed(info.input_data_type); const int min_value = is_signed_qasymm8 ? CL_SCHAR_MIN : 0; + ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(reduction_dim_size); + const unsigned int vector_size = adjust_vec_size(std::get<1>(parallel_reduction_info), reduction_dim_size); + // Set build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)); build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(min_value)); + build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size)); + build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(reduction_dim_size)); + build_opts.add_option("-DVECTOR_SIZE_LEFTOVER=" + support::cpp11::to_string(reduction_dim_size % vector_size)); + build_opts.add_option("-DLOG_VECTOR_SIZE=" + support::cpp11::to_string(lround(log2(vector_size)))); + build_opts.add_option_if((reduction_dim_size % vector_size) != 0, "-DNON_MULTIPLE_OF_VECTOR_SIZE"); build_opts.add_option_if(is_signed_qasymm8, "-DQASYMM8_SIGNED"); - build_opts.add_option_if(dt == DataType::F16, "-DUSE_F16"); build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + float_to_string_with_full_precision(beta)); + build_opts.add_option_if(is_data_type_float(dt) && info.is_log, "-DLOG_SOFTMAX"); + build_opts.add_option_if(is_data_type_float(dt), "-DMINVAL=" + ((dt == DataType::F16) ? 
std::string("-HALF_MAX") : std::string("-FLT_MAX"))); build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), prepare_quantized_softmax_build_options(qinfo.scale, beta).options()); - build_opts.add_option_if(info.is_log, "-DLOG_SOFTMAX"); cl::NDRange lws_hint(cl::NullRange); - std::string kernel_name = is_data_type_quantized_asymmetric(dt) ? std::string("softmax_layer_max_shift_exp_sum_quantized_serial") : - std::string("softmax_layer_max_shift_exp_sum_serial"); - ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(reduction_dim_size); - unsigned int vector_size = std::get<1>(parallel_reduction_info); - - build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size)); - build_opts.add_option("-DLOG_VECTOR_SIZE=" + support::cpp11::to_string(lround(log2(vector_size)))); - build_opts.add_option_if((reduction_dim_size % vector_size) != 0, "-DNON_MULTIPLE_OF_VECTOR_SIZE"); + std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_") + (is_data_type_quantized_asymmetric(dt) ? "quantized_" : ""); // Configure parallel kernel if needed if(std::get<0>(parallel_reduction_info)) { - kernel_name = is_data_type_quantized_asymmetric(dt) ? std::string("softmax_layer_max_shift_exp_sum_quantized_parallel") : std::string("softmax_layer_max_shift_exp_sum_parallel"); + kernel_name += "parallel"; bool is_grid_size_pow2 = (_grid_size != 0) && ((_grid_size & (_grid_size - 1)) == 0); build_opts.add_option_if(is_grid_size_pow2 && _grid_size <= 256, "-DGRID_SIZE=" + support::cpp11::to_string(_grid_size)); @@ -281,25 +220,24 @@ void CLLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_c // A single workgroup performs reduction in dimension 0 in the parallel case, hence lws[0]==gws[0]. lws_hint = cl::NDRange(_grid_size); } + else + { + kernel_name += "serial"; + } // Create kernel. _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - // Set static arguments. Both the kernels use the same arguments - unsigned int idx = 4 * num_arguments_per_3D_tensor(); //Skip the input and output parameters - _kernel.setArg(idx++, reduction_dim_size); - // Configure window - auto win_config = validate_and_configure_window_1DMaxShiftExpSum(input->info(), max->info(), output->info(), sum->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second, lws_hint); + Window win = calculate_max_window(*(input->info()), Steps(reduction_dim_size)); + ICLKernel::configure_internal(win, lws_hint); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLLogits1DMaxShiftExpSumKernel::validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DMaxShiftExpSum(input, max, output, sum)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_1DMaxShiftExpSum(input->clone().get(), max->clone().get(), output->clone().get(), sum->clone().get()).first); - return Status{}; } @@ -322,9 +260,8 @@ void CLLogits1DMaxShiftExpSumKernel::run(const Window &window, cl::CommandQueue ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(_input->info()->dimension(0)); if(std::get<0>(parallel_reduction_info)) { - // To launch grid_size parallel workitems, steps.x should be modified as follows. 
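
Annotation: the reworked softmax option block above is mostly modular arithmetic over the reduction width. A self-contained demo of the values it would emit for an assumed row width of 197; the adjust_vec_size() behaviour is an assumption here, since its definition is not part of this diff:

```cpp
#include <cmath>
#include <cstdio>

// Assumed behaviour of adjust_vec_size(): halve the preferred OpenCL
// vector width until it no longer exceeds the innermost dimension.
unsigned int adjust_vec_size_sketch(unsigned int preferred, unsigned int dim0)
{
    unsigned int vec = preferred;
    while(vec > 1 && vec > dim0)
    {
        vec /= 2;
    }
    return vec;
}

int main()
{
    const unsigned int src_width   = 197;                                    // example row width
    const unsigned int vector_size = adjust_vec_size_sketch(16, src_width);  // stays 16 here
    std::printf("-DVECTOR_SIZE=%u\n", vector_size);
    std::printf("-DSRC_WIDTH=%u\n", src_width);
    std::printf("-DVECTOR_SIZE_LEFTOVER=%u\n", src_width % vector_size);     // 197 % 16 = 5
    std::printf("-DLOG_VECTOR_SIZE=%ld\n", std::lround(std::log2(vector_size))); // log2(16) = 4
    if(src_width % vector_size != 0)
    {
        std::puts("-DNON_MULTIPLE_OF_VECTOR_SIZE");
    }
    return 0;
}
```
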
- const unsigned int step = std::get<1>(parallel_reduction_info); - window_collapsed.set(Window::DimX, Window::Dimension(0, _grid_size * step, step)); + // Launch grid_size parallel work items + window_collapsed.set(Window::DimX, Window::Dimension(0, _grid_size, 1)); } // Get slices @@ -356,6 +293,8 @@ void CLLogits1DNormKernel::configure(const CLCompileContext &compile_context, co { ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output); + auto padding_info = get_padding_info({ input, output, sum }); + // Note: output should always have a scale of 1/256 and offset 0 const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type); const DataType output_data_type = info.input_data_type; @@ -373,32 +312,35 @@ void CLLogits1DNormKernel::configure(const CLCompileContext &compile_context, co _sum = sum; _output = output; - const auto is_signed_qasymm8 = is_data_type_quantized_asymmetric_signed(info.input_data_type); - const int min_value = is_signed_qasymm8 ? CL_SCHAR_MIN : 0; + const auto is_signed_qasymm8 = is_data_type_quantized_asymmetric_signed(info.input_data_type); + const int min_value = is_signed_qasymm8 ? CL_SCHAR_MIN : 0; + const unsigned int vector_size = adjust_vec_size(16, input->info()->dimension(0)); // Set build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(info.input_data_type)); build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(min_value)); + build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size)); + build_opts.add_option("-DVECTOR_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % vector_size)); build_opts.add_option_if(is_data_type_quantized_asymmetric_signed(info.input_data_type), "-DQASYMM8_SIGNED"); build_opts.add_options_if(is_quantized_asymmetric, prepare_quantized_softmax_build_options(qinfo.scale, info.beta).options()); build_opts.add_option_if(info.is_log, "-DLOG_SOFTMAX"); // Create kernel - std::string kernel_name = is_quantized_asymmetric ? "softmax_layer_norm_quantized" : "softmax_layer_norm"; + std::string kernel_name = std::string("softmax_layer_norm") + (is_quantized_asymmetric ? 
"_quantized" : ""); _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Configure window - auto win_config = validate_and_configure_window_1DNorm(input->info(), output->info(), sum->info(), info); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); + auto win = calculate_max_window(*(input->info()), Steps(vector_size)); + ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLLogits1DNormKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, const SoftmaxKernelInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DNorm(input, sum, output, info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_1DNorm(input->clone().get(), output->clone().get(), sum->clone().get(), info).first); return Status{}; } @@ -425,3 +367,4 @@ void CLLogits1DNormKernel::run(const Window &window, cl::CommandQueue &queue) } while(window_collapsed.slide_window_slice_3D(slice)); } +} // namespace arm_compute \ No newline at end of file diff --git a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h b/src/core/CL/kernels/CLSoftmaxLayerKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h rename to src/core/CL/kernels/CLSoftmaxLayerKernel.h index f8c1019d53..29e0f63e46 100644 --- a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h +++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H #define ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H -#include "arm_compute/core/CL/ICLSimple3DKernel.h" #include "arm_compute/core/KernelDescriptors.h" +#include "src/core/CL/ICLSimple3DKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp index 3e0ac74f69..7af0071025 100644 --- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp +++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp @@ -21,12 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h" +#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" using namespace arm_compute::misc::shape_calculator; @@ -35,15 +37,16 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *padddings, const ITensorInfo *output) +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, padddings, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(padddings->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(padddings->tensor_shape()[1] != block_info->tensor_shape()[0]); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{ 2 }); + ARM_COMPUTE_RETURN_ERROR_ON(paddings->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{ 2, 2 }); // Validate output if initialized if(output->total_size() != 0) @@ -52,6 +55,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] != output->tensor_shape()[idx_channel]); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); } return Status{}; @@ -60,22 +64,15 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1); + ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); + ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1); // Validate output if initialized if(output->total_size() != 0) { - const DataLayout data_layout = input->data_layout(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - const int idx_batch = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] < padding_left.x() + padding_right.y()); - ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_width] + padding_left.x() + padding_right.x()) % block_shape_x != 0); - ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] + padding_left.y() + 
padding_right.y()) % block_shape_y != 0); - ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] != output->tensor_shape()[idx_channel]); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0); + TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(input, block_shape_x, block_shape_y, padding_left, padding_right); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); } @@ -96,7 +93,7 @@ void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const ICLTenso void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info())); _input = input; diff --git a/arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h similarity index 77% rename from arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h rename to src/core/CL/kernels/CLSpaceToBatchLayerKernel.h index 93221f7b5a..4817cfeef2 100644 --- a/arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h +++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLSPACETOBATCHLAYERKERNEL_H #define ARM_COMPUTE_CLSPACETOBATCHLAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { @@ -50,8 +50,8 @@ class CLSpaceToBatchLayerKernel : public ICLKernel /** Initialise the kernel's inputs and output. * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. - * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32 - * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32 + * @param[in] block_shape 1-D tensor with shape [M]. Supported M: 2. Data types supported: S32 + * @param[in] paddings 2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32 * @param[out] output Tensor output. Data types supported: same as @p input */ void configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output); @@ -59,8 +59,8 @@ class CLSpaceToBatchLayerKernel : public ICLKernel * * @param[in] compile_context The compile context to be used. * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. - * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32 - * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32 + * @param[in] block_shape 1-D tensor with shape [M]. Supported M: 2. Data types supported: S32 + * @param[in] paddings 2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32 * @param[out] output Tensor output. 
Data types supported: same as @p input */ void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output); @@ -69,8 +69,8 @@ class CLSpaceToBatchLayerKernel : public ICLKernel * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. * @param[in] block_shape_x Block shape x value. * @param[in] block_shape_y Block shape y value. - * @param[in] padding_left The left padding of the output tensor. - * @param[in] padding_right The right padding of the output tensor. + * @param[in] padding_left The padding at the beginning of every dimension of the output tensor. + * @param[in] padding_right The padding at the end of every dimension of the output tensor. * @param[out] output Tensor output. Data types supported: same as @p input */ void configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output); @@ -80,16 +80,17 @@ class CLSpaceToBatchLayerKernel : public ICLKernel * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. * @param[in] block_shape_x Block shape x value. * @param[in] block_shape_y Block shape y value. - * @param[in] padding_left The left padding of the output tensor. - * @param[in] padding_right The right padding of the output tensor. + * @param[in] padding_left The padding at the beginning of every dimension of the output tensor. + * @param[in] padding_right The padding at the end of every dimension of the output tensor. * @param[out] output Tensor output. Data types supported: same as @p input */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output); + void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, + ICLTensor *output); /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. - * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32 - * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32 + * @param[in] block_shape 1-D tensor with shape [M]. Supported M: 2. Data types supported: S32 + * @param[in] paddings 2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32 * @param[in] output Tensor output. Data types supported: same as @p input * * @return a status @@ -100,8 +101,8 @@ class CLSpaceToBatchLayerKernel : public ICLKernel * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. * @param[in] block_shape_x Block shape x value. * @param[in] block_shape_y Block shape y value. - * @param[in] padding_left The left padding of the output tensor. - * @param[in] padding_right The right padding of the output tensor. + * @param[in] padding_left The padding at the beginning of every dimension of the output tensor. + * @param[in] padding_right The padding at the end of every dimension of the output tensor. * @param[in] output Tensor output. 
Data types supported: same as @p input * * @return a status @@ -113,8 +114,8 @@ class CLSpaceToBatchLayerKernel : public ICLKernel private: const ICLTensor *_input; /**< Source tensor */ - const ICLTensor *_block_shape; /**< Block shape tensor */ - const ICLTensor *_paddings; /**< Paddings tensor */ + const ICLTensor *_block_shape; /**< Block shape tensor for dynamic evaluation */ + const ICLTensor *_paddings; /**< Paddings tensor for dynamic evaluation */ ICLTensor *_output; /**< Destination tensor */ }; } // namespace arm_compute diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp index 877d42681f..1c648e0944 100644 --- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp +++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp @@ -21,12 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h" +#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" using namespace arm_compute::misc::shape_calculator; @@ -75,7 +77,7 @@ void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_contex { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape = compute_depth_to_space_shape(input->info(), block_shape); + TensorShape output_shape = compute_space_to_depth_shape(input->info(), block_shape); auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape)); diff --git a/arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h rename to src/core/CL/kernels/CLSpaceToDepthLayerKernel.h index af0aa12598..bb1ac5f9a6 100644 --- a/arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h +++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLSPACETODEPTHLAYERKERNEL_H #define ARM_COMPUTE_CLSPACETODEPTHLAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLStackLayerKernel.cpp b/src/core/CL/kernels/CLStackLayerKernel.cpp index c283c440a3..9bdcc8dc3f 100644 --- a/src/core/CL/kernels/CLStackLayerKernel.cpp +++ b/src/core/CL/kernels/CLStackLayerKernel.cpp @@ -21,17 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
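// Illustrative sketch, not part of the patch: the CLSpaceToDepthLayerKernel hunk
// above fixes a real bug -- the output used to be auto-initialised with
// compute_depth_to_space_shape(), the inverse transform. A minimal model of the
// two shape computations (Shape4D and both helpers are hypothetical stand-ins
// for the library's TensorShape / shape_calculator functions):
#include <cassert>

struct Shape4D { unsigned int w, h, c, n; };

// space_to_depth: spatial dims shrink by the block size, channels grow by block^2.
static Shape4D space_to_depth_shape(const Shape4D &in, unsigned int block)
{
    return { in.w / block, in.h / block, in.c * block * block, in.n };
}

// depth_to_space is the inverse; using it here (the old code path) yields an
// output whose W/H are block times too large and whose channel count is too small.
static Shape4D depth_to_space_shape(const Shape4D &in, unsigned int block)
{
    return { in.w * block, in.h * block, in.c / (block * block), in.n };
}

int main()
{
    const Shape4D in{ 8, 8, 3, 1 };
    const Shape4D out = space_to_depth_shape(in, 2);
    assert(out.w == 4 && out.h == 4 && out.c == 12 && out.n == 1);
    const Shape4D back = depth_to_space_shape(out, 2);   // round-trips to the input shape
    assert(back.w == in.w && back.h == in.h && back.c == in.c);
    return 0;
}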
*/ -#include "arm_compute/core/CL/kernels/CLStackLayerKernel.h" +#include "src/core/CL/kernels/CLStackLayerKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" diff --git a/arm_compute/core/CL/kernels/CLStackLayerKernel.h b/src/core/CL/kernels/CLStackLayerKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLStackLayerKernel.h rename to src/core/CL/kernels/CLStackLayerKernel.h index cfefcd97dd..2865127a90 100644 --- a/arm_compute/core/CL/kernels/CLStackLayerKernel.h +++ b/src/core/CL/kernels/CLStackLayerKernel.h @@ -25,8 +25,8 @@ #ifndef ARM_COMPUTE_CLSTACKLAYERKERNEL_H #define ARM_COMPUTE_CLSTACKLAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp index f7b7290a3f..c87fcb9765 100644 --- a/src/core/CL/kernels/CLStridedSliceKernel.cpp +++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp @@ -21,13 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h" +#include "src/core/CL/kernels/CLStridedSliceKernel.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/utils/helpers/bit_ops.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" -#include "arm_compute/core/utils/misc/Cast.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/bit_ops.h" +#include "support/Cast.h" #include "support/StringSupport.h" namespace arm_compute diff --git a/arm_compute/core/CL/kernels/CLStridedSliceKernel.h b/src/core/CL/kernels/CLStridedSliceKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLStridedSliceKernel.h rename to src/core/CL/kernels/CLStridedSliceKernel.h index 74311b71fa..599cf34c39 100644 --- a/arm_compute/core/CL/kernels/CLStridedSliceKernel.h +++ b/src/core/CL/kernels/CLStridedSliceKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H #define ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" #include diff --git a/src/core/CL/kernels/CLTableLookupKernel.cpp b/src/core/CL/kernels/CLTableLookupKernel.cpp index 3b8ca60ab1..b82f4c9889 100644 --- a/src/core/CL/kernels/CLTableLookupKernel.cpp +++ b/src/core/CL/kernels/CLTableLookupKernel.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h" +#include "src/core/CL/kernels/CLTableLookupKernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLLut.h" diff --git a/arm_compute/core/CL/kernels/CLTableLookupKernel.h b/src/core/CL/kernels/CLTableLookupKernel.h similarity index 97% rename from arm_compute/core/CL/kernels/CLTableLookupKernel.h rename to src/core/CL/kernels/CLTableLookupKernel.h index 9f1d28c47a..c8d15cbee2 100644 --- a/arm_compute/core/CL/kernels/CLTableLookupKernel.h +++ b/src/core/CL/kernels/CLTableLookupKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLTABLELOOKUPKERNEL_H #define ARM_COMPUTE_CLTABLELOOKUPKERNEL_H -#include "arm_compute/core/CL/ICLSimple2DKernel.h" +#include "src/core/CL/ICLSimple2DKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLThresholdKernel.cpp b/src/core/CL/kernels/CLThresholdKernel.cpp index de81644edd..72c22f043c 100644 --- a/src/core/CL/kernels/CLThresholdKernel.cpp +++ b/src/core/CL/kernels/CLThresholdKernel.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLThresholdKernel.h" +#include "src/core/CL/kernels/CLThresholdKernel.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" diff --git a/arm_compute/core/CL/kernels/CLThresholdKernel.h b/src/core/CL/kernels/CLThresholdKernel.h similarity index 97% rename from arm_compute/core/CL/kernels/CLThresholdKernel.h rename to src/core/CL/kernels/CLThresholdKernel.h index 7e01fd6aaa..511eaed1bf 100644 --- a/arm_compute/core/CL/kernels/CLThresholdKernel.h +++ b/src/core/CL/kernels/CLThresholdKernel.h @@ -24,9 +24,9 @@ #ifndef ARM_COMPUTE_CLTHRESHOLDKERNEL_H #define ARM_COMPUTE_CLTHRESHOLDKERNEL_H -#include "arm_compute/core/CL/ICLSimple2DKernel.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLSimple2DKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLTileKernel.cpp b/src/core/CL/kernels/CLTileKernel.cpp index bba152530c..c0c3d2e2ee 100644 --- a/src/core/CL/kernels/CLTileKernel.cpp +++ b/src/core/CL/kernels/CLTileKernel.cpp @@ -21,10 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLTileKernel.h" +#include "src/core/CL/kernels/CLTileKernel.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute diff --git a/arm_compute/core/CL/kernels/CLTileKernel.h b/src/core/CL/kernels/CLTileKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLTileKernel.h rename to src/core/CL/kernels/CLTileKernel.h index 56e1df8de3..41752ca90b 100644 --- a/arm_compute/core/CL/kernels/CLTileKernel.h +++ b/src/core/CL/kernels/CLTileKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLTILEKERNEL_H #define ARM_COMPUTE_CLTILEKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp index a47d956620..8d967e901f 100644 --- a/src/core/CL/kernels/CLTransposeKernel.cpp +++ b/src/core/CL/kernels/CLTransposeKernel.cpp @@ -21,20 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLTransposeKernel.h" +#include "src/core/CL/kernels/CLTransposeKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/AccessWindowTranspose.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/AccessWindowTranspose.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/CL/kernels/CLTransposeKernel.h b/src/core/CL/kernels/CLTransposeKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLTransposeKernel.h rename to src/core/CL/kernels/CLTransposeKernel.h index 4a9887f2cf..0c4b7b4aff 100644 --- a/arm_compute/core/CL/kernels/CLTransposeKernel.h +++ b/src/core/CL/kernels/CLTransposeKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLTRANSPOSEKERNEL_H #define ARM_COMPUTE_CLTRANSPOSEKERNEL_H -#include "arm_compute/core/CL/ICLSimple2DKernel.h" +#include "src/core/CL/ICLSimple2DKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp index 101055001c..acb2fbcd04 100644 --- a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp +++ b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp @@ -21,18 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h" +#include "src/core/CL/kernels/CLUpsampleLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute diff --git a/arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h b/src/core/CL/kernels/CLUpsampleLayerKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h rename to src/core/CL/kernels/CLUpsampleLayerKernel.h index b523b97233..f90ee07bf4 100644 --- a/arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h +++ b/src/core/CL/kernels/CLUpsampleLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLUPSAMPLELAYERKERNEL_H #define ARM_COMPUTE_CLUPSAMPLELAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLWarpAffineKernel.cpp b/src/core/CL/kernels/CLWarpAffineKernel.cpp index e8da803628..600c67a528 100644 --- a/src/core/CL/kernels/CLWarpAffineKernel.cpp +++ b/src/core/CL/kernels/CLWarpAffineKernel.cpp @@ -21,9 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h" +#include "src/core/CL/kernels/CLWarpAffineKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -32,6 +31,8 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/arm_compute/core/CL/kernels/CLWarpAffineKernel.h b/src/core/CL/kernels/CLWarpAffineKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLWarpAffineKernel.h rename to src/core/CL/kernels/CLWarpAffineKernel.h index 440febab96..c600ee780d 100644 --- a/arm_compute/core/CL/kernels/CLWarpAffineKernel.h +++ b/src/core/CL/kernels/CLWarpAffineKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLWARPAFFINEKERNEL_H #define ARM_COMPUTE_CLWARPAFFINEKERNEL_H -#include "arm_compute/core/CL/ICLSimple2DKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLSimple2DKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp index dc7c359849..5f20a0bdd3 100644 --- a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp +++ b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp @@ -21,9 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h" +#include "src/core/CL/kernels/CLWarpPerspectiveKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -32,6 +31,8 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h b/src/core/CL/kernels/CLWarpPerspectiveKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h rename to src/core/CL/kernels/CLWarpPerspectiveKernel.h index 6614989059..dcbe1c5560 100644 --- a/arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h +++ b/src/core/CL/kernels/CLWarpPerspectiveKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_CLWARPERSPECTIVEKERNEL_H #define ARM_COMPUTE_CLWARPERSPECTIVEKERNEL_H -#include "arm_compute/core/CL/ICLSimple2DKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLSimple2DKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp index 267957e51a..559f47ce26 100644 --- a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp +++ b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp @@ -21,10 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h" +#include "src/core/CL/kernels/CLWeightsReshapeKernel.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" namespace arm_compute @@ -86,6 +88,8 @@ void CLWeightsReshapeKernel::configure(const CLCompileContext &compile_context, (biases != nullptr) ? 
biases->info() : nullptr, output->info(), num_groups)); + auto padding_info = get_padding_info({ input, biases, output }); + const DataType data_type = input->info()->data_type(); _biases = biases; @@ -106,6 +110,8 @@ void CLWeightsReshapeKernel::configure(const CLCompileContext &compile_context, // The CLWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLWeightsReshapeKernel::validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups) diff --git a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h b/src/core/CL/kernels/CLWeightsReshapeKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h rename to src/core/CL/kernels/CLWeightsReshapeKernel.h index c74255bac0..402a60472b 100644 --- a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h +++ b/src/core/CL/kernels/CLWeightsReshapeKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H #define ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp index 76100c2a63..d6697ba46b 100644 --- a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp +++ b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp @@ -21,18 +21,17 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h" +#include "src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/helpers/tensor_info.h" -#include "arm_compute/core/utils/misc/Cast.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/tensor_info.h" +#include "support/Cast.h" #include "support/StringSupport.h" @@ -40,25 +39,6 @@ namespace arm_compute { namespace { -constexpr unsigned int num_elems_processed_per_iteration = 8; - -std::pair validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output) -{ - // The window needs to be based on the output - Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration), input1->dimension(1)); - const unsigned int input2_right_padding = ((output->dimension(0) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration - input1->dimension(0) - input2->dimension( - 0)) % num_elems_processed_per_iteration; - AccessWindowStatic input2_access(input2, -(input1->dimension(0) % num_elems_processed_per_iteration), - 0, input2->dimension(0) + input2_right_padding, input2->dimension(1)); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - 
bool window_changed = update_window_and_padding(win, input1_access, input2_access, output_access); - - Window win_collapsed = win.collapse(win, Window::DimZ); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win_collapsed); -} Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); @@ -81,7 +61,6 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, Status CLWidthConcatenate2TensorsKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first); return Status{}; } @@ -90,13 +69,22 @@ void CLWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, output)); + auto padding_info = get_padding_info({ input1, input2, output }); + + const unsigned int min_dimension = std::min(input1->dimension(0), input2->dimension(0)); + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension); + const unsigned int vec_size_leftover = output->dimension(0) % num_elems_processed_per_iteration; + // Add build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->dimension(2))); build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->dimension(0))); + build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(input2->dimension(0))); build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->element_size())); + build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((input1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); // If input have different quantization info set quantization parameters needed for the re-quantization process const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(output, input1, input2); @@ -118,21 +106,12 @@ void CLWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile _kernel = create_kernel(compile_context, "concatenate_width_x2", build_opts.options()); // Configure kernel window - auto win_config = validate_and_configure_window(input1, input2, output); - ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); - - ICLKernel::configure_internal(std::get<1>(win_config)); + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win.collapse(win, Window::DimZ)); // Set output valid region output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); - - // Pass paddings as arguments to the kernel - const unsigned int input1_width = input1->dimension(0); - const unsigned int input1_right_padding = ceil_to_multiple(input1_width, num_elems_processed_per_iteration) - input1_width; - const unsigned int 
input2_left_padding = input1_width % num_elems_processed_per_iteration; - unsigned int idx0 = 3 * num_arguments_per_4D_tensor(); - _kernel.setArg(idx0++, input1_right_padding); - _kernel.setArg(idx0++, input2_left_padding); + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); // Set config_id for enabling LWS tuning _config_id = "concatenate_width_x2_"; diff --git a/arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h similarity index 98% rename from arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h rename to src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h index a379b5f0b8..2af89e12eb 100644 --- a/arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h +++ b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h @@ -25,8 +25,8 @@ #ifndef ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H #define ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp index 0377eb76b1..7ecdd30224 100644 --- a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp +++ b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp @@ -21,19 +21,17 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h" +#include "src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/helpers/tensor_info.h" -#include "arm_compute/core/utils/misc/Cast.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/tensor_info.h" +#include "support/Cast.h" #include "support/StringSupport.h" @@ -41,41 +39,6 @@ namespace arm_compute { namespace { -constexpr unsigned int num_elems_processed_per_iteration = 8; - -std::pair validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *input3, ITensorInfo *input4, ITensorInfo *output) -{ - const unsigned int input1_width = input1->dimension(0); - const unsigned int input2_width = input2->dimension(0); - const unsigned int input3_width = input3->dimension(0); - const unsigned int input4_width = input4->dimension(0); - - // The window needs to be based on the output - Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1_width, num_elems_processed_per_iteration), input1->dimension(1)); - - const unsigned int input2_left_padding = input1_width % num_elems_processed_per_iteration; - const unsigned int input2_right_padding = ((input1_width + input2_width) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration - input1_width + num_elems_processed_per_iteration - - input2_width; - AccessWindowStatic input2_access(input2, -input2_left_padding, 0, input2_width + input2_right_padding, 
input2->dimension(1)); - - const unsigned int input3_left_padding = (input1_width + input2_width) % num_elems_processed_per_iteration; - const unsigned int input3_right_padding = ((input1_width + input2_width + input3_width) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration - input1_width - input2_width + - num_elems_processed_per_iteration - input3_width; - AccessWindowStatic input3_access(input3, -input3_left_padding, 0, input3_width + input3_right_padding, input3->dimension(1)); - - const unsigned int input4_left_padding = (input1_width + input2_width + input3_width) % num_elems_processed_per_iteration; - const unsigned int input4_right_padding = (output->dimension(0) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration + num_elems_processed_per_iteration - output->dimension(0); - AccessWindowStatic input4_access(input4, -input4_left_padding, 0, input4_width + input4_right_padding, input4->dimension(1)); - - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - bool window_changed = update_window_and_padding(win, input1_access, input2_access, input3_access, input4_access, output_access); - - Window win_collapsed = win.collapse(win, Window::DimZ); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win_collapsed); -} Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, input3, input4, output); @@ -104,7 +67,6 @@ CLWidthConcatenate4TensorsKernel::CLWidthConcatenate4TensorsKernel() Status CLWidthConcatenate4TensorsKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, input3, input4, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), input3->clone().get(), input4->clone().get(), output->clone().get()).first); return Status{}; } @@ -116,15 +78,25 @@ void CLWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, input3, input4, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, input3, input4, output)); + auto padding_info = get_padding_info({ input1, input2, input3, input4, output }); + const unsigned int min_dimension = std::min(std::min(input1->dimension(0), input2->dimension(0)), std::min(input3->dimension(0), input4->dimension(0))); + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension); + const unsigned int vec_size_leftover = output->dimension(0) % num_elems_processed_per_iteration; + // Add build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->dimension(2))); build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->dimension(0))); build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(input2->dimension(0))); 
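// Illustrative sketch, not part of the patch: the width-concatenation kernels
// above drop the fixed 8-element windows (and the padded reads they required)
// in favour of a vector size adjusted at configure time, with the tail handled
// via VEC_SIZE_LEFTOVER and per-input ROTATE_N offsets. adjust_vec_size_sketch()
// is an assumption -- it only clamps to the smallest width, while the library
// helper may also snap to a supported OpenCL vector size. The widths are
// hypothetical.
#include <algorithm>
#include <cstdio>

static unsigned int adjust_vec_size_sketch(unsigned int preferred, unsigned int min_dim)
{
    return std::min(preferred, min_dim);
}

int main()
{
    const unsigned int w1 = 19, w2 = 13;     // hypothetical input widths
    const unsigned int out_w = w1 + w2;      // width of the concatenated output
    const unsigned int vec = adjust_vec_size_sketch(8, std::min(w1, w2));
    const unsigned int leftover = out_w % vec;            // -> VEC_SIZE_LEFTOVER
    // ROTATE_N mirrors the build options above: the offset of each input's
    // first element within a vector-sized block of the output, so loads can be
    // rotated instead of relying on padded accesses.
    const unsigned int rot1 = (w1 - leftover) % vec;      // -> INPUT1_ROTATE_N
    const unsigned int rot2 = (w1 + w2 - leftover) % vec; // -> INPUT2_ROTATE_N
    std::printf("vec=%u leftover=%u rot1=%u rot2=%u\n", vec, leftover, rot1, rot2);
    return 0;
}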
build_opts.add_option("-DINPUT3_WIDTH=" + support::cpp11::to_string(input3->dimension(0))); + build_opts.add_option("-DINPUT4_WIDTH=" + support::cpp11::to_string(input4->dimension(0))); build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->element_size())); + build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((input1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); + build_opts.add_option("-DINPUT2_ROTATE_N=" + support::cpp11::to_string((input1->dimension(0) + input2->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); + build_opts.add_option("-DINPUT3_ROTATE_N=" + support::cpp11::to_string((input1->dimension(0) + input2->dimension(0) + input3->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); // If input have different quantization info set quantization parameters needed for the re-quantization process const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(output, input1, input2, input3, input4); @@ -152,34 +124,12 @@ void CLWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile _kernel = create_kernel(compile_context, "concatenate_width_x4", build_opts.options()); // Configure kernel window - auto win_config = validate_and_configure_window(input1, input2, input3, input4, output); - ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); - - ICLKernel::configure_internal(std::get<1>(win_config)); + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win.collapse(win, Window::DimZ)); // Set output valid region output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); - - // Pass paddings as arguments to the kernel - const unsigned int input1_width = input1->dimension(0); - const unsigned int input2_width = input2->dimension(0); - const unsigned int input3_width = input3->dimension(0); - - const unsigned int input1_right_padding = ceil_to_multiple(input1_width, num_elems_processed_per_iteration) - input1_width; - const unsigned int input2_left_padding = input1_width % num_elems_processed_per_iteration; - const unsigned int input2_right_padding = ((input1_width + input2_width) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration - input1_width + num_elems_processed_per_iteration - - input2_width; - const unsigned int input3_left_padding = (input1_width + input2_width) % num_elems_processed_per_iteration; - const unsigned int input3_right_padding = ((input1_width + input2_width + input3_width) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration - input1_width - input2_width + - num_elems_processed_per_iteration - input3_width; - const unsigned int input4_left_padding = (input1_width + input2_width + input3_width) % num_elems_processed_per_iteration; - unsigned int idx0 = 5 * num_arguments_per_4D_tensor(); - _kernel.setArg(idx0++, input1_right_padding); - _kernel.setArg(idx0++, input2_left_padding); - _kernel.setArg(idx0++, input2_right_padding); - _kernel.setArg(idx0++, input3_left_padding); - _kernel.setArg(idx0++, input3_right_padding); - _kernel.setArg(idx0++, input4_left_padding); + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); // Set config_id for enabling LWS tuning _config_id = "concatenate_width_x4_"; diff --git a/arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h similarity index 98% rename from 
arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h rename to src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h index 6b0e8ee21d..0caf87114d 100644 --- a/arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h +++ b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h @@ -25,8 +25,8 @@ #ifndef ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H #define ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp index d40597fbb5..30d0a481bd 100644 --- a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp +++ b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp @@ -21,17 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h" +#include "src/core/CL/kernels/CLWidthConcatenateLayerKernel.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/Cast.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" #include "support/StringSupport.h" @@ -39,21 +38,6 @@ namespace arm_compute { namespace { -constexpr unsigned int num_elems_processed_per_iteration = 16; - -std::pair validate_and_configure_window(ITensorInfo *input, unsigned int width_offset, ITensorInfo *output) -{ - // The window needs to be based on input as we copy all the widths of input - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output, width_offset, num_elems_processed_per_iteration); - bool window_changed = update_window_and_padding(win, input_access, output_access); - - Window win_collapsed = win.collapse(win, Window::DimZ); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win_collapsed); -} Status validate_arguments(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); @@ -74,14 +58,12 @@ Status validate_arguments(const ITensorInfo *input, unsigned int width_offset, c } // namespace CLWidthConcatenateLayerKernel::CLWidthConcatenateLayerKernel() - : _width_offset(0) { } Status CLWidthConcatenateLayerKernel::validate(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, width_offset, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), width_offset, output->clone().get()).first); return Status{}; } @@ -90,13 +72,16 @@ void CLWidthConcatenateLayerKernel::configure(const CLCompileContext &compile_co ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, width_offset, output)); - _width_offset = width_offset; + auto padding_info = get_padding_info({ input, output }); + + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, input->dimension(0)); // Add build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(_width_offset)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(width_offset)); build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->dimension(2))); if(is_data_type_quantized_asymmetric(input->data_type()) && input->quantization_info() != output->quantization_info()) @@ -113,13 +98,13 @@ void CLWidthConcatenateLayerKernel::configure(const CLCompileContext &compile_co // Create kernel _kernel = create_kernel(compile_context, "concatenate_width", build_opts.options()); // Configure kernel window - auto win_config = validate_and_configure_window(input, width_offset, output); - ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); - - ICLKernel::configure_internal(std::get<1>(win_config)); + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win.collapse(win, Window::DimZ)); // Set output valid region output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } void CLWidthConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) diff --git a/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.h similarity index 97% rename from arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h rename to src/core/CL/kernels/CLWidthConcatenateLayerKernel.h index 32e90af404..09c3f4455d 100644 --- a/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h +++ b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.h @@ -25,8 +25,8 @@ #ifndef ARM_COMPUTE_CLWIDTHCONCATENATELAYERKERNEL_H #define ARM_COMPUTE_CLWIDTHCONCATENATELAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" 
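// Illustrative sketch, not part of the patch: across these kernels the old
// validate_and_configure_window()/update_window_and_padding() flow is replaced
// by recording tensor padding before configuration and asserting afterwards
// that it did not change (get_padding_info() / has_padding_changed() in the
// hunks above). A simplified model with hypothetical stand-in types:
#include <cassert>
#include <initializer_list>
#include <map>

struct TensorStub { unsigned int padding = 0; };   // real ITensorInfo tracks per-side padding
using PaddingSnapshot = std::map<const TensorStub *, unsigned int>;

static PaddingSnapshot snapshot_padding(std::initializer_list<const TensorStub *> tensors)
{
    PaddingSnapshot snap;
    for(const TensorStub *t : tensors)
    {
        if(t != nullptr) { snap[t] = t->padding; }   // null bias tensors are simply skipped
    }
    return snap;
}

static bool padding_changed(const PaddingSnapshot &snap)
{
    for(const auto &entry : snap)
    {
        if(entry.first->padding != entry.second) { return true; }
    }
    return false;
}

int main()
{
    TensorStub input, output;
    const PaddingSnapshot before = snapshot_padding({ &input, &output }); // like get_padding_info(...)
    // ... configure the kernel window; a padding-free kernel must leave padding untouched ...
    assert(!padding_changed(before));  // like ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info))
    return 0;
}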
namespace arm_compute { @@ -69,9 +69,6 @@ class CLWidthConcatenateLayerKernel : public ICLKernel // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - unsigned int _width_offset; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLWIDTHCONCATENATELAYERKERNEL_H */ diff --git a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp index 4a1c48a258..bd45ddb65f 100644 --- a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp +++ b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp @@ -21,12 +21,10 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h" +#include "src/core/CL/kernels/CLWinogradFilterTransformKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" @@ -36,12 +34,17 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" -using namespace arm_compute; using namespace arm_compute::misc::shape_calculator; +namespace arm_compute +{ namespace { Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) @@ -74,23 +77,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_UNUSED(output); const unsigned int num_elems_processed_per_iteration_x = input->data_layout() == DataLayout::NCHW ? input->dimension(0) : 1; const unsigned int num_elems_processed_per_iteration_y = input->dimension(1); const unsigned int num_elems_read_per_iteration_z = input->data_layout() == DataLayout::NCHW ? 1 : input->dimension(2); - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y, num_elems_read_per_iteration_z)); - bool window_changed = false; - - AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - AccessWindowStatic output_access(output, 0, 0, output->dimension(0), output->dimension(1)); - window_changed = update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape())); - + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y, num_elems_read_per_iteration_z)); Window win_collapsed = win.collapse(win, Window::DimZ); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win_collapsed); + return std::make_pair(Status{}, win_collapsed); } } // namespace @@ -112,6 +107,7 @@ void CLWinogradFilterTransformKernel::configure(const CLCompileContext &compile_ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input->info(), winograd_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), winograd_info)); + auto padding_info = get_padding_info({ input, output }); // Set build options CLBuildOptions build_opts; @@ -133,6 +129,7 @@ void CLWinogradFilterTransformKernel::configure(const CLCompileContext &compile_ auto win_config = validate_and_configure_window(input->info(), output->info()); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLWinogradFilterTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) @@ -157,3 +154,4 @@ void CLWinogradFilterTransformKernel::run(const Window &window, cl::CommandQueue add_3D_tensor_argument(idx, _output, window_out); enqueue(queue, *this, window, lws_hint()); } +} // namespace arm_compute \ No newline at end of file diff --git a/arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h b/src/core/CL/kernels/CLWinogradFilterTransformKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h rename to src/core/CL/kernels/CLWinogradFilterTransformKernel.h index b689be820f..d22fedebcd 100644 --- a/arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h +++ b/src/core/CL/kernels/CLWinogradFilterTransformKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLWINOGRADFILTERTRANSFORMKERNEL_H #define ARM_COMPUTE_CLWINOGRADFILTERTRANSFORMKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp index 6b1b86a777..695e1cbbf1 100644 --- a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp +++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp @@ -21,12 +21,10 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h" +#include "src/core/CL/kernels/CLWinogradInputTransformKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Error.h" @@ -34,6 +32,10 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" using namespace arm_compute; @@ -87,11 +89,6 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), num_elems_read_per_iteration_x, num_elems_read_per_iteration_y); window_changed = update_window_and_padding(win, input_access); } - else - { - AccessWindowStatic input_access(input, 0, -1, input->dimension(0), input->dimension(1) + 1); - window_changed = update_window_and_padding(win, input_access); - } Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); @@ -118,6 +115,8 @@ void CLWinogradInputTransformKernel::configure(const CLCompileContext &compile_c ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), winograd_info)); + auto padding_info = get_padding_info({ input, output }); + const PadStrideInfo conv_info = winograd_info.convolution_info; const Size2D output_tile_size = winograd_info.output_tile_size; const Size2D kernel_size = winograd_info.kernel_size; @@ -141,7 +140,7 @@ void CLWinogradInputTransformKernel::configure(const CLCompileContext &compile_c } else { - _border_size = BorderSize(1U, 0U, 1U, 0); + _border_size = BorderSize(); } // Compute the number of output tiles along the x and y direction of size "output_tile_size" @@ -207,6 +206,8 @@ void CLWinogradInputTransformKernel::configure(const CLCompileContext &compile_c ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second, cl::NDRange(1, 1, 8)); + ARM_COMPUTE_ERROR_ON((input->info()->data_layout() == DataLayout::NHWC) && has_padding_changed(padding_info)); + _config_id = kernel_name; _config_id += support::cpp11::to_string(input->info()->dimension(0)); _config_id += "_"; diff --git a/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h b/src/core/CL/kernels/CLWinogradInputTransformKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h rename to src/core/CL/kernels/CLWinogradInputTransformKernel.h index 4f198f034a..25301877e6 100644 --- a/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h +++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLWINOGRADINPUTTRANSFORMKERNEL_H #define ARM_COMPUTE_CLWINOGRADINPUTTRANSFORMKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp index 19f61b19b3..2018559f60 100644 --- 
a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp +++ b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp @@ -21,12 +21,10 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h" +#include "src/core/CL/kernels/CLWinogradOutputTransformKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" @@ -36,6 +34,10 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" @@ -94,35 +96,22 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, const Size2D &output_tile_size) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_UNUSED(bias); constexpr unsigned int num_elems_processed_per_iteration = 1; Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); bool window_changed = false; - int output_static_window_end_x = 0; - int output_static_window_end_y = 0; - if(output->data_layout() == DataLayout::NCHW) { - output_static_window_end_x = ceil_to_multiple(output->dimension(0), output_tile_size.width); - output_static_window_end_y = ceil_to_multiple(output->dimension(1), output_tile_size.height); - } - else - { - output_static_window_end_x = output->dimension(0); - output_static_window_end_y = std::max(ceil_to_multiple(output->dimension(1), output_tile_size.width), output->dimension(1) + 1 /* For out of bound reads towards the z axis */); - } - - AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration); - AccessWindowStatic output_access(output, 0, 0, output_static_window_end_x, output_static_window_end_y); - window_changed = update_window_and_padding(win, input_access, output_access); - output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); + const int output_static_window_end_x = ceil_to_multiple(output->dimension(0), output_tile_size.width); + const int output_static_window_end_y = ceil_to_multiple(output->dimension(1), output_tile_size.height); - if(bias != nullptr) - { - AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1)); - window_changed = window_changed || update_window_and_padding(win, bias_access); + AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration); + AccessWindowStatic output_access(output, 0, 0, output_static_window_end_x, output_static_window_end_y); + window_changed = update_window_and_padding(win, input_access, output_access); + output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); } Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; @@ -150,6 +139,8 @@ void CLWinogradOutputTransformKernel::configure(const CLCompileContext &compile_ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), winograd_info, act_info)); + auto padding_info = get_padding_info({ input, bias, output }); + _input = input; _bias = bias; _output = output; @@ -160,6 +151,8 @@ void CLWinogradOutputTransformKernel::configure(const CLCompileContext &compile_ const Size2D kernel_size = winograd_info.kernel_size; const Size2D output_tile_size = winograd_info.output_tile_size; const PadStrideInfo conv_info = winograd_info.convolution_info; + const int idx_width = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::HEIGHT); // Compute the number of output tiles along the x and y direction of size "output_tile_size" const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions, @@ -188,6 +181,8 @@ void CLWinogradOutputTransformKernel::configure(const CLCompileContext &compile_ build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width)); build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height)); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(idx_width))); + build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(_output->info()->dimension(idx_height))); build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(_input->info()->dimension(2))); build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL"); build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL"); @@ -215,6 +210,8 @@ void CLWinogradOutputTransformKernel::configure(const CLCompileContext &compile_ _config_id += support::cpp11::to_string(output->info()->dimension(1)); _config_id += "_"; _config_id += lower_string(string_from_data_layout(winograd_info.output_data_layout)); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info) && _is_nhwc); } Status CLWinogradOutputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) diff --git a/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h b/src/core/CL/kernels/CLWinogradOutputTransformKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h rename to src/core/CL/kernels/CLWinogradOutputTransformKernel.h index f7cbd05020..632a5629d9 100644 --- a/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h +++ b/src/core/CL/kernels/CLWinogradOutputTransformKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H #define ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CL/kernels/CLYOLOLayerKernel.cpp b/src/core/CL/kernels/CLYOLOLayerKernel.cpp index 3dd9aa23ce..e12d1e7a65 100644 --- a/src/core/CL/kernels/CLYOLOLayerKernel.cpp +++ b/src/core/CL/kernels/CLYOLOLayerKernel.cpp @@ 
-21,20 +21,22 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLYOLOLayerKernel.h" +#include "src/core/CL/kernels/CLYOLOLayerKernel.h" +#include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/Types.h" #include "support/StringSupport.h" namespace arm_compute @@ -123,7 +125,6 @@ void CLYOLOLayerKernel::configure(const CLCompileContext &compile_context, ICLTe CLBuildOptions build_opts; build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)); - build_opts.add_option("-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(dt)); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(a_const)); build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(b_const)); diff --git a/arm_compute/core/CL/kernels/CLYOLOLayerKernel.h b/src/core/CL/kernels/CLYOLOLayerKernel.h similarity index 99% rename from arm_compute/core/CL/kernels/CLYOLOLayerKernel.h rename to src/core/CL/kernels/CLYOLOLayerKernel.h index 52b069868e..5b1d56e9e5 100644 --- a/arm_compute/core/CL/kernels/CLYOLOLayerKernel.h +++ b/src/core/CL/kernels/CLYOLOLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_CLYOLOLAYERKERNEL_H #define ARM_COMPUTE_CLYOLOLAYERKERNEL_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h b/src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h similarity index 99% rename from arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h rename to src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h index d182e386b8..4c92ae417f 100644 --- a/arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h +++ b/src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_ICLDEPTHWISECONVOLUTIONKERNEL3x3_H #define ARM_COMPUTE_ICLDEPTHWISECONVOLUTIONKERNEL3x3_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/core/CPP/ICPPSimpleKernel.cpp b/src/core/CPP/ICPPSimpleKernel.cpp index 126bf548e2..9e4df5ec8a 100644 --- a/src/core/CPP/ICPPSimpleKernel.cpp +++ b/src/core/CPP/ICPPSimpleKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,6 +26,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" +#include "src/core/helpers/WindowHelpers.h" namespace arm_compute { @@ -71,4 +72,4 @@ Status ICPPSimpleKernel::validate(const ITensorInfo *input, const ITensorInfo *o return Status{}; } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/arm_compute/core/CPP/Validate.h b/src/core/CPP/Validate.h similarity index 100% rename from arm_compute/core/CPP/Validate.h rename to src/core/CPP/Validate.h diff --git a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp index 917a6ad08b..fb1754247c 100644 --- a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp +++ b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,8 +23,8 @@ */ #include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp b/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp index a0cfb3ba8b..a134e3e5c1 100644 --- a/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp +++ b/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,17 +23,9 @@ */ #include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h" -#include "arm_compute/core/Coordinates.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include "support/Mutex.h" +#include "src/core/helpers/WindowHelpers.h" using namespace arm_compute; @@ -92,14 +84,14 @@ void CPPCornerCandidatesKernel::configure(const IImage *input, InternalKeypoint update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration)); - INEKernel::configure(win); + ICPPKernel::configure(win); } void CPPCornerCandidatesKernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); Iterator input(_input, window); execute_window_loop(window, [&](const Coordinates & id) diff --git a/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp index ec03b72b6b..3166faba48 100644 --- a/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp +++ b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -23,7 +23,6 @@ */ #include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include diff --git a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp index 89e3058520..c1187ff2b3 100644 --- a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp +++ b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp @@ -23,10 +23,12 @@ */ #include "arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + #include namespace arm_compute diff --git a/src/core/CPP/kernels/CPPPermuteKernel.cpp b/src/core/CPP/kernels/CPPPermuteKernel.cpp index 1d1f0cd30e..054c7bf05a 100644 --- a/src/core/CPP/kernels/CPPPermuteKernel.cpp +++ b/src/core/CPP/kernels/CPPPermuteKernel.cpp @@ -23,13 +23,10 @@ */ #include "arm_compute/core/CPP/kernels/CPPPermuteKernel.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/src/core/CPP/kernels/CPPTopKVKernel.cpp b/src/core/CPP/kernels/CPPTopKVKernel.cpp index 7ba8d7cdd0..d2b54e412e 100644 --- a/src/core/CPP/kernels/CPPTopKVKernel.cpp +++ b/src/core/CPP/kernels/CPPTopKVKernel.cpp @@ -22,16 +22,14 @@ * SOFTWARE. */ #include "arm_compute/core/CPP/kernels/CPPTopKVKernel.h" -#include "arm_compute/core/Coordinates.h" -#include "arm_compute/core/Error.h" + #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + namespace arm_compute { namespace diff --git a/src/core/CPP/kernels/CPPUpsampleKernel.cpp b/src/core/CPP/kernels/CPPUpsampleKernel.cpp index ff4ffb6124..7ef83fb2c4 100644 --- a/src/core/CPP/kernels/CPPUpsampleKernel.cpp +++ b/src/core/CPP/kernels/CPPUpsampleKernel.cpp @@ -23,13 +23,8 @@ */ #include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/src/core/GLES_COMPUTE/IGCSimpleKernel.cpp b/src/core/GLES_COMPUTE/IGCSimpleKernel.cpp index 6609f457e2..fb31ac8377 100644 --- a/src/core/GLES_COMPUTE/IGCSimpleKernel.cpp +++ b/src/core/GLES_COMPUTE/IGCSimpleKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -27,6 +27,7 @@ #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" using namespace arm_compute; diff --git a/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp index f0a500398b..5e8accc95d 100644 --- a/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp @@ -32,6 +32,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp index 1c02f41286..0173b81cf8 100644 --- a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp @@ -32,6 +32,8 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp index 06c34863d7..f31c8ca156 100644 --- a/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp @@ -34,6 +34,8 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp index 3bd34acb92..9281ce5ffb 100644 --- a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp @@ -23,7 +23,6 @@ */ #include "arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" #include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" @@ -31,6 +30,9 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" diff --git a/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp index 4fe6484cf8..5781c564ea 100644 --- a/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp @@ -24,7 +24,6 @@ #include "arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" @@ -33,6 +32,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include 
"src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" using namespace arm_compute; @@ -42,7 +44,7 @@ GCCol2ImKernel::GCCol2ImKernel() { } -void GCCol2ImKernel::configure(const IGCTensor *input, IGCTensor *output, +void GCCol2ImKernel::configure(const IGCTensor *input, IGCTensor *output, std::pair convolved_dims) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp index 458cb639a3..3256f11e74 100644 --- a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp @@ -32,6 +32,8 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp index cb70dae3ec..95d487b4dd 100644 --- a/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp @@ -23,7 +23,6 @@ */ #include "arm_compute/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" @@ -34,6 +33,9 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" using namespace arm_compute; diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp index 302b21be0d..9ce8acea09 100644 --- a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp @@ -23,7 +23,6 @@ */ #include "arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" @@ -33,6 +32,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" using namespace arm_compute; @@ -44,7 +46,7 @@ GCDirectConvolutionLayerKernel::GCDirectConvolutionLayerKernel() } template -BorderSize GCDirectConvolutionLayerKernel::border_size() const +BorderSize GCDirectConvolutionLayerKernel::border_size() const { return _border_size; } @@ -70,8 +72,8 @@ void GCDirectConvolutionLayerKernel::configure(const IGCTensor *inp } // Get convolved dimensions - unsigned int owidth = 0; - unsigned int oheight = 0; + unsigned int owidth = 0; + unsigned int oheight = 0; std::tie(owidth, oheight) = 
scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_size, kernel_size, conv_info); TensorShape output_shape = input->info()->tensor_shape(); @@ -238,20 +240,20 @@ void GCDirectConvolutionLayerKernel::configure(const IGCTensor *inp num_elems_written_per_iteration_x = 4; #elif defined(PROCESS_4X_2Y_1Z) options.emplace("#define PROCESS_4X_2Y_1Z"); - num_elems_read_per_iteration_x = 4; - num_elems_read_per_iteration_y = 2; + num_elems_read_per_iteration_x = 4; + num_elems_read_per_iteration_y = 2; num_elems_written_per_iteration_x = 4; num_elems_written_per_iteration_y = 2; #elif defined(PROCESS_4X_3Y_1Z) options.emplace("#define PROCESS_4X_3Y_1Z"); - num_elems_read_per_iteration_x = 4; - num_elems_read_per_iteration_y = 3; + num_elems_read_per_iteration_x = 4; + num_elems_read_per_iteration_y = 3; num_elems_written_per_iteration_x = 4; num_elems_written_per_iteration_y = 3; #elif defined(PROCESS_4X_4Y_1Z) options.emplace("#define PROCESS_4X_4Y_1Z"); - num_elems_read_per_iteration_x = 4; - num_elems_read_per_iteration_y = 4; + num_elems_read_per_iteration_x = 4; + num_elems_read_per_iteration_y = 4; num_elems_written_per_iteration_x = 4; num_elems_written_per_iteration_y = 4; #elif defined(PROCESS_4X_2Y_2Z) diff --git a/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp index 5c6722af6a..bda6599f86 100644 --- a/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp @@ -32,6 +32,8 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp index 3b3118bc3d..7ffcdd2f3f 100644 --- a/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp @@ -32,6 +32,8 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp index e0f7e957d8..d395759558 100644 --- a/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp @@ -33,6 +33,8 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" using namespace arm_compute; diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp index c9eb4337fa..66fdde5473 100644 --- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp @@ -23,7 +23,6 @@ */ #include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" #include 
"arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" @@ -33,6 +32,9 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" using namespace arm_compute; diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp index e8298bc327..daad70bba9 100644 --- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp @@ -32,6 +32,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" using namespace arm_compute; diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp index dd03faf2df..2f69728b61 100644 --- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp @@ -23,8 +23,6 @@ */ #include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/AccessWindowTranspose.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" @@ -37,6 +35,10 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/AccessWindowTranspose.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp index 4190163694..1d6ef3d0e8 100644 --- a/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp @@ -23,7 +23,6 @@ */ #include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h" -#include "arm_compute/core/AccessWindowTranspose.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" @@ -33,6 +32,9 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/AccessWindowTranspose.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp index 64f2d63fec..c12dd38cb4 100644 --- a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp @@ -24,7 +24,6 @@ #include "arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" @@ -35,6 +34,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" 
#include "arm_compute/core/Validate.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include @@ -91,7 +93,7 @@ void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, const int stride_y = 0; std::tie(stride_x, stride_y) = conv_info.stride(); - _kernel_dims = std::make_pair(kernel_dims.width, kernel_dims.height); + _kernel_dims = std::make_pair(kernel_dims.width, kernel_dims.height); const bool run_img2col_reduced = (output->info()->dimension(0) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))) && (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3, @@ -109,9 +111,9 @@ void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, const } build_opts.emplace("#define IM2COL_GENERIC"); - _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), - kernel_dims.width, kernel_dims.height, - conv_info, dilation); + _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), + kernel_dims.width, kernel_dims.height, + conv_info, dilation); _num_elems_processed_per_iteration = (input->info()->data_type() == DataType::F32) ? 1 : 2; build_opts.emplace("#define KERNEL_WIDTH " + support::cpp11::to_string(kernel_dims.width)); diff --git a/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp index 5fa1987bf1..c29d9fc4d5 100644 --- a/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp @@ -31,6 +31,8 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp index 6a79990484..971b540a83 100644 --- a/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp @@ -23,7 +23,6 @@ */ #include "arm_compute/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" #include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" @@ -31,6 +30,9 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" diff --git a/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp index 45aa06cc2d..76559146ae 100644 --- a/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp @@ -32,6 +32,8 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" 
#include diff --git a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp index a592c09cc0..13efd10532 100644 --- a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp @@ -23,7 +23,6 @@ */ #include "arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" #include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" @@ -33,6 +32,9 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp index cf10b92dd1..a0795c668f 100644 --- a/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp @@ -23,7 +23,6 @@ */ #include "arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" @@ -33,6 +32,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp index f4ed9617fa..39d586da72 100644 --- a/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp @@ -23,7 +23,6 @@ */ #include "arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" #include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" @@ -33,6 +32,9 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp index d06be9b8a6..78b008484e 100644 --- a/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp @@ -23,7 +23,6 @@ */ #include "arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" @@ -33,6 +32,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" using namespace 
arm_compute; diff --git a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp index 66b4a55bd8..3bec05b5f1 100644 --- a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp @@ -23,7 +23,6 @@ */ #include "arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" @@ -31,6 +30,9 @@ #include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Types.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include diff --git a/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp index 9a430b43cb..bcdbfb60dc 100644 --- a/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp @@ -32,6 +32,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" diff --git a/src/core/Helpers.cpp b/src/core/Helpers.cpp index bfc4a8d101..e692cc1e7c 100644 --- a/src/core/Helpers.cpp +++ b/src/core/Helpers.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,166 +23,10 @@ */ #include "arm_compute/core/Helpers.h" -using namespace arm_compute; - -Window arm_compute::calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size) -{ - if(!skip_border) - { - border_size = BorderSize(0); - } - - const Coordinates &anchor = valid_region.anchor; - const TensorShape &shape = valid_region.shape; - - Window window; - - window.set(0, Window::Dimension( - // Skip the border left of the image - anchor[0] + border_size.left, - // Skip the border right of the image - // Make sure the window width is a multiple of the step size - anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]), - steps[0])); - - size_t n = 1; - - if(anchor.num_dimensions() > 1) - { - window.set(1, Window::Dimension( - // Skip the border above the image - anchor[1] + border_size.top, - // Skip the border below the image - anchor[1] + border_size.top + ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) - static_cast<int>(border_size.top) - static_cast<int>(border_size.bottom)), steps[1]), - steps[1])); - - ++n; - } - - if(anchor.num_dimensions() > 2) - { - window.set(2, Window::Dimension(anchor[2], std::max<size_t>(1, shape[2]), steps[2])); - - ++n; - } - - for(; n < anchor.num_dimensions(); ++n) - { - window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n]))); - } - - for(; n < Coordinates::num_max_dimensions; ++n) - { - window.set(n, Window::Dimension(0, 1)); - } - - return window; -} - -Window arm_compute::calculate_max_enlarged_window(const ValidRegion &valid_region, const Steps &steps, BorderSize border_size) -{ - const Coordinates &anchor = valid_region.anchor; - 
const TensorShape &shape = valid_region.shape; - - Window window; - - window.set(0, Window::Dimension( - // move the anchor to the start from the border - anchor[0] - border_size.left, - // move the anchor to include the right end border - // Make sure the window width is a multiple of the step size - anchor[0] - border_size.left + ceil_to_multiple(shape[0] + border_size.left + border_size.right, steps[0]), - steps[0])); - - size_t n = 1; - - if(anchor.num_dimensions() > 1) - { - window.set(1, Window::Dimension( - // Include the border above the image - anchor[1] - border_size.top, - // Include the border below the image - anchor[1] - border_size.top + ceil_to_multiple(shape[1] + border_size.top + border_size.bottom, steps[1]), - steps[1])); - - ++n; - } - - if(anchor.num_dimensions() > 2) - { - window.set(2, Window::Dimension(0, std::max<size_t>(1, shape[n]), steps[2])); - - ++n; - } - - for(; n < anchor.num_dimensions(); ++n) - { - window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n]))); - } - - for(; n < Coordinates::num_max_dimensions; ++n) - { - window.set(n, Window::Dimension(0, 1)); - } - - return window; -} - -Window arm_compute::calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size) +namespace arm_compute { - if(skip_border) - { - border_size.top = 0; - border_size.bottom = 0; - } - else - { - border_size.left = 0; - border_size.right = 0; - } - - const Coordinates &anchor = valid_region.anchor; - const TensorShape &shape = valid_region.shape; - - Window window; - - window.set(0, Window::Dimension( - // Skip the border left of the image - anchor[0] + border_size.left, - // Skip the border right of the image - // Make sure the window width is a multiple of the step size - anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]), - steps[0])); - - size_t n = 1; - - if(anchor.num_dimensions() > 1) - { - window.set(1, Window::Dimension( - // Skip the border above the image - anchor[1] - border_size.top, - // Skip the border below the image - anchor[1] + shape[1] + border_size.bottom, - 1)); - - ++n; - } - - for(; n < anchor.num_dimensions(); ++n) - { - window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n]))); - } - - for(; n < Coordinates::num_max_dimensions; ++n) - { - window.set(n, Window::Dimension(0, 1)); - } - - return window; -} - -ValidRegion arm_compute::calculate_valid_region_scale(const ITensorInfo &src_info, const TensorShape &dst_shape, - InterpolationPolicy interpolate_policy, SamplingPolicy sampling_policy, bool border_undefined) +ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const TensorShape &dst_shape, + InterpolationPolicy interpolate_policy, SamplingPolicy sampling_policy, bool border_undefined) { const DataLayout data_layout = src_info.data_layout(); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -246,7 +90,7 @@ ValidRegion arm_compute::calculate_valid_region_scale(const ITensorInfo &src_inf } // Setup output valid region - ValidRegion valid_region{ Coordinates(), dst_shape, src_info.tensor_shape().num_dimensions() }; + ValidRegion valid_region{ Coordinates(), dst_shape, dst_shape.num_dimensions() }; valid_region.anchor.set(idx_width, std::max(0, valid_start_out_x)); valid_region.anchor.set(idx_height, std::max(0, valid_start_out_y)); @@ -255,4 +99,5 @@ ValidRegion
arm_compute::calculate_valid_region_scale(const ITensorInfo &src_inf valid_region.shape.set(idx_height, std::min(valid_end_out_y - valid_start_out_y, dst_shape[idx_height])); return valid_region; -} \ No newline at end of file +} +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/KernelTypes.h b/src/core/KernelTypes.h new file mode 100644 index 0000000000..12e6bc90ae --- /dev/null +++ b/src/core/KernelTypes.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_KERNEL_TYPES_H +#define ARM_COMPUTE_KERNEL_TYPES_H + +namespace arm_compute +{ +namespace kernels +{ +/** List of supported logical operations */ +enum class LogicalOperation +{ + Unknown, /**< Unknown */ + And, /**< Logical And && */ + Or, /**< Logical Or || */ + Not, /**< Logical Not ! */ +}; +} // namespace kernels +} // namespace arm_compute +#endif /* ARM_COMPUTE_KERNEL_TYPES_H */ diff --git a/arm_compute/core/NEON/INEKernel.h b/src/core/NEON/INEKernel.h similarity index 97% rename from arm_compute/core/NEON/INEKernel.h rename to src/core/NEON/INEKernel.h index 87e17c80b4..7ad20166d8 100644 --- a/arm_compute/core/NEON/INEKernel.h +++ b/src/core/NEON/INEKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/INESimpleKernel.h b/src/core/NEON/INESimpleKernel.h similarity index 97% rename from arm_compute/core/NEON/INESimpleKernel.h rename to src/core/NEON/INESimpleKernel.h index abe15c15c3..da32d6619e 100644 --- a/arm_compute/core/NEON/INESimpleKernel.h +++ b/src/core/NEON/INESimpleKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. 
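
The new `src/core/KernelTypes.h` above only defines the `LogicalOperation` enumeration; its consumer, `NELogicalKernel`, appears in the NEKernels.h include list further down. As a hedged, self-contained illustration of the semantics the enum encodes (scalar reference code, not the library's vectorized kernel, which operates on whole tensor windows):

```cpp
#include <cstdint>

// Mirrors the enum added in src/core/KernelTypes.h.
enum class LogicalOperation { Unknown, And, Or, Not };

// Scalar reference semantics: any non-zero byte is "true", results are 0 or 1.
inline uint8_t logical_eval(LogicalOperation op, uint8_t a, uint8_t b = 0)
{
    switch(op)
    {
        case LogicalOperation::And:
            return static_cast<uint8_t>((a != 0) && (b != 0));
        case LogicalOperation::Or:
            return static_cast<uint8_t>((a != 0) || (b != 0));
        case LogicalOperation::Not: // unary: the second operand is ignored
            return static_cast<uint8_t>(a == 0);
        default: // LogicalOperation::Unknown
            return 0;
    }
}
```
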
* * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/NEAsymm.h b/src/core/NEON/NEAsymm.h similarity index 99% rename from arm_compute/core/NEON/NEAsymm.h rename to src/core/NEON/NEAsymm.h index d5d824e9ca..70d48d5835 100644 --- a/arm_compute/core/NEON/NEAsymm.h +++ b/src/core/NEON/NEAsymm.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEASYMM_H #define ARM_COMPUTE_NEASYMM_H -#include "arm_compute/core/NEON/NEMath.h" +#include "src/core/NEON/NEMath.h" #include namespace arm_compute @@ -749,5 +749,5 @@ inline uint16x8x2_t vquantize_qasymm16(const float32x4x4_t &qv, const UniformQua return { pa, pb }; } } // namespace arm_compute -#include "arm_compute/core/NEON/NEAsymm.inl" +#include "src/core/NEON/NEAsymm.inl" #endif // ARM_COMPUTE_NEASYMM_H diff --git a/arm_compute/core/NEON/NEAsymm.inl b/src/core/NEON/NEAsymm.inl similarity index 99% rename from arm_compute/core/NEON/NEAsymm.inl rename to src/core/NEON/NEAsymm.inl index d211382f7a..6ee1a336b8 100644 --- a/arm_compute/core/NEON/NEAsymm.inl +++ b/src/core/NEON/NEAsymm.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/src/core/NEON/NEFixedPoint.h similarity index 92% rename from arm_compute/core/NEON/NEFixedPoint.h rename to src/core/NEON/NEFixedPoint.h index 5758264b9a..5c49b25c3e 100644 --- a/arm_compute/core/NEON/NEFixedPoint.h +++ b/src/core/NEON/NEFixedPoint.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -37,5 +37,5 @@ namespace arm_compute */ float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b); } // namespace arm_compute -#include "arm_compute/core/NEON/NEFixedPoint.inl" +#include "src/core/NEON/NEFixedPoint.inl" #endif /* ARM_COMPUTE_NEFIXEDPOINT_H */ \ No newline at end of file diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/src/core/NEON/NEFixedPoint.inl similarity index 97% rename from arm_compute/core/NEON/NEFixedPoint.inl rename to src/core/NEON/NEFixedPoint.inl index c2c2b25fef..8bff9c4a8e 100644 --- a/arm_compute/core/NEON/NEFixedPoint.inl +++ b/src/core/NEON/NEFixedPoint.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h new file mode 100644 index 0000000000..88fb8d4023 --- /dev/null +++ b/src/core/NEON/NEKernels.h @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2016-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_NEKERNELS_H +#define ARM_COMPUTE_NEKERNELS_H + +/* Header regrouping all the NEON kernels */ +#include "src/core/NEON/kernels/NEAbsoluteDifferenceKernel.h" +#include "src/core/NEON/kernels/NEAccumulateKernel.h" +#include "src/core/NEON/kernels/NEActivationLayerKernel.h" +#include "src/core/NEON/kernels/NEArithmeticAdditionKernel.h" +#include "src/core/NEON/kernels/NEArithmeticSubtractionKernel.h" +#include "src/core/NEON/kernels/NEBatchConcatenateLayerKernel.h" +#include "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h" +#include "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h" +#include "src/core/NEON/kernels/NEBitwiseAndKernel.h" +#include "src/core/NEON/kernels/NEBitwiseNotKernel.h" +#include "src/core/NEON/kernels/NEBitwiseOrKernel.h" +#include "src/core/NEON/kernels/NEBitwiseXorKernel.h" +#include "src/core/NEON/kernels/NEBoundingBoxTransformKernel.h" +#include "src/core/NEON/kernels/NEBox3x3Kernel.h" +#include "src/core/NEON/kernels/NECannyEdgeKernel.h" +#include "src/core/NEON/kernels/NEChannelCombineKernel.h" +#include "src/core/NEON/kernels/NEChannelExtractKernel.h" +#include "src/core/NEON/kernels/NEChannelShuffleLayerKernel.h" +#include "src/core/NEON/kernels/NECol2ImKernel.h" +#include "src/core/NEON/kernels/NEColorConvertKernel.h" +#include "src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h" +#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" +#include "src/core/NEON/kernels/NEConvolutionKernel.h" +#include "src/core/NEON/kernels/NECopyKernel.h" +#include "src/core/NEON/kernels/NECropKernel.h" +#include "src/core/NEON/kernels/NECumulativeDistributionKernel.h" +#include "src/core/NEON/kernels/NEDepthConcatenateLayerKernel.h" +#include "src/core/NEON/kernels/NEDepthConvertLayerKernel.h" +#include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h" +#include "src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h" +#include "src/core/NEON/kernels/NEDequantizationLayerKernel.h" +#include "src/core/NEON/kernels/NEDerivativeKernel.h" +#include "src/core/NEON/kernels/NEDilateKernel.h" +#include "src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h" +#include "src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h" +#include "src/core/NEON/kernels/NEElementwiseOperationKernel.h" +#include "src/core/NEON/kernels/NEElementwiseUnaryKernel.h" +#include "src/core/NEON/kernels/NEErodeKernel.h" +#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h" +#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h" +#include "src/core/NEON/kernels/NEFFTScaleKernel.h" +#include "src/core/NEON/kernels/NEFastCornersKernel.h" +#include "src/core/NEON/kernels/NEFillArrayKernel.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/NEON/kernels/NEFlattenLayerKernel.h" +#include "src/core/NEON/kernels/NEFloorKernel.h" +#include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h" +#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" +#include 
"src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h" +#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" +#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" +#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" +#include "src/core/NEON/kernels/NEGatherKernel.h" +#include "src/core/NEON/kernels/NEGaussian3x3Kernel.h" +#include "src/core/NEON/kernels/NEGaussian5x5Kernel.h" +#include "src/core/NEON/kernels/NEGaussianPyramidKernel.h" +#include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h" +#include "src/core/NEON/kernels/NEHOGDescriptorKernel.h" +#include "src/core/NEON/kernels/NEHOGDetectorKernel.h" +#include "src/core/NEON/kernels/NEHarrisCornersKernel.h" +#include "src/core/NEON/kernels/NEHeightConcatenateLayerKernel.h" +#include "src/core/NEON/kernels/NEHistogramKernel.h" +#include "src/core/NEON/kernels/NEIm2ColKernel.h" +#include "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h" +#include "src/core/NEON/kernels/NEIntegralImageKernel.h" +#include "src/core/NEON/kernels/NEL2NormalizeLayerKernel.h" +#include "src/core/NEON/kernels/NELKTrackerKernel.h" +#include "src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h" +#include "src/core/NEON/kernels/NELogicalKernel.h" +#include "src/core/NEON/kernels/NEMagnitudePhaseKernel.h" +#include "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h" +#include "src/core/NEON/kernels/NEMeanStdDevKernel.h" +#include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h" +#include "src/core/NEON/kernels/NEMedian3x3Kernel.h" +#include "src/core/NEON/kernels/NEMemsetKernel.h" +#include "src/core/NEON/kernels/NEMinMaxLayerKernel.h" +#include "src/core/NEON/kernels/NEMinMaxLocationKernel.h" +#include "src/core/NEON/kernels/NENonLinearFilterKernel.h" +#include "src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h" +#include "src/core/NEON/kernels/NENormalizationLayerKernel.h" +#include "src/core/NEON/kernels/NEPadLayerKernel.h" +#include "src/core/NEON/kernels/NEPermuteKernel.h" +#include "src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h" +#include "src/core/NEON/kernels/NEPoolingLayerKernel.h" +#include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h" +#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h" +#include "src/core/NEON/kernels/NEQuantizationLayerKernel.h" +#include "src/core/NEON/kernels/NEROIAlignLayerKernel.h" +#include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h" +#include "src/core/NEON/kernels/NERangeKernel.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" +#include "src/core/NEON/kernels/NERemapKernel.h" +#include "src/core/NEON/kernels/NEReorgLayerKernel.h" +#include "src/core/NEON/kernels/NEReshapeLayerKernel.h" +#include "src/core/NEON/kernels/NEReverseKernel.h" +#include "src/core/NEON/kernels/NEScaleKernel.h" +#include "src/core/NEON/kernels/NEScharr3x3Kernel.h" +#include "src/core/NEON/kernels/NESelectKernel.h" +#include "src/core/NEON/kernels/NESobel3x3Kernel.h" +#include "src/core/NEON/kernels/NESobel5x5Kernel.h" +#include "src/core/NEON/kernels/NESobel7x7Kernel.h" +#include "src/core/NEON/kernels/NESoftmaxLayerKernel.h" +#include "src/core/NEON/kernels/NESpaceToBatchLayerKernel.h" 
+#include "src/core/NEON/kernels/NESpaceToDepthLayerKernel.h" +#include "src/core/NEON/kernels/NEStackLayerKernel.h" +#include "src/core/NEON/kernels/NEStridedSliceKernel.h" +#include "src/core/NEON/kernels/NETableLookupKernel.h" +#include "src/core/NEON/kernels/NEThresholdKernel.h" +#include "src/core/NEON/kernels/NETileKernel.h" +#include "src/core/NEON/kernels/NETransposeKernel.h" +#include "src/core/NEON/kernels/NEUpsampleLayerKernel.h" +#include "src/core/NEON/kernels/NEWarpKernel.h" +#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h" +#include "src/core/NEON/kernels/NEWidthConcatenateLayerKernel.h" +#include "src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h" +#include "src/core/NEON/kernels/NEYOLOLayerKernel.h" + +#endif /* ARM_COMPUTE_NEKERNELS_H */ diff --git a/arm_compute/core/NEON/NEMath.h b/src/core/NEON/NEMath.h similarity index 99% rename from arm_compute/core/NEON/NEMath.h rename to src/core/NEON/NEMath.h index b82a9a341c..877ffb2827 100644 --- a/arm_compute/core/NEON/NEMath.h +++ b/src/core/NEON/NEMath.h @@ -303,5 +303,5 @@ float16x8_t vsinq_f16(float16x8_t val); #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ } // namespace arm_compute -#include "arm_compute/core/NEON/NEMath.inl" +#include "src/core/NEON/NEMath.inl" #endif /* ARM_COMPUTE_NEMATH_H */ diff --git a/arm_compute/core/NEON/NEMath.inl b/src/core/NEON/NEMath.inl similarity index 100% rename from arm_compute/core/NEON/NEMath.inl rename to src/core/NEON/NEMath.inl diff --git a/arm_compute/core/NEON/NESymm.h b/src/core/NEON/NESymm.h similarity index 99% rename from arm_compute/core/NEON/NESymm.h rename to src/core/NEON/NESymm.h index 6dee8705f4..e6644577a1 100644 --- a/arm_compute/core/NEON/NESymm.h +++ b/src/core/NEON/NESymm.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_NESYMM_H #define ARM_COMPUTE_NESYMM_H -#include "arm_compute/core/NEON/NEMath.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/NEON/NEMath.h" #include namespace arm_compute diff --git a/src/core/NEON/NETracePoint.cpp b/src/core/NEON/NETracePoint.cpp index cb0dc1400a..bf48b411ec 100644 --- a/src/core/NEON/NETracePoint.cpp +++ b/src/core/NEON/NETracePoint.cpp @@ -23,9 +23,9 @@ */ #include "arm_compute/core/TracePoint.h" -#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h" -#include "arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h" -#include "arm_compute/core/NEON/kernels/convolution/common/convolution.hpp" +#include "src/core/NEON/kernels/NELKTrackerKernel.h" +#include "src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h" +#include "src/core/NEON/kernels/convolution/common/convolution.hpp" #include "utils/TypePrinter.h" #include diff --git a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp index 3d4800fe15..a6a41b8af9 100644 --- a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp +++ b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h" +#include "src/core/NEON/kernels/NEAbsoluteDifferenceKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -30,6 +30,8 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include diff --git a/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h rename to src/core/NEON/kernels/NEAbsoluteDifferenceKernel.h index 894e9277c7..cc95172f35 100644 --- a/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h +++ b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H #define ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEAccumulateKernel.cpp b/src/core/NEON/kernels/NEAccumulateKernel.cpp index 7c85f698ae..46179cadcb 100644 --- a/src/core/NEON/kernels/NEAccumulateKernel.cpp +++ b/src/core/NEON/kernels/NEAccumulateKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,23 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEAccumulateKernel.h" +#include "src/core/NEON/kernels/NEAccumulateKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include -using namespace arm_compute; - namespace arm_compute { -class Coordinates; -} // namespace arm_compute - /* Max S16 value used for saturation purposes. */ const static uint16x8_t max_int_u16 = vdupq_n_u16(static_cast(INT16_MAX)); @@ -359,3 +356,4 @@ void NEAccumulateSquaredKernel::run(const Window &window, const ThreadInfo &info }, input, accum); } +} // namespace arm_compute \ No newline at end of file diff --git a/arm_compute/core/NEON/kernels/NEAccumulateKernel.h b/src/core/NEON/kernels/NEAccumulateKernel.h similarity index 61% rename from arm_compute/core/NEON/kernels/NEAccumulateKernel.h rename to src/core/NEON/kernels/NEAccumulateKernel.h index 2e9935cd79..af1298f53f 100644 --- a/arm_compute/core/NEON/kernels/NEAccumulateKernel.h +++ b/src/core/NEON/kernels/NEAccumulateKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEACCUMULATEKERNEL_H #define ARM_COMPUTE_NEACCUMULATEKERNEL_H -#include "arm_compute/core/NEON/INESimpleKernel.h" +#include "src/core/NEON/INESimpleKernel.h" #include @@ -44,6 +44,18 @@ class NEAccumulateKernel : public INESimpleKernel { return "NEAccumulateKernel"; } + /** Default constructor */ + NEAccumulateKernel() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEAccumulateKernel(const NEAccumulateKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEAccumulateKernel &operator=(const NEAccumulateKernel &) = delete; + /** Allow instances of this class to be moved */ + NEAccumulateKernel(NEAccumulateKernel &&) = default; + /** Allow instances of this class to be moved */ + NEAccumulateKernel &operator=(NEAccumulateKernel &&) = default; + /** Default destructor */ + ~NEAccumulateKernel() = default; /** Set the input and accumulation tensors * * @param[in] input Source tensor. Data type supported: U8. @@ -73,6 +85,16 @@ class NEAccumulateWeightedKernel : public INESimpleKernel } /** Default constructor */ NEAccumulateWeightedKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEAccumulateWeightedKernel(const NEAccumulateWeightedKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEAccumulateWeightedKernel &operator=(const NEAccumulateWeightedKernel &) = delete; + /** Allow instances of this class to be moved */ + NEAccumulateWeightedKernel(NEAccumulateWeightedKernel &&) = default; + /** Allow instances of this class to be moved */ + NEAccumulateWeightedKernel &operator=(NEAccumulateWeightedKernel &&) = default; + /** Default destructor */ + ~NEAccumulateWeightedKernel() = default; /** Set the input and accumulation tensors, and the scale value * * @param[in] input Source tensor. Data type supported: U8. 
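
The same block of special member functions is being stamped onto each kernel class in this part of the patch; these classes hold raw tensor pointers, so defaulted copies would silently alias live state while moves remain safe. The pattern in isolation, on a hypothetical kernel:

```cpp
#include "arm_compute/core/ITensor.h"
#include "src/core/NEON/INESimpleKernel.h"

// Hypothetical kernel showing the special-member-function block by itself.
class ExampleKernel : public INESimpleKernel
{
public:
    ExampleKernel() = default;
    // Copies are deleted: the kernel holds raw ITensor pointers, and two live
    // copies would alias (and potentially race on) the same tensors.
    ExampleKernel(const ExampleKernel &) = delete;
    ExampleKernel &operator=(const ExampleKernel &) = delete;
    // Moves are allowed: the configured state transfers as a whole.
    ExampleKernel(ExampleKernel &&) = default;
    ExampleKernel &operator=(ExampleKernel &&) = default;
    ~ExampleKernel() = default;
};
```
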
@@ -97,6 +119,18 @@ class NEAccumulateWeightedFP16Kernel : public NEAccumulateWeightedKernel { return "NEAccumulateWeightedFP16Kernel"; } + /** Default constructor */ + NEAccumulateWeightedFP16Kernel() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEAccumulateWeightedFP16Kernel(const NEAccumulateWeightedFP16Kernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEAccumulateWeightedFP16Kernel &operator=(const NEAccumulateWeightedFP16Kernel &) = delete; + /** Allow instances of this class to be moved */ + NEAccumulateWeightedFP16Kernel(NEAccumulateWeightedFP16Kernel &&) = default; + /** Allow instances of this class to be moved */ + NEAccumulateWeightedFP16Kernel &operator=(NEAccumulateWeightedFP16Kernel &&) = default; + /** Default destructor */ + ~NEAccumulateWeightedFP16Kernel() = default; // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; }; @@ -121,6 +155,16 @@ class NEAccumulateSquaredKernel : public INESimpleKernel } /** Default constructor */ NEAccumulateSquaredKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEAccumulateSquaredKernel(const NEAccumulateSquaredKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEAccumulateSquaredKernel &operator=(const NEAccumulateSquaredKernel &) = delete; + /** Allow instances of this class to be moved */ + NEAccumulateSquaredKernel(NEAccumulateSquaredKernel &&) = default; + /** Allow instances of this class to be moved */ + NEAccumulateSquaredKernel &operator=(NEAccumulateSquaredKernel &&) = default; + /** Default destructor */ + ~NEAccumulateSquaredKernel() = default; /** Set the input and accumulation tensors and the shift value. * * @param[in] input Source tensor. Data type supported: U8. diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp index b15df311cc..51257cb96b 100644 --- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp @@ -21,30 +21,88 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
 */
-#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
+#include "src/core/NEON/kernels/NEActivationLayerKernel.h"
 
-#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NESymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "src/core/NEON/kernels/activation/impl/list.h"
+#include "src/core/common/Registrars.h"
 
-#include <map>
 #include <arm_neon.h>
 
 namespace arm_compute
 {
 namespace
 {
+struct ActivationSelectorData
+{
+    DataType dt;
+};
+
+using ActivationSelectorPtr = std::add_pointer<bool(const ActivationSelectorData &data)>::type;
+using ActivationKernelPtr   = std::add_pointer<void(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)>::type;
+
+struct ActivationKernel
+{
+    const char                  *name;
+    const ActivationSelectorPtr is_selected;
+    ActivationKernelPtr         ukernel;
+};
+
+static const ActivationKernel available_kernels[] =
+{
+    {
+        "fp16_neon_activation",
+        [](const ActivationSelectorData & data) { return data.dt == DataType::F16; },
+        REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_activation)
+    },
+    {
+        "fp32_neon_activation",
+        [](const ActivationSelectorData & data) { return data.dt == DataType::F32; },
+        REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_activation)
+    },
+    {
+        "qasymm8_neon_activation",
+        [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8; },
+        REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_activation)
+    },
+    {
+        "qasymm8_signed_neon_activation",
+        [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
+        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_activation)
+    },
+    {
+        "qsymm16_neon_activation",
+        [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16; },
+        REGISTER_QSYMM16_NEON(arm_compute::cpu::qsymm16_neon_activation)
+    },
+};
+
+const ActivationKernel *get_implementation(const ActivationSelectorData &data)
+{
+    for(const auto &uk : available_kernels)
+    {
+        if(uk.is_selected(data))
+        {
+            return &uk;
+        }
+    }
+    return nullptr;
+}
+
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32);
 
+    const auto *uk = get_implementation(ActivationSelectorData{ input->data_type() });
+    ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
     const static std::set<ActivationLayerInfo::ActivationFunction> qasymm8_supported_activations =
     {
         ActivationLayerInfo::ActivationFunction::RELU,
@@ -111,7 +169,7 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
 } // namespace
 
 NEActivationLayerKernel::NEActivationLayerKernel()
-    : _func(nullptr), _act_info()
+    : _act_info()
 {
 }
 
@@ -121,726 +179,14 @@ void NEActivationLayerKernel::configure(const ITensorInfo *input, ITensorInfo *output, const ActivationLayerInfo &activation_info)
 {
     _act_info = activation_info;
 
-    // Disabled activation, thus no operation needed
-    if(!activation_info.enabled())
-    {
-        _func = nullptr;
-    }
-
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, activation_info));
 
-    // Activation functions : FP32
-    static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_f32 =
-    {
-        { ActivationFunction::ABS,
&NEActivationLayerKernel::activation }, - { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation }, - { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation }, - { ActivationFunction::RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::LU_BOUNDED_RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::LEAKY_RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::ELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::SQRT, &NEActivationLayerKernel::activation }, - { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation }, - { ActivationFunction::TANH, &NEActivationLayerKernel::activation }, - { ActivationFunction::IDENTITY, &NEActivationLayerKernel::activation }, - { ActivationFunction::HARD_SWISH, &NEActivationLayerKernel::activation }, - - }; - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - // Activation functions : FP16 - static std::map act_map_f16 = - { - { ActivationFunction::ABS, &NEActivationLayerKernel::activation }, - { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation }, - { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation }, - { ActivationFunction::RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::LU_BOUNDED_RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::LEAKY_RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::ELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::SQRT, &NEActivationLayerKernel::activation }, - { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation }, - { ActivationFunction::TANH, &NEActivationLayerKernel::activation }, - { ActivationFunction::IDENTITY, &NEActivationLayerKernel::activation }, - { ActivationFunction::HARD_SWISH, &NEActivationLayerKernel::activation }, - - }; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/ - - // Activation functions : QASYMM8_SIGNED - static std::map act_map_qasymm8_signed = - { - { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation }, - { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::LU_BOUNDED_RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::TANH, &NEActivationLayerKernel::activation }, - { ActivationFunction::IDENTITY, &NEActivationLayerKernel::activation }, - { ActivationFunction::HARD_SWISH, &NEActivationLayerKernel::activation }, - - }; - - // Activation functions : QASYMM8 - static std::map act_map_qasymm8 = - { - { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation }, - { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::LU_BOUNDED_RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::TANH, &NEActivationLayerKernel::activation }, - { ActivationFunction::IDENTITY, &NEActivationLayerKernel::activation }, - { ActivationFunction::HARD_SWISH, &NEActivationLayerKernel::activation }, - - }; - - // Activation functions : QSYMM16 - static std::map act_map_qsymm16 = - { - { ActivationFunction::LOGISTIC, 
&NEActivationLayerKernel::activation }, - { ActivationFunction::TANH, &NEActivationLayerKernel::activation }, - - }; - - switch(input->data_type()) - { - case DataType::QASYMM8_SIGNED: - _func = act_map_qasymm8_signed[activation_info.activation()]; - break; - case DataType::QASYMM8: - _func = act_map_qasymm8[activation_info.activation()]; - break; - case DataType::QSYMM16: - _func = act_map_qsymm16[activation_info.activation()]; - break; - case DataType::F32: - _func = act_map_f32[activation_info.activation()]; - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - _func = act_map_f16[activation_info.activation()]; - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - } - // Configure kernel window auto win_config = validate_and_configure_window(input, output); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICPPKernel::configure(win_config.second); } -template -typename std::enable_if::value, void>::type -NEActivationLayerKernel::activation(const ITensor *src, ITensor *dst, const Window &window) -{ - /** NEON vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - const int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationFunction act = F; - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - // A small delta added to the input to prevent NAN values caused by zeros in inputs to SQRT -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - const auto delta = wrapper::vdup_n(static_cast(1e-7), ExactTagType {}); -#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - const auto delta = wrapper::vdup_n(static_cast(1e-24), ExactTagType {}); -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - const auto const_1 = wrapper::vdup_n(static_cast(1.f), ExactTagType {}); - const auto const_0 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - const auto const_6 = wrapper::vdup_n(static_cast(6.f), ExactTagType{}); - const auto const_3 = wrapper::vdup_n(static_cast(3.f), ExactTagType{}); - const auto const_inv_6 = wrapper::vdup_n(static_cast(0.166666667f), ExactTagType{}); - - const auto va = wrapper::vdup_n(static_cast(_act_info.a()), ExactTagType{}); - const auto vb = wrapper::vdup_n(static_cast(_act_info.b()), ExactTagType{}); - const auto a = static_cast(_act_info.a()); - const auto b = static_cast(_act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - wrapper::traits::neon_bitvector_t tmp; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - switch(act) - { - case ActivationFunction::ABS: - tmp = wrapper::vabs(vin); - break; - case ActivationFunction::LINEAR: - tmp = wrapper::vmla(vb, va, vin); - break; - case ActivationFunction::LOGISTIC: - tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin)))); - break; - case ActivationFunction::RELU: - tmp = wrapper::vmax(const_0, vin); - break; - case ActivationFunction::BOUNDED_RELU: - tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin)); - break; - case 
ActivationFunction::LU_BOUNDED_RELU: - tmp = wrapper::vmin(va, wrapper::vmax(vb, vin)); - break; - case ActivationFunction::LEAKY_RELU: - tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin)); - break; - case ActivationFunction::SOFT_RELU: - tmp = wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin))); - break; - case ActivationFunction::ELU: - tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1))); - break; - case ActivationFunction::SQRT: - tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, delta))); - break; - case ActivationFunction::SQUARE: - tmp = wrapper::vmul(vin, vin); - break; - case ActivationFunction::TANH: - tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin))); - break; - case ActivationFunction::IDENTITY: - tmp = vin; - break; - case ActivationFunction::HARD_SWISH: - tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3))))); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - wrapper::vstore(output_ptr + x, tmp); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const T in = *(reinterpret_cast(input_ptr + x)); - T tmp; - switch(act) - { - case ActivationFunction::ABS: - tmp = std::abs(in); - break; - case ActivationFunction::LINEAR: - tmp = a * in + b; - break; - case ActivationFunction::LOGISTIC: - tmp = static_cast(1) / (static_cast(1) + std::exp(-in)); - break; - case ActivationFunction::RELU: - tmp = std::max(static_cast(0), in); - break; - case ActivationFunction::BOUNDED_RELU: - tmp = std::min(a, std::max(static_cast(0), in)); - break; - case ActivationFunction::LU_BOUNDED_RELU: - tmp = std::min(a, std::max(b, in)); - break; - case ActivationFunction::LEAKY_RELU: - tmp = (in > 0) ? in : a * in; - break; - case ActivationFunction::SOFT_RELU: - tmp = std::log(static_cast(1) + std::exp(in)); - break; - case ActivationFunction::ELU: - tmp = (in >= 0) ? 
in : a * (std::exp(in) - 1); - break; - case ActivationFunction::SQRT: - tmp = std::sqrt(in); - break; - case ActivationFunction::SQUARE: - tmp = in * in; - break; - case ActivationFunction::TANH: - tmp = a * std::tanh(b * in); - break; - case ActivationFunction::IDENTITY: - tmp = in; - break; - case ActivationFunction::HARD_SWISH: - tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - *(output_ptr + x) = tmp; - } - }, - input, output); -} - -template -typename std::enable_if::value, void>::type NEActivationLayerKernel::activation(const ITensor *src, ITensor *dst, const Window &window) -{ - const int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationFunction act = F; - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); - const qasymm8x16_t va = vdupq_n_u8(quantize_qasymm8(_act_info.a(), qi_in)); - const qasymm8x16_t vb = vdupq_n_u8(quantize_qasymm8(_act_info.b(), qi_in)); - const qasymm8_t a = quantize_qasymm8(_act_info.a(), qi_in); - const qasymm8_t b = quantize_qasymm8(_act_info.b(), qi_in); - const qasymm8_t const_0 = quantize_qasymm8(0.f, qi_in); - const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0); - const auto vconst_1 = vdupq_n_f32(1.f); - const float32x4_t va_f32 = vdupq_n_f32(_act_info.a()); - const float32x4_t vb_f32 = vdupq_n_f32(_act_info.b()); - const float a_f32 = _act_info.a(); - const float b_f32 = _act_info.b(); - const auto const_6_f32 = vdupq_n_f32(6.f); - const auto const_0_f32 = vdupq_n_f32(0.f); - const auto const_3_f32 = vdupq_n_f32(3.f); - const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f); - - // Initialise scale/offset for re-quantization - float s = qi_in.scale / qi_out.scale; - float o = -qi_in.offset * s + qi_out.offset; - float32x4_t vs = vdupq_n_f32(s); - float32x4_t vo = vdupq_n_f32(o); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - wrapper::traits::neon_bitvector_t tmp; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - if(act == ActivationFunction::RELU) - { - // Perform activation - tmp = vmaxq_u8(vconst_0, vin); - // Re-quantize to new output space - tmp = vmlaq_qasymm8(tmp, vs, vo); - } - else if(act == ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8(tmp, vs, vo); - } - else if(act == ActivationFunction::LU_BOUNDED_RELU) - { - // Perform activation - tmp = vminq_u8(va, vmaxq_u8(vb, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8(tmp, vs, vo); - } - else if(act == ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = - { - { - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, 
wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))), - } - }; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } - else if(act == ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = - { - { - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))), - } - }; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } - else if(act == ActivationFunction::HARD_SWISH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = - { - { - wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), - wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), - wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), - wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), - } - }; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - wrapper::vstore(output_ptr + x, tmp); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - T in = *(reinterpret_cast(input_ptr + x)); - T tmp; - if(act == ActivationFunction::RELU) - { - tmp = std::max(const_0, in); - tmp = utility::clamp(tmp * s + o); - } - else if(act == ActivationFunction::BOUNDED_RELU) - { - tmp = std::min(a, std::max(const_0, in)); - tmp = utility::clamp(tmp * s + o); - } - else if(act == ActivationFunction::LU_BOUNDED_RELU) - { - tmp = std::min(a, std::max(b, in)); - tmp = utility::clamp(tmp * s + o); - } - else if(act == ActivationFunction::LOGISTIC) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = 1.f / (1.f + std::exp(-tmp_f)); - tmp = quantize_qasymm8(tmp_f, qi_out); - } - else if(act == ActivationFunction::TANH) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); - tmp = quantize_qasymm8(tmp_f, qi_out); - } - else if(act == ActivationFunction::HARD_SWISH) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); - tmp = quantize_qasymm8(tmp_f, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - *(output_ptr + x) = tmp; - } - }, - input, output); -} - -template -typename std::enable_if::value, void>::type NEActivationLayerKernel::activation(const ITensor *src, ITensor *dst, const Window &window) -{ - const int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast(window.x().start()); - 
const auto window_end_x = static_cast(window.x().end()); - const ActivationFunction act = F; - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); - const qasymm8x16_signed_t va = vdupq_n_s8(quantize_qasymm8_signed(_act_info.a(), qi_in)); - const qasymm8x16_signed_t vb = vdupq_n_s8(quantize_qasymm8_signed(_act_info.b(), qi_in)); - const qasymm8_signed_t a = quantize_qasymm8_signed(_act_info.a(), qi_in); - const qasymm8_signed_t b = quantize_qasymm8_signed(_act_info.b(), qi_in); - const qasymm8_signed_t const_0 = quantize_qasymm8_signed(0.f, qi_in); - const qasymm8x16_signed_t vconst_0 = vdupq_n_s8(const_0); - const auto vconst_1 = vdupq_n_f32(1.f); - const float32x4_t va_f32 = vdupq_n_f32(_act_info.a()); - const float32x4_t vb_f32 = vdupq_n_f32(_act_info.b()); - const float a_f32 = _act_info.a(); - const float b_f32 = _act_info.b(); - const auto const_6_f32 = vdupq_n_f32(6.f); - const auto const_0_f32 = vdupq_n_f32(0.f); - const auto const_3_f32 = vdupq_n_f32(3.f); - const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f); - - // Initialise scale/offset for re-quantization - float s = qi_in.scale / qi_out.scale; - float o = -qi_in.offset * s + qi_out.offset; - float32x4_t vs = vdupq_n_f32(s); - float32x4_t vo = vdupq_n_f32(o); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - wrapper::traits::neon_bitvector_t tmp; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - if(act == ActivationFunction::RELU) - { - // Perform activation - tmp = vmaxq_s8(vconst_0, vin); - // Re-quantize to new output space - tmp = vmlaq_qasymm8_signed(tmp, vs, vo); - } - else if(act == ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8_signed(tmp, vs, vo); - } - else if(act == ActivationFunction::LU_BOUNDED_RELU) - { - // Perform activation - tmp = vminq_s8(va, vmaxq_s8(vb, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8_signed(tmp, vs, vo); - } - else if(act == ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = - { - { - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))), - } - }; - // Re-quantize to new output space - tmp = vquantize_signed(tmp_dep, qi_out); - } - else if(act == ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = - { - { - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), - wrapper::vmul(va_f32, 
wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))), - } - }; - // Re-quantize to new output space - tmp = vquantize_signed(tmp_dep, qi_out); - } - else if(act == ActivationFunction::HARD_SWISH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = - { - { - wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), - wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), - wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), - wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), - } - }; - // Re-quantize to new output space - tmp = vquantize_signed(tmp_dep, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - wrapper::vstore(output_ptr + x, tmp); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - T in = *(reinterpret_cast(input_ptr + x)); - T tmp; - if(act == ActivationFunction::RELU) - { - tmp = std::max(const_0, in); - tmp = utility::clamp(tmp * s + o); - } - else if(act == ActivationFunction::BOUNDED_RELU) - { - tmp = std::min(a, std::max(const_0, in)); - tmp = utility::clamp(tmp * s + o); - } - else if(act == ActivationFunction::LU_BOUNDED_RELU) - { - tmp = std::min(a, std::max(b, in)); - tmp = utility::clamp(tmp * s + o); - } - else if(act == ActivationFunction::LOGISTIC) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = 1.f / (1.f + std::exp(-tmp_f)); - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } - else if(act == ActivationFunction::TANH) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } - else if(act == ActivationFunction::HARD_SWISH) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - *(output_ptr + x) = tmp; - } - }, - input, output); -} - -template -typename std::enable_if::value, void>::type NEActivationLayerKernel::activation(const ITensor *src, ITensor *dst, const Window &window) -{ - const int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationFunction act = F; - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); - const auto vconst_1 = vdupq_n_f32(1.f); - const float32x4_t va_f32 = vdupq_n_f32(_act_info.a()); - const float32x4_t vb_f32 = vdupq_n_f32(_act_info.b()); - const float 
a_f32 = _act_info.a(); - const float b_f32 = _act_info.b(); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - wrapper::traits::neon_bitvector_t tmp; - ARM_COMPUTE_UNUSED(tmp); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - if(act == ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = vdequantize_int16(vin, qi_in.scale); - // Perform activation - const float32x4x2_t tmp_dep = - { - { - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), - } - }; - // Re-quantize to new output space - tmp = vquantize_int16(tmp_dep, qi_out.scale); - } - else if(act == ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = vdequantize_int16(vin, qi_in.scale); - // Perform activation - const float32x4x2_t tmp_dep = - { - { - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), - } - }; - // Re-quantize to new output space - tmp = vquantize_int16(tmp_dep, qi_out.scale); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - wrapper::vstore(output_ptr + x, tmp); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - T in = *(reinterpret_cast(input_ptr + x)); - T tmp; - if(act == ActivationFunction::LOGISTIC) - { - float tmp_f = dequantize_qsymm16(in, qi_in.scale); - tmp_f = 1.f / (1.f + std::exp(-tmp_f)); - tmp = quantize_qsymm16(tmp_f, qi_out); - } - else if(act == ActivationFunction::TANH) - { - float tmp_f = dequantize_qsymm16(in, qi_in.scale); - tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); - tmp = quantize_qsymm16(tmp_f, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - *(output_ptr + x) = tmp; - } - }, - input, output); -} - Status NEActivationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); @@ -861,12 +207,14 @@ void NEActivationLayerKernel::run_op(ITensorPack &tensors, const Window &window, ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); ARM_COMPUTE_ERROR_ON(tensors.empty()); - (this->*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), - tensors.get_tensor(TensorType::ACL_DST), - window); + const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + + const auto *uk = get_implementation(ActivationSelectorData{ src->info()->data_type() }); + + uk->ukernel(src, dst, _act_info, window); } } // namespace arm_compute diff --git a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h b/src/core/NEON/kernels/NEActivationLayerKernel.h similarity index 66% rename from arm_compute/core/NEON/kernels/NEActivationLayerKernel.h rename to src/core/NEON/kernels/NEActivationLayerKernel.h index 325647bd66..783783c6ab 100644 --- a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h +++ b/src/core/NEON/kernels/NEActivationLayerKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H 
#define ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/INEKernel.h" #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #include @@ -54,6 +54,8 @@ class NEActivationLayerKernel : public INEKernel NEActivationLayerKernel &operator=(const NEActivationLayerKernel &) = delete; /** Default move assignment operator */ NEActivationLayerKernel &operator=(NEActivationLayerKernel &&) = default; + /** Default destructor */ + ~NEActivationLayerKernel() = default; /** Set the input and output tensor. * * @note If the output tensor is a nullptr, the activation function will be performed in-place @@ -79,41 +81,7 @@ class NEActivationLayerKernel : public INEKernel void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; private: - using ActivationFunction = ActivationLayerInfo::ActivationFunction; - /** Common signature for all the specialised @ref NEActivationLayerKernel functions - * - * @param[in] window Region on which to execute the kernel. - */ - using ActivationFunctionExecutorPtr = void (NEActivationLayerKernel::*)(const ITensor *src, ITensor *dst, const Window &window); - /** Function to apply an activation function on a tensor. - * - * @param[in] window Region on which to execute the kernel - */ - template - typename std::enable_if::value, void>::type - activation(const ITensor *src, ITensor *dst, const Window &window); - /** Function to apply an activation function on a tensor. - * - * @param[in] window Region on which to execute the kernel - */ - template - typename std::enable_if::value, void>::type activation(const ITensor *src, ITensor *dst, const Window &window); - /** Function to apply an activation function on a tensor. - * - * @param[in] window Region on which to execute the kernel - */ - template - typename std::enable_if::value, void>::type activation(const ITensor *src, ITensor *dst, const Window &window); - /** Function to apply an activation function on a tensor. - * - * @param[in] window Region on which to execute the kernel - */ - template - typename std::enable_if::value, void>::type activation(const ITensor *src, ITensor *dst, const Window &window); - -private: - ActivationFunctionExecutorPtr _func; - ActivationLayerInfo _act_info; + ActivationLayerInfo _act_info; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H */ diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp index 5f5a3e5b37..aa7af54e9c 100644 --- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp +++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp @@ -21,14 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
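With the header change above, NEActivationLayerKernel no longer stores a member-function pointer at all: configure() just validates, and run_op() re-resolves the micro-kernel from the source tensor's data type. A self-contained sketch of the table-driven selection the patch introduces (stand-in kernels and a trimmed-down DataType; the real table lives in the anonymous namespace of NEActivationLayerKernel.cpp and its entries are compiled in through the REGISTER_*_NEON macros):

```cpp
#include <cstdio>

enum class DataType { F32, QASYMM8 };

// Each micro-kernel is a plain function; the table pairs it with a predicate.
using SelectorPtr = bool (*)(DataType);
using KernelPtr   = void (*)(const float *src, float *dst, int len);

void run_fp32(const float *src, float *dst, int len) // stand-in RELU kernel
{
    for(int i = 0; i < len; ++i) dst[i] = src[i] > 0.f ? src[i] : 0.f;
}
void run_qasymm8(const float *, float *, int) { /* quantized path elided */ }

struct Kernel
{
    const char *name;
    SelectorPtr is_selected;
    KernelPtr   ukernel;
};

static const Kernel available_kernels[] =
{
    { "fp32_neon_activation",    [](DataType dt) { return dt == DataType::F32; },     run_fp32 },
    { "qasymm8_neon_activation", [](DataType dt) { return dt == DataType::QASYMM8; }, run_qasymm8 },
};

// First match wins; a nullptr result is what validate_arguments() turns into
// an error before configure time.
const Kernel *get_implementation(DataType dt)
{
    for(const auto &uk : available_kernels)
    {
        if(uk.is_selected(dt)) return &uk;
    }
    return nullptr;
}

int main()
{
    float in[4] = { -1.f, 2.f, -3.f, 4.f }, out[4];
    if(const Kernel *uk = get_implementation(DataType::F32))
    {
        uk->ukernel(in, out, 4);
        std::printf("%s -> %.0f %.0f %.0f %.0f\n", uk->name, out[0], out[1], out[2], out[3]);
    }
}
```

One consequence of resolving at run_op() time is that the kernel object stays stateless apart from _act_info, which is what lets the class drop its _func member and the per-type std::map lookups deleted above.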
*/ -#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h" +#include "src/core/NEON/kernels/NEArithmeticAdditionKernel.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Validate.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include @@ -54,7 +56,7 @@ void add_same(const ITensor *in1, const ITensor *in2, ITensor *out, const Conver constexpr int window_step_x = 16 / sizeof(T); const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); if(is_broadcast_across_x) { @@ -150,7 +152,7 @@ void add_QASYMM8_QASYMM8_QASYMM8(const ITensor *in1, const ITensor *in2, ITensor const int window_step_x = 16; const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform(); const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform(); @@ -343,7 +345,7 @@ void add_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED(const ITensor *in1, const const int window_step_x = 16; const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform(); const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform(); @@ -535,7 +537,7 @@ void add_QSYMM16_QSYMM16_QSYMM16(const ITensor *in1, const ITensor *in2, ITensor const int window_step_x = 8; const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform(); const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform(); diff --git a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h b/src/core/NEON/kernels/NEArithmeticAdditionKernel.h similarity index 99% rename from arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h rename to src/core/NEON/kernels/NEArithmeticAdditionKernel.h index eece5708e8..2072ad91bd 100644 --- a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h +++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H #define ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" #include 
"arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp index 92371936fa..187e97dd49 100644 --- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp +++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp @@ -21,14 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h" +#include "src/core/NEON/kernels/NEArithmeticSubtractionKernel.h" -#include "arm_compute/core/CPP/Validate.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/NESymm.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NESymm.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" namespace arm_compute { @@ -65,7 +67,7 @@ void sub_same(const ITensor *in1, const ITensor *in2, ITensor *out, const Window constexpr int window_step_x = 16 / sizeof(T); const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); @@ -176,7 +178,7 @@ void sub_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const W const int window_step_x = 16; const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform(); const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform(); @@ -370,7 +372,7 @@ void sub_QSYMM16_QSYMM16_QSYMM16(const ITensor *in1, const ITensor *in2, ITensor const int window_step_x = 8; const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform(); const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform(); @@ -669,9 +671,12 @@ inline Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &i { ARM_COMPUTE_UNUSED(policy); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, 
DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16, + DataType::F32); const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); @@ -685,15 +690,16 @@ inline Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &i && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16) && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8) && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16) + && !(input1.data_type() == DataType::S32 && input2.data_type() == DataType::S32) && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32) && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16), "You called subtract with the wrong image formats"); ARM_COMPUTE_RETURN_ERROR_ON_MSG( - input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED && policy == ConvertPolicy::WRAP - && input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && policy == ConvertPolicy::WRAP - && input1.data_type() == DataType::QSYMM16 && input2.data_type() == DataType::QSYMM16 && policy == ConvertPolicy::WRAP, - "Convert policy cannot be WRAP if datatype is QASYMM8 or QASYMM8_SIGNED"); + (input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED && policy == ConvertPolicy::WRAP) + || (input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && policy == ConvertPolicy::WRAP) + || (input1.data_type() == DataType::QSYMM16 && input2.data_type() == DataType::QSYMM16 && policy == ConvertPolicy::WRAP), + "Convert policy cannot be WRAP if datatype is quantized"); // Validate in case of configured output if(output.total_size() > 0) @@ -707,6 +713,7 @@ inline Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &i && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16) && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16) && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16) + && !(input1.data_type() == DataType::S32 && input2.data_type() == DataType::S32 && output.data_type() == DataType::S32) && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32) && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() 
== DataType::F16), "You called subtract with the wrong image formats"); @@ -776,6 +783,10 @@ void NEArithmeticSubtractionKernel::configure(const ITensorInfo *input1, const I _func = &sub_QSYMM16_QSYMM16_QSYMM16; set_data_type_if_unknown(*output, DataType::QSYMM16); break; + case DataType::S32: + _func = &sub_same; + set_format_if_unknown(*output, Format::S32); + break; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: _func = &sub_same; diff --git a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.h similarity index 89% rename from arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h rename to src/core/NEON/kernels/NEArithmeticSubtractionKernel.h index e3a41a2b1c..69952d6162 100644 --- a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h +++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H #define ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -63,12 +63,13 @@ class NEArithmeticSubtractionKernel : public INEKernel * - (S16,U8) -> S16 * - (U8,S16) -> S16 * - (S16,S16) -> S16 + * - (S32,S32) -> S32 * - (F16,F16) -> F16 * - (F32,F32) -> F32 * - * @param[in] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32 - * @param[out] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32. + * @param[in] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[out] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32. * @param[in] policy Overflow policy. Convert policy cannot be WRAP if datatype is quantized. */ void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy); @@ -83,14 +84,13 @@ class NEArithmeticSubtractionKernel : public INEKernel * - (S16,U8) -> S16 * - (U8,S16) -> S16 * - (S16,S16) -> S16 + * - (S32,S32) -> S32 * - (F16,F16) -> F16 * - (F32,F32) -> F32 * - * @note Convert policy cannot be WRAP if datatype is QASYMM8 - * - * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32 - * @param[in] input2 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32 - * @param[in] output The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32. + * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[in] input2 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[in] output The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32. * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. * * @return a status @@ -103,9 +103,9 @@ class NEArithmeticSubtractionKernel : public INEKernel private: /** Common signature for all the specialised sub functions * - * @param[in] input1 An input tensor. 
Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32 - * @param[in] input2 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32 - * @param[out] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32. + * @param[in] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[out] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32. * @param[in] window Region on which to execute the kernel. * @param[in] is_sat Flag to indicate if the policy is SATURATE. */ diff --git a/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp index 0ee6d0efcf..ddf69710f9 100644 --- a/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp @@ -21,17 +21,19 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h" +#include "src/core/NEON/kernels/NEBatchConcatenateLayerKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" namespace arm_compute { diff --git a/arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h b/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h rename to src/core/NEON/kernels/NEBatchConcatenateLayerKernel.h index 478890925b..b74a94805d 100644 --- a/arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h +++ b/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.h @@ -25,7 +25,7 @@ #ifndef ARM_COMPUTE_NEBATCHCONCATENATEKERNEL_H #define ARM_COMPUTE_NEBATCHCONCATENATEKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp index 0651cf28e6..afb08e5d1c 100644 --- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp @@ -21,19 +21,21 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
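Two behavioural fixes are easy to miss inside the add/sub hunks above. First, x-axis broadcasting is now detected by comparing the two inputs' x extents instead of looking for a zero window step; a zero step is how broadcasting manifests, but it is derived state, and the shape comparison checks the source of truth. Second, the WRAP-policy guard for quantized types chained its per-type conditions with && in the old code, a conjunction no pair of inputs could ever satisfy, so the intended error never fired; the patch rewrites it as an OR of parenthesised per-type checks. A condensed sketch of both, with simplified stand-in types:

```cpp
#include <cassert>

enum class DataType { QASYMM8, QASYMM8_SIGNED, QSYMM16, F32 };
enum class ConvertPolicy { WRAP, SATURATE };

// Broadcast along x iff the two inputs genuinely differ in their x extent.
bool is_broadcast_across_x(int in1_x, int in2_x)
{
    return in1_x != in2_x;
}

// WRAP is rejected for quantized types. Each (type, type, WRAP) case must be
// OR-ed: the old version AND-ed them, effectively demanding that one input be
// three different data types at once.
bool wrap_policy_invalid(DataType dt1, DataType dt2, ConvertPolicy policy)
{
    return (dt1 == DataType::QASYMM8 && dt2 == DataType::QASYMM8 && policy == ConvertPolicy::WRAP)
        || (dt1 == DataType::QASYMM8_SIGNED && dt2 == DataType::QASYMM8_SIGNED && policy == ConvertPolicy::WRAP)
        || (dt1 == DataType::QSYMM16 && dt2 == DataType::QSYMM16 && policy == ConvertPolicy::WRAP);
}

int main()
{
    assert(is_broadcast_across_x(16, 1));   // shapes differ: x-broadcast
    assert(!is_broadcast_across_x(16, 16)); // same extent: no broadcast
    assert(wrap_policy_invalid(DataType::QASYMM8, DataType::QASYMM8, ConvertPolicy::WRAP));
    assert(!wrap_policy_invalid(DataType::F32, DataType::F32, ConvertPolicy::WRAP));
}
```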
*/ -#include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h" +#include "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/NEMath.h" -#include "arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" - -#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" #include diff --git a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h similarity index 99% rename from arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h rename to src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h index 962d2565c0..9312073ce8 100644 --- a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h +++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H #define ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp index eb28ce0a8b..10207b9cf6 100644 --- a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp @@ -21,13 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEBatchToSpaceLayerKernel.h" +#include "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" using namespace arm_compute::misc::shape_calculator; diff --git a/arm_compute/core/NEON/kernels/NEBatchToSpaceLayerKernel.h b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h similarity index 99% rename from arm_compute/core/NEON/kernels/NEBatchToSpaceLayerKernel.h rename to src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h index 943577d879..26e8224922 100644 --- a/arm_compute/core/NEON/kernels/NEBatchToSpaceLayerKernel.h +++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEBATCHTOSPACELAYERKERNEL_H #define ARM_COMPUTE_NEBATCHTOSPACELAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp index fa8332e803..4f4de70c3c 100644 --- a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp +++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -21,13 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h" +#include "src/core/NEON/kernels/NEBitwiseAndKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h b/src/core/NEON/kernels/NEBitwiseAndKernel.h similarity index 95% rename from arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h rename to src/core/NEON/kernels/NEBitwiseAndKernel.h index 0e4c886d34..e4603f68f6 100644 --- a/arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h +++ b/src/core/NEON/kernels/NEBitwiseAndKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEBITWISEANDKERNEL_H #define ARM_COMPUTE_NEBITWISEANDKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -52,6 +52,8 @@ class NEBitwiseAndKernel : public INEKernel NEBitwiseAndKernel(NEBitwiseAndKernel &&) = default; /** Allow instances of this class to be moved */ NEBitwiseAndKernel &operator=(NEBitwiseAndKernel &&) = default; + /** Default destructor */ + ~NEBitwiseAndKernel() = default; /** Initialise the kernel's inputs and output * * @param[in] input1 An input tensor. Data type supported: U8. diff --git a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp index 4da07f93b0..c69c4ea046 100644 --- a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp +++ b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,12 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h" +#include "src/core/NEON/kernels/NEBitwiseNotKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h b/src/core/NEON/kernels/NEBitwiseNotKernel.h similarity index 94% rename from arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h rename to src/core/NEON/kernels/NEBitwiseNotKernel.h index a20fdaec93..ba47c38143 100644 --- a/arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h +++ b/src/core/NEON/kernels/NEBitwiseNotKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEBITWISENOTKERNEL_H #define ARM_COMPUTE_NEBITWISENOTKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -52,6 +52,8 @@ class NEBitwiseNotKernel : public INEKernel NEBitwiseNotKernel(NEBitwiseNotKernel &&) = default; /** Allow instances of this class to be moved */ NEBitwiseNotKernel &operator=(NEBitwiseNotKernel &&) = default; + /** Default destructor */ + ~NEBitwiseNotKernel() = default; /** Initialise the kernel's input and output * * @param[in] input An input tensor. Data type supported: U8. diff --git a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp index 591acf50e1..875e6391a5 100644 --- a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp +++ b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,12 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h" +#include "src/core/NEON/kernels/NEBitwiseOrKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h b/src/core/NEON/kernels/NEBitwiseOrKernel.h similarity index 95% rename from arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h rename to src/core/NEON/kernels/NEBitwiseOrKernel.h index 70db5fbeb6..40ef757d60 100644 --- a/arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h +++ b/src/core/NEON/kernels/NEBitwiseOrKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEBITWISEORKERNEL_H #define ARM_COMPUTE_NEBITWISEORKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -52,6 +52,8 @@ class NEBitwiseOrKernel : public INEKernel NEBitwiseOrKernel(NEBitwiseOrKernel &&) = default; /** Allow instances of this class to be moved */ NEBitwiseOrKernel &operator=(NEBitwiseOrKernel &&) = default; + /** Default destructor */ + ~NEBitwiseOrKernel() = default; /** Initialise the kernel's inputs and output. * * @param[in] input1 An input tensor. Data type supported: U8. diff --git a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp index b0aec4078f..603b49d5eb 100644 --- a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp +++ b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,12 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h" +#include "src/core/NEON/kernels/NEBitwiseXorKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h b/src/core/NEON/kernels/NEBitwiseXorKernel.h similarity index 95% rename from arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h rename to src/core/NEON/kernels/NEBitwiseXorKernel.h index 91f24f1c82..24d07a6e18 100644 --- a/arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h +++ b/src/core/NEON/kernels/NEBitwiseXorKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEBITWISEXORKERNEL_H #define ARM_COMPUTE_NEBITWISEXORKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -52,6 +52,8 @@ class NEBitwiseXorKernel : public INEKernel NEBitwiseXorKernel(NEBitwiseXorKernel &&) = default; /** Allow instances of this class to be moved */ NEBitwiseXorKernel &operator=(NEBitwiseXorKernel &&) = default; + /** Default destructor */ + ~NEBitwiseXorKernel() = default; /** Initialise the kernel's input and output. * * @param[in] input1 An input tensor. Data type supported: U8. diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp index 56444dcbc0..03d6e1c600 100644 --- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp +++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp @@ -21,14 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h" +#include "src/core/NEON/kernels/NEBoundingBoxTransformKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include diff --git a/arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h rename to src/core/NEON/kernels/NEBoundingBoxTransformKernel.h index 8b3953a53a..c080ce6a5c 100644 --- a/arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h +++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEBOUNDINGBOXTRANSFORMKERNEL_H #define ARM_COMPUTE_NEBOUNDINGBOXTRANSFORMKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEBox3x3Kernel.cpp b/src/core/NEON/kernels/NEBox3x3Kernel.cpp index d5d03a9def..2aa8aa8e99 100644 --- a/src/core/NEON/kernels/NEBox3x3Kernel.cpp +++ b/src/core/NEON/kernels/NEBox3x3Kernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,14 +21,17 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEBox3x3Kernel.h" +#include "src/core/NEON/kernels/NEBox3x3Kernel.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Validate.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + #include using namespace arm_compute; diff --git a/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h b/src/core/NEON/kernels/NEBox3x3Kernel.h similarity index 65% rename from arm_compute/core/NEON/kernels/NEBox3x3Kernel.h rename to src/core/NEON/kernels/NEBox3x3Kernel.h index 32e991e217..f6a64a7bb4 100644 --- a/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h +++ b/src/core/NEON/kernels/NEBox3x3Kernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEBOX3x3KERNEL_H #define ARM_COMPUTE_NEBOX3x3KERNEL_H -#include "arm_compute/core/NEON/INESimpleKernel.h" +#include "src/core/NEON/INESimpleKernel.h" namespace arm_compute { @@ -38,6 +38,18 @@ class NEBox3x3Kernel : public INESimpleKernel { return "NEBox3x3Kernel"; } + /** Default constructor */ + NEBox3x3Kernel() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEBox3x3Kernel(const NEBox3x3Kernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEBox3x3Kernel &operator=(const NEBox3x3Kernel &) = delete; + /** Allow instances of this class to be moved */ + NEBox3x3Kernel(NEBox3x3Kernel &&) = default; + /** Allow instances of this class to be moved */ + NEBox3x3Kernel &operator=(NEBox3x3Kernel &&) = default; + /** Default destructor */ + ~NEBox3x3Kernel() = default; /** Set the source, destination and border mode of the kernel * * @param[in] input Source tensor. Data type supported: U8. 
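The special-member-function block added to NEBox3x3Kernel above (deleted copy operations, defaulted moves, defaulted destructor) is stamped onto most kernels in this patch. Below is a minimal, self-contained sketch of the pattern and why it is shaped that way; ExampleKernel and its _func member are hypothetical stand-ins, not ACL types:

#include <utility>

class ExampleKernel
{
public:
    /** Default constructor */
    ExampleKernel() = default;
    /** Copying is suppressed: kernels hold raw, non-owning pointers whose lifetime is managed by the caller */
    ExampleKernel(const ExampleKernel &) = delete;
    ExampleKernel &operator=(const ExampleKernel &) = delete;
    /** Moving only transfers the pointer values, so it stays cheap and safe */
    ExampleKernel(ExampleKernel &&) = default;
    ExampleKernel &operator=(ExampleKernel &&) = default;
    /** Explicitly defaulted destructor, matching the declarations added across the kernels */
    ~ExampleKernel() = default;

    bool is_configured() const
    {
        return _func != nullptr;
    }

private:
    void (*_func)(const void *) = nullptr; // non-owning function pointer, like the kernels' _func members
};

int main()
{
    ExampleKernel a;
    ExampleKernel b = std::move(a); // moves compile; `ExampleKernel c = b;` would be rejected
    return b.is_configured() ? 1 : 0;
}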
@@ -60,6 +72,18 @@ class NEBox3x3FP16Kernel : public NEBox3x3Kernel { return "NEBox3x3FP16Kernel"; } + /** Default constructor */ + NEBox3x3FP16Kernel() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEBox3x3FP16Kernel(const NEBox3x3FP16Kernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEBox3x3FP16Kernel &operator=(const NEBox3x3FP16Kernel &) = delete; + /** Allow instances of this class to be moved */ + NEBox3x3FP16Kernel(NEBox3x3FP16Kernel &&) = default; + /** Allow instances of this class to be moved */ + NEBox3x3FP16Kernel &operator=(NEBox3x3FP16Kernel &&) = default; + /** Default destructor */ + ~NEBox3x3FP16Kernel() = default; // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; }; diff --git a/src/core/NEON/kernels/NECannyEdgeKernel.cpp b/src/core/NEON/kernels/NECannyEdgeKernel.cpp index 0278bb08e1..7a2bf20c04 100644 --- a/src/core/NEON/kernels/NECannyEdgeKernel.cpp +++ b/src/core/NEON/kernels/NECannyEdgeKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,9 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h" +#include "src/core/NEON/kernels/NECannyEdgeKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" @@ -31,28 +30,25 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/WindowHelpers.h" #include #include #include #include -using namespace arm_compute; - namespace arm_compute { -class Coordinates; -} // namespace arm_compute - namespace { constexpr int NO_EDGE = 0; constexpr int EDGE = 255; constexpr int MAYBE = 127; -} // namespace -namespace -{ inline uint8x8_t phase_quantization(const float32x4x2_t &gx, const float32x4x2_t &gy) { // Constant use for evaluating score1 and score3 @@ -869,6 +865,8 @@ void edge_trace_U8_U8(uint8_t *__restrict input, uint8_t *__restrict output, con } } // namespace +NEGradientKernel::~NEGradientKernel() = default; + NEGradientKernel::NEGradientKernel() : _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr) { @@ -957,6 +955,7 @@ void NEGradientKernel::run(const Window &window, const ThreadInfo &info) gx, gy, magnitude, phase); } +NEEdgeNonMaxSuppressionKernel::~NEEdgeNonMaxSuppressionKernel() = default; NEEdgeNonMaxSuppressionKernel::NEEdgeNonMaxSuppressionKernel() : _func(nullptr), _magnitude(nullptr), _phase(nullptr), _output(nullptr), _lower_thr(0), _upper_thr(0) { @@ -1041,6 +1040,7 @@ void NEEdgeNonMaxSuppressionKernel::run(const Window &window, const ThreadInfo & magnitude, phase, output); } +NEEdgeTraceKernel::~NEEdgeTraceKernel() = default; NEEdgeTraceKernel::NEEdgeTraceKernel() : _input(nullptr), _output(nullptr) { @@ -1119,3 +1119,4 @@ void NEEdgeTraceKernel::run(const Window &window, const ThreadInfo &info) }, input, output); } +} // namespace arm_compute \ No newline at end of file diff --git a/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h 
b/src/core/NEON/kernels/NECannyEdgeKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NECannyEdgeKernel.h rename to src/core/NEON/kernels/NECannyEdgeKernel.h index c4e1f3ec3a..eff735259d 100644 --- a/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h +++ b/src/core/NEON/kernels/NECannyEdgeKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NECANNYEDGEKERNEL_H #define ARM_COMPUTE_NECANNYEDGEKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include @@ -51,7 +51,7 @@ class NEGradientKernel : public INEKernel /** Allow instances of this class to be moved */ NEGradientKernel &operator=(NEGradientKernel &&) = default; /** Default destructor */ - virtual ~NEGradientKernel() = default; + ~NEGradientKernel(); /** Initialise the kernel's sources, destinations and border mode. * @@ -110,7 +110,7 @@ class NEEdgeNonMaxSuppressionKernel : public INEKernel /** Allow instances of this class to be moved */ NEEdgeNonMaxSuppressionKernel &operator=(NEEdgeNonMaxSuppressionKernel &&) = default; /** Default destructor */ - ~NEEdgeNonMaxSuppressionKernel() = default; + ~NEEdgeNonMaxSuppressionKernel(); /** Initialise the kernel's sources, destination and border mode. * @@ -166,8 +166,8 @@ class NEEdgeTraceKernel : public INEKernel NEEdgeTraceKernel(NEEdgeTraceKernel &&) = default; /** Allow instances of this class to be moved */ NEEdgeTraceKernel &operator=(NEEdgeTraceKernel &&) = default; - /** Default constructor */ - ~NEEdgeTraceKernel() = default; + /** Default destructor */ + ~NEEdgeTraceKernel(); /** Initialise the kernel's source, destination and border mode. * diff --git a/src/core/NEON/kernels/NEChannelCombineKernel.cpp b/src/core/NEON/kernels/NEChannelCombineKernel.cpp index 0de6c4326a..6bfd4c5bda 100644 --- a/src/core/NEON/kernels/NEChannelCombineKernel.cpp +++ b/src/core/NEON/kernels/NEChannelCombineKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEChannelCombineKernel.h" +#include "src/core/NEON/kernels/NEChannelCombineKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -33,6 +33,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include diff --git a/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h b/src/core/NEON/kernels/NEChannelCombineKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEChannelCombineKernel.h rename to src/core/NEON/kernels/NEChannelCombineKernel.h index 5d32aed573..a3372be4d2 100644 --- a/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h +++ b/src/core/NEON/kernels/NEChannelCombineKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H #define ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include #include diff --git a/src/core/NEON/kernels/NEChannelExtractKernel.cpp b/src/core/NEON/kernels/NEChannelExtractKernel.cpp index 800c63606f..d0d1c6852f 100644 --- a/src/core/NEON/kernels/NEChannelExtractKernel.cpp +++ b/src/core/NEON/kernels/NEChannelExtractKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEChannelExtractKernel.h" +#include "src/core/NEON/kernels/NEChannelExtractKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -29,11 +29,13 @@ #include "arm_compute/core/IMultiImage.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/MultiImageInfo.h" -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include diff --git a/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h b/src/core/NEON/kernels/NEChannelExtractKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEChannelExtractKernel.h rename to src/core/NEON/kernels/NEChannelExtractKernel.h index debae2488f..0b2847d79c 100644 --- a/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h +++ b/src/core/NEON/kernels/NEChannelExtractKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H #define ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H -#include "arm_compute/core/NEON/INESimpleKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INESimpleKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp index 88cd0ae514..6e16f24956 100644 --- a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp +++ b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,9 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
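A few hunks above, the Canny kernels change their destructors from inline "= default" in the header to declared-only, with the defaulted definition moved into the .cpp (NEGradientKernel::~NEGradientKernel() = default;). Here is a self-contained sketch of that idiom; Kernel and Impl are hypothetical, and the incomplete-member motivation shown is one common reason, the other being simply to pin the emitted definition to a single translation unit:

#include <memory>

// kernel.h -- only declares the destructor, so no destructor code is
// instantiated in every including translation unit.
struct Impl; // incomplete type at this point

class Kernel
{
public:
    Kernel();
    ~Kernel(); // declared only, no inline definition

private:
    std::unique_ptr<Impl> _impl; // allowed with an incomplete Impl because ~Kernel is out of line
};

// kernel.cpp -- Impl is complete here, so the defaulted destructor can be generated.
struct Impl
{
    int state = 42;
};

Kernel::Kernel() : _impl(std::make_unique<Impl>())
{
}
Kernel::~Kernel() = default; // same shape as NEGradientKernel::~NEGradientKernel()

int main()
{
    Kernel k; // destroys cleanly: the destructor was compiled where Impl is complete
    return 0;
}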
*/ -#include "arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h" +#include "src/core/NEON/kernels/NEChannelShuffleLayerKernel.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" @@ -31,6 +30,9 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" namespace arm_compute { diff --git a/arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h rename to src/core/NEON/kernels/NEChannelShuffleLayerKernel.h index e5bce7e273..c7d09df08e 100644 --- a/arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h +++ b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NECHANNELSHUFFLELAYERKERNEL_H #define ARM_COMPUTE_NECHANNELSHUFFLELAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp index 6a07defd79..97b68d1321 100644 --- a/src/core/NEON/kernels/NECol2ImKernel.cpp +++ b/src/core/NEON/kernels/NECol2ImKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h" +#include "src/core/NEON/kernels/NECol2ImKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -30,6 +30,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NECol2ImKernel.h b/src/core/NEON/kernels/NECol2ImKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NECol2ImKernel.h rename to src/core/NEON/kernels/NECol2ImKernel.h index e988771599..59d1d741b6 100644 --- a/arm_compute/core/NEON/kernels/NECol2ImKernel.h +++ b/src/core/NEON/kernels/NECol2ImKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NECOL2IMKERNEL_H #define ARM_COMPUTE_NECOL2IMKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include "arm_compute/core/Size2D.h" diff --git a/src/core/NEON/kernels/NEColorConvertKernel.cpp b/src/core/NEON/kernels/NEColorConvertKernel.cpp index 1f0796519b..23270d42d1 100644 --- a/src/core/NEON/kernels/NEColorConvertKernel.cpp +++ b/src/core/NEON/kernels/NEColorConvertKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h" +#include "src/core/NEON/kernels/NEColorConvertKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -29,11 +29,14 @@ #include "arm_compute/core/IMultiImage.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/MultiImageInfo.h" -#include "arm_compute/core/NEON/NEColorConvertHelper.inl" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "src/core/NEON/kernels/detail/NEColorConvertHelper.inl" using namespace arm_compute; diff --git a/arm_compute/core/NEON/kernels/NEColorConvertKernel.h b/src/core/NEON/kernels/NEColorConvertKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEColorConvertKernel.h rename to src/core/NEON/kernels/NEColorConvertKernel.h index 88c03b7607..1adb624aae 100644 --- a/arm_compute/core/NEON/kernels/NEColorConvertKernel.h +++ b/src/core/NEON/kernels/NEColorConvertKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_COLORCONVERTKERNEL_H #define ARM_COMPUTE_COLORCONVERTKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp index 97bb8ccb8a..597c283a9c 100644 --- a/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp +++ b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,10 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h" +#include "src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Types.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" namespace arm_compute { diff --git a/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h rename to src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h index dadf9e9b94..766ee8858a 100644 --- a/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h +++ b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTSKERNEL_H #define ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTSKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp index d439f4314d..1f2170f42a 100644 --- a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp +++ b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,15 +21,17 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" +#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" namespace arm_compute { diff --git a/arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h similarity index 95% rename from arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h rename to src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h index 6c74a1216c..2f80361ba5 100644 --- a/arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h +++ b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H #define ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -50,6 +50,8 @@ class NEConvertQuantizedSignednessKernel : public INEKernel NEConvertQuantizedSignednessKernel(NEConvertQuantizedSignednessKernel &&) = default; /** Allow instances of this class to be moved */ NEConvertQuantizedSignednessKernel &operator=(NEConvertQuantizedSignednessKernel &&) = default; + /** Default destructor */ + ~NEConvertQuantizedSignednessKernel() = default; /** Initialize the kernel's input, output. * * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED. diff --git a/src/core/NEON/kernels/NEConvolutionKernel.cpp b/src/core/NEON/kernels/NEConvolutionKernel.cpp index 7103fa1618..bac27430f9 100644 --- a/src/core/NEON/kernels/NEConvolutionKernel.cpp +++ b/src/core/NEON/kernels/NEConvolutionKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h" +#include "src/core/NEON/kernels/NEConvolutionKernel.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Error.h" @@ -32,6 +32,8 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEConvolutionKernel.h b/src/core/NEON/kernels/NEConvolutionKernel.h similarity index 84% rename from arm_compute/core/NEON/kernels/NEConvolutionKernel.h rename to src/core/NEON/kernels/NEConvolutionKernel.h index 51a63335ff..b8bf1d169e 100644 --- a/arm_compute/core/NEON/kernels/NEConvolutionKernel.h +++ b/src/core/NEON/kernels/NEConvolutionKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_NECONVOLUTIONKERNEL_H #define ARM_COMPUTE_NECONVOLUTIONKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/NEON/INESimpleKernel.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/NEON/INESimpleKernel.h" #include #include @@ -61,6 +61,16 @@ class NEConvolutionKernel : public INESimpleKernel } /** Default constructor */ NEConvolutionKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEConvolutionKernel(const NEConvolutionKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEConvolutionKernel &operator=(const NEConvolutionKernel &) = delete; + /** Allow instances of this class to be moved */ + NEConvolutionKernel(NEConvolutionKernel &&) = default; + /** Allow instances of this class to be moved */ + NEConvolutionKernel &operator=(NEConvolutionKernel &&) = default; + /** Default destructor */ + ~NEConvolutionKernel() = default; /** Initialise the kernel's input, output and border mode. * * @param[in] input Source tensor. Data type supported: U8. @@ -108,6 +118,16 @@ class NESeparableConvolutionHorKernel : public INESimpleKernel } /** Default constructor */ NESeparableConvolutionHorKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NESeparableConvolutionHorKernel(const NESeparableConvolutionHorKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NESeparableConvolutionHorKernel &operator=(const NESeparableConvolutionHorKernel &) = delete; + /** Allow instances of this class to be moved */ + NESeparableConvolutionHorKernel(NESeparableConvolutionHorKernel &&) = default; + /** Allow instances of this class to be moved */ + NESeparableConvolutionHorKernel &operator=(NESeparableConvolutionHorKernel &&) = default; + /** Default destructor */ + ~NESeparableConvolutionHorKernel() = default; /** Initialise the kernel's input, output and border mode. * @@ -152,6 +172,16 @@ class NESeparableConvolutionVertKernel : public INESimpleKernel } /** Default constructor */ NESeparableConvolutionVertKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NESeparableConvolutionVertKernel(const NESeparableConvolutionVertKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). 
*/ + NESeparableConvolutionVertKernel &operator=(const NESeparableConvolutionVertKernel &) = delete; + /** Allow instances of this class to be moved */ + NESeparableConvolutionVertKernel(NESeparableConvolutionVertKernel &&) = default; + /** Allow instances of this class to be moved */ + NESeparableConvolutionVertKernel &operator=(NESeparableConvolutionVertKernel &&) = default; + /** Default destructor */ + ~NESeparableConvolutionVertKernel() = default; /** Initialise the kernel's input, output and border mode. * @@ -226,6 +256,8 @@ class NEConvolutionRectangleKernel : public INEKernel NEConvolutionRectangleKernel(NEConvolutionRectangleKernel &&) = default; /** Allow instances of this class to be moved */ NEConvolutionRectangleKernel &operator=(NEConvolutionRectangleKernel &&) = default; + /** Default destructor */ + ~NEConvolutionRectangleKernel() = default; /** Initialise the kernel's input, output and border mode. * * @param[in] input Source tensor. Data type supported: U8. diff --git a/src/core/NEON/kernels/NECopyKernel.cpp b/src/core/NEON/kernels/NECopyKernel.cpp index 3d00139263..337c44c8eb 100644 --- a/src/core/NEON/kernels/NECopyKernel.cpp +++ b/src/core/NEON/kernels/NECopyKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NECopyKernel.h" +#include "src/core/NEON/kernels/NECopyKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -30,6 +30,8 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" namespace arm_compute { diff --git a/arm_compute/core/NEON/kernels/NECopyKernel.h b/src/core/NEON/kernels/NECopyKernel.h similarity index 95% rename from arm_compute/core/NEON/kernels/NECopyKernel.h rename to src/core/NEON/kernels/NECopyKernel.h index ddd14c18b8..62b7b803be 100644 --- a/arm_compute/core/NEON/kernels/NECopyKernel.h +++ b/src/core/NEON/kernels/NECopyKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_NECOPYKERNEL_H #define ARM_COMPUTE_NECOPYKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -49,6 +49,8 @@ class NECopyKernel : public INEKernel NECopyKernel(NECopyKernel &&) = default; /** Allow instances of this class to be moved */ NECopyKernel &operator=(NECopyKernel &&) = default; + /** Default destructor */ + ~NECopyKernel() = default; /** Initialize the kernel's input, output. * * @param[in] input Source tensor. Data types supported: All diff --git a/src/core/NEON/kernels/NECropKernel.cpp b/src/core/NEON/kernels/NECropKernel.cpp index 03bc9f0f75..c94cdaed22 100644 --- a/src/core/NEON/kernels/NECropKernel.cpp +++ b/src/core/NEON/kernels/NECropKernel.cpp @@ -21,19 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NECropKernel.h" +#include "src/core/NEON/kernels/NECropKernel.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Window.h" - -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/helpers/bit_ops.h" +#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/bit_ops.h" namespace arm_compute { diff --git a/arm_compute/core/NEON/kernels/NECropKernel.h b/src/core/NEON/kernels/NECropKernel.h similarity index 99% rename from arm_compute/core/NEON/kernels/NECropKernel.h rename to src/core/NEON/kernels/NECropKernel.h index b7e185f550..742215e22b 100644 --- a/arm_compute/core/NEON/kernels/NECropKernel.h +++ b/src/core/NEON/kernels/NECropKernel.h @@ -24,9 +24,9 @@ #ifndef ARM_COMPUTE_NEON_CROP_KERNEL_H #define ARM_COMPUTE_NEON_CROP_KERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp index cec0e1ce60..58a9a2f1fb 100644 --- a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp +++ b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h" +#include "src/core/NEON/kernels/NECumulativeDistributionKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -30,6 +30,8 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h b/src/core/NEON/kernels/NECumulativeDistributionKernel.h similarity index 96% rename from arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h rename to src/core/NEON/kernels/NECumulativeDistributionKernel.h index e4fe81a5d5..1f8c65b5fa 100644 --- a/arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h +++ b/src/core/NEON/kernels/NECumulativeDistributionKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NECUMULATIVEDISTRIBUTIONKERNEL_H #define ARM_COMPUTE_NECUMULATIVEDISTRIBUTIONKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include @@ -58,6 +58,8 @@ class NECumulativeDistributionKernel : public INEKernel NECumulativeDistributionKernel(NECumulativeDistributionKernel &&) = default; /** Allow instances of this class to be moved */ NECumulativeDistributionKernel &operator=(NECumulativeDistributionKernel &&) = default; + /** Default destructor */ + ~NECumulativeDistributionKernel() = default; /** Set the input and output distribution. * * @param[in] input Input image. Data type supported: U8 diff --git a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp index 6926ec1aac..ba90bfcd4f 100644 --- a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp @@ -21,19 +21,21 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h" +#include "src/core/NEON/kernels/NEDepthConcatenateLayerKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include diff --git a/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h rename to src/core/NEON/kernels/NEDepthConcatenateLayerKernel.h index 3b2b9a1b79..02c5479f93 100644 --- a/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h +++ b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.h @@ -25,7 +25,7 @@ #ifndef ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H #define ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp index 5df3e3ee7d..d6c89a4553 100644 --- a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp @@ -21,19 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h" +#include "src/core/NEON/kernels/NEDepthConvertLayerKernel.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/NEMath.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/SaturateCast.h" - -#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/SaturateCast.h" using namespace arm_compute; diff --git a/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h b/src/core/NEON/kernels/NEDepthConvertLayerKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h rename to src/core/NEON/kernels/NEDepthConvertLayerKernel.h index e297fd7d1b..30fe1ed2e6 100644 --- a/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h +++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_DEPTHCONVERTKERNEL_H #define ARM_COMPUTE_DEPTHCONVERTKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -50,6 +50,8 @@ class NEDepthConvertLayerKernel : public INEKernel NEDepthConvertLayerKernel &operator=(const NEDepthConvertLayerKernel &) = delete; /** Default move assignment operator */ NEDepthConvertLayerKernel &operator=(NEDepthConvertLayerKernel &&) = default; + /** Default destructor */ + ~NEDepthConvertLayerKernel() = default; /** Set the input and output of the kernel * * Valid conversions Input -> Output : diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp index 618a1baf07..6dcc85ec2e 100644 --- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,14 +21,17 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h" +#include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + #include #include @@ -71,7 +74,7 @@ NEDepthToSpaceLayerKernel::NEDepthToSpaceLayerKernel() void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output, int32_t block_shape) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape = compute_depth_to_space_shape(input->info(), block_shape); + TensorShape output_shape = compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape); // Output auto inizialitation if not yet initialized auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); diff --git a/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h rename to src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h index c497b2c858..7e18dd88b8 100644 --- a/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h +++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H #define ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp deleted file mode 100644 index 134ebb0e41..0000000000 --- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp +++ /dev/null @@ -1,317 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h" -#include "arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" - -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CPP/Validate.h" -#include "arm_compute/core/Coordinates.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/TensorShape.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - -namespace arm_compute -{ -namespace -{ -template -class convolver_3x3 -{ -public: - static void convolve(const Window &window, unsigned int num_elems_written_per_iteration, - const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation) - { - const int input_offset = -input->info()->quantization_info().uniform().offset; - const int weights_offset = -weights->info()->quantization_info().uniform().offset; - - const int input_stride_x = input->info()->strides_in_bytes().x(); - const int input_stride_y = input->info()->strides_in_bytes().y(); - const int input_stride_z = input->info()->strides_in_bytes().z(); - const int input_stride_w = input->info()->strides_in_bytes()[3]; - const int output_stride_y = output->info()->strides_in_bytes().y(); - const int kernel_stride_y = weights->info()->strides_in_bytes().y(); - const int kernel_stride_z = weights->info()->strides_in_bytes().z(); - const int output_w = output->info()->dimension(0); - const int output_h = output->info()->dimension(1); - const int delta_input = detail::get_input_num_elems_processed(num_elems_written_per_iteration, stridex); - const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - const unsigned int conv_pad_x = conv_info.pad_left(); - const unsigned int conv_pad_y = conv_info.pad_top(); - - // setup output window for the iterator - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX))); - window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY))); - - // setup input window for the iterator - Window window_in = window; - // Iteration of input is taken care of in execute_window_loop - window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Window window_k = calculate_max_window(*weights->info(), Steps(1u)); - - Iterator in(input, window_in); - Iterator out(output, window_out); - Iterator w(weights, window_k); - - const uint8_t *weights_ptr = w.ptr(); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - int ih = 0; - int oh = 0; - - const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y + (id.z() / depth_multiplier) * input_stride_z + input_stride_w * id[3]; - const uint8_t *ptr_weights_base = weights_ptr + id.z() * kernel_stride_z; - - const auto ptr_weights_r0 = reinterpret_cast(ptr_weights_base); - const auto ptr_weights_r1 = reinterpret_cast(ptr_weights_base + kernel_stride_y); - const auto ptr_weights_r2 = 
reinterpret_cast(ptr_weights_base + kernel_stride_y * 2); - const auto vw_r0 = detail::load_matrix_row(ptr_weights_r0, weights_offset); - const auto vw_r1 = detail::load_matrix_row(ptr_weights_r1, weights_offset); - const auto vw_r2 = detail::load_matrix_row(ptr_weights_r2, weights_offset); - - for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) - { - auto in_top = reinterpret_cast(input_ptr + (ih + 0) * input_stride_y); - auto in_mid = reinterpret_cast(input_ptr + (ih + dilation.y()) * input_stride_y); - auto in_low = reinterpret_cast(input_ptr + (ih + 2 * dilation.y()) * input_stride_y); // uint8/int8 - auto p_out = reinterpret_cast(out.ptr() + oh * output_stride_y); // int32 - - for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, - in_top += delta_input, in_mid += delta_input, in_low += delta_input, - p_out += num_elems_written_per_iteration) - { - if(dilation == Size2D(1U, 1U)) - { - detail::convolve_3x3(in_top, in_mid, in_low, p_out, vw_r0, vw_r1, vw_r2, stridex, input_offset); - } - else - { - auto vres = detail::convolve_3x3_dilation(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, dilation.x(), stridex, input_offset); - detail::store_results(p_out, vres); - } - } - } - }, - out); - } -}; - -template -inline void convolve_3x3(const Window &window, unsigned int num_elems_written_per_iteration, - const ITensor *input, const ITensor *weights, ITensor *output, - const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation) -{ - const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - switch(conv_stride_x) - { - case 1: - convolver_3x3::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier, dilation); - break; - case 2: - convolver_3x3::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier, dilation); - break; - case 3: - convolver_3x3::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier, dilation); - break; - default: - ARM_COMPUTE_ERROR("Not implemented"); - } -} - -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - - const DataLayout data_layout = input->data_layout(); - const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != 3 || weights->dimension(height_idx) != 3); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1 || conv_info.stride().first > 3); - - if(output->total_size() != 0) - { - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); - - if(is_data_type_quantized_asymmetric(input->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON(output->data_type() != DataType::S32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - 
} - } - - return Status{}; -} - -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - const Size2D &dilation) -{ - Window win; - bool window_changed = false; - - // Get convolved dimensions - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); - const DataType output_dt = is_data_type_quantized_asymmetric(input->data_type()) ? DataType::S32 : input->data_type(); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_type(output_dt).set_quantization_info(output->quantization_info())); - - // Configure kernel window (generic) - const unsigned int conv_stride_x = conv_info.stride().first; - const unsigned int conv_stride_y = conv_info.stride().second; - const unsigned int conv_pad_top = conv_info.pad_top(); - const unsigned int conv_pad_left = conv_info.pad_left(); - - unsigned int num_elems_written_per_iteration = 16 >> conv_stride_x; - unsigned int num_elems_read_per_iteration = 0; - - switch(input->data_type()) - { - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - num_elems_read_per_iteration = 16 + 15 * (dilation.x() - 1); - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - num_elems_written_per_iteration = 32 >> conv_stride_x; - num_elems_read_per_iteration = 24 + 23 * (dilation.x() - 1); - break; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - num_elems_read_per_iteration = 12 + 11 * (dilation.x() - 1); - break; - default: - ARM_COMPUTE_ERROR("Data type not supported."); - } - - // Configure kernel window - win = calculate_max_window(*output, Steps(num_elems_written_per_iteration)); - - AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration, 3 + 2 * (dilation.y() - 1), conv_stride_x, conv_stride_y); - AccessWindowStatic weights_access(weights, 0, 0, 3, 3); - AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration); - - window_changed = update_window_and_padding(win, input_access, weights_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} -} // namespace - -NEDepthwiseConvolutionLayer3x3Kernel::NEDepthwiseConvolutionLayer3x3Kernel() - : _border_size(0), _input(), _output(), _weights(), _conv_info(), _num_elems_written_per_iteration(0), _depth_multiplier(1), _dilation() -{ -} - -BorderSize NEDepthwiseConvolutionLayer3x3Kernel::border_size() const -{ - return _border_size; -} - -void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - const Size2D &dilation) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), output->info(), conv_info, depth_multiplier, dilation)); - - _input = input; - _output = output; - _weights = weights; - _conv_info = conv_info; - _depth_multiplier = depth_multiplier; - switch(input->info()->data_type()) - { - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - case DataType::F32: - _num_elems_written_per_iteration = 16 >> _conv_info.stride().first; - break; - case DataType::F16: - _num_elems_written_per_iteration = 32 >> _conv_info.stride().first; - break; - default: - ARM_COMPUTE_ERROR("Data type not supported."); - } - _border_size = BorderSize(_conv_info.pad_top(), _conv_info.pad_right(), _conv_info.pad_bottom(), _conv_info.pad_left()); - _dilation = dilation; - auto win_config = validate_and_configure_window(_input->info(), _weights->info(), _output->info(), _conv_info, _depth_multiplier, dilation); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); -} - -Status NEDepthwiseConvolutionLayer3x3Kernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - const Size2D &dilation) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, output, conv_info, depth_multiplier, dilation)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, depth_multiplier, dilation).first); - return Status{}; -} - -void NEDepthwiseConvolutionLayer3x3Kernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_UNUSED(info); - - ARM_COMPUTE_UNUSED(info); - - switch(_input->info()->data_type()) - { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - convolve_3x3(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier, _dilation); - break; -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - convolve_3x3(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier, _dilation); - break; - case DataType::QASYMM8: - convolve_3x3(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier, _dilation); - break; - case DataType::QASYMM8_SIGNED: - convolve_3x3(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier, _dilation); - break; - default: - ARM_COMPUTE_ERROR("Not implemented"); - } -} -} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp index 62b2531daf..87315909d8 100644 
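The rewrite of NEDepthwiseConvolutionLayerNativeKernel.cpp that follows introduces a DepthwiseConvolutionRunInfo struct whose x_start, x_end, x_step and x_leftover_start fields encode the usual split between a vectorized main loop and a scalar leftover tail: full steps run while x < x_leftover_start = max(x_end - x_step + 1, 0), then the remainder is handled one element at a time. A standalone sketch of how those bounds drive iteration, with plain float adds standing in for the NEON ops:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    const int32_t x_start = 0;
    const int32_t x_end = 13; // 13 elements: not a multiple of the step
    const int32_t x_step = 4; // elements consumed per "vector" iteration
    const int32_t x_leftover_start = std::max(x_end - x_step + 1, 0); // == 10 here

    std::vector<float> data(size_t(x_end), 1.f);
    float sum = 0.f;

    int32_t x = x_start;
    for(; x < x_leftover_start; x += x_step) // full steps: x = 0, 4, 8 cover elements 0..11
    {
        for(int32_t i = 0; i < x_step; ++i)
        {
            sum += data[size_t(x + i)];
        }
    }
    for(; x < x_end; ++x) // scalar tail: element 12
    {
        sum += data[size_t(x)];
    }

    std::printf("sum = %.1f (expected %d)\n", sum, int(x_end));
    return 0;
}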
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp +++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp @@ -21,61 +21,118 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h" +#include "src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CPP/Validate.h" -#include "arm_compute/core/NEON/wrapper/traits.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CPP/Validate.h" #include "src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp" +#include "src/core/NEON/wrapper/traits.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/ToolchainSupport.h" namespace arm_compute { namespace { -void pad_vectors(std::vector &mult, std::vector &shift, int vec_size) +constexpr auto data_layout = DataLayout::NHWC; +const size_t width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); +const size_t height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); +const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + +constexpr auto dim_manual_loop = Window::Dimension(0, 0, 0); +constexpr auto dim_single_unit_step = Window::Dimension(0, 1, 1); +constexpr size_t vector_size = 8; + +struct DepthwiseConvolutionRunInfo { - ARM_COMPUTE_ERROR_ON(mult.size() != shift.size()); - while(mult.size() % vec_size != 0) + const size_t num_read_elements_per_iteration; + const uint32_t x_start; + const uint32_t x_end; + const uint32_t x_step; + const uint32_t x_leftover_start; + const size_t input_stride_y; + const size_t input_stride_z; + const size_t input_max_offset; + const size_t weights_width; + const size_t weights_height; + const size_t weights_stride_y; + const size_t weights_stride_z; + const size_t conv_stride_x; + const size_t conv_stride_y; + const size_t conv_pad_left; + const size_t conv_pad_top; + const size_t input_height; + const size_t input_width; + const size_t input_depth; + + DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1) + : num_read_elements_per_iteration((depth_multiplier == 1 ? 
(vector_size / element_size_from_data_type(input.data_type())) : 1)), + x_start(w.x().start()), + x_end(w.x().end()), + x_step(static_cast(num_read_elements_per_iteration * depth_multiplier)), + x_leftover_start(std::max(static_cast(w.x().end()) - static_cast(x_step) + 1, int32_t(0))), + input_stride_y(input.strides_in_bytes().y()), + input_stride_z(input.strides_in_bytes().z()), + input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()), + weights_width(weights.dimension(width_idx)), + weights_height(weights.dimension(height_idx)), + weights_stride_y(weights.strides_in_bytes().y()), + weights_stride_z(weights.strides_in_bytes().z()), + conv_stride_x(conv_info.stride().first), + conv_stride_y(conv_info.stride().second), + conv_pad_left(conv_info.pad_left()), + conv_pad_top(conv_info.pad_top()), + input_height(input.dimension(height_idx)), + input_width(input.dimension(width_idx)), + input_depth(input.dimension(channel_idx)) { - mult.push_back(0); - shift.push_back(0); } +}; + +inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation) +{ + const int32_t current_h = base_h + h * dilation.y(); + const bool is_valid_h = current_h >= 0 && current_h < static_cast(run_info.input_height); + + const int32_t current_w = base_w + w * dilation.x(); + const bool is_valid_w = current_w >= 0 && current_w < static_cast(run_info.input_width); + + return is_valid_h && is_valid_w; } -template +template void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const Size2D &dilation, const Window &window, bool has_biases) { - using VectorType = typename wrapper::traits::neon_vector::type; - using TagType = typename wrapper::traits::neon_vector::tag_type; - - const size_t input_stride_y = input->info()->strides_in_bytes().y(); - const size_t input_stride_z = input->info()->strides_in_bytes().z(); - const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) * - input->info()->strides_in_bytes().y(); - const size_t weights_width = weights->info()->dimension(1); - const size_t weights_height = weights->info()->dimension(2); - const size_t weights_stride_y = weights->info()->strides_in_bytes().y(); - const size_t weights_stride_z = weights->info()->strides_in_bytes().z(); - const size_t conv_stride_x = conv_info.stride().first; - const size_t conv_stride_y = conv_info.stride().second; - const size_t conv_pad_left = conv_info.pad_left(); - const size_t conv_pad_top = conv_info.pad_top(); + constexpr auto element_per_vector = vector_size / sizeof(T); + using VectorType = typename wrapper::traits::neon_vector::type; + using TagType = typename wrapper::traits::neon_vector::tag_type; + + const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window); + + const VectorType zero_vector = wrapper::vdup_n(static_cast(0), TagType{}); + + Window execution_window = window; + execution_window.set(Window::DimX, dim_single_unit_step); Window win_input = window; - win_input.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_input.set(Window::DimZ, Window::Dimension(0, 0, 0)); + win_input.set(Window::DimX, dim_manual_loop); + win_input.set(Window::DimY, dim_manual_loop); + 
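Two additions above do the work that border handling and configured padding used to do: DepthwiseConvolutionRunInfo caches every stride, extent and loop bound once per run, and is_valid_input_region() tests each dilated filter tap against the input bounds, so out-of-range taps contribute zero (or the quantized out-of-bound value) instead of reading from padding. A standalone sketch of that check under the same NHWC conventions (names mirror the patch; the driver in main() is illustrative):

#include <cstdint>
#include <cstdio>

// A tap (w, h) of the filter, placed at the (possibly negative) anchor
// (base_w, base_h) produced by stride and padding, is valid only if the
// dilated sample position lands inside the input plane.
bool is_valid_input_region(int32_t base_w, int32_t base_h, uint32_t w, uint32_t h,
                           uint32_t input_width, uint32_t input_height,
                           uint32_t dilation_x, uint32_t dilation_y)
{
    const int32_t current_h = base_h + static_cast<int32_t>(h * dilation_y);
    const int32_t current_w = base_w + static_cast<int32_t>(w * dilation_x);
    return current_h >= 0 && current_h < static_cast<int32_t>(input_height)
        && current_w >= 0 && current_w < static_cast<int32_t>(input_width);
}

int main()
{
    // 3x3 filter with dilation 2 on an 8x8 input, anchored at (-1, -1) by a
    // pad-1 convolution: the top-left tap is out of bounds, the next is not.
    std::printf("%d\n", is_valid_input_region(-1, -1, 0, 0, 8, 8, 2, 2)); // prints 0
    std::printf("%d\n", is_valid_input_region(-1, -1, 1, 1, 8, 8, 2, 2)); // prints 1
}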
win_input.set(Window::DimZ, dim_manual_loop); Window win_weights = win_input; - win_weights.set(3, Window::Dimension(0, 0, 0)); + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set(Window::DimX, dim_manual_loop); Iterator input_it(input, win_input); Iterator weights_it(weights, win_weights); - Iterator output_it(output, window); + Iterator output_it(output, win_output); Iterator biases_it{}; if(has_biases) @@ -83,38 +140,80 @@ void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights, biases_it = Iterator(biases, win_weights); } - execute_window_loop(window, [&](const Coordinates & id) + execute_window_loop(execution_window, [&](const Coordinates & id) { - VectorType acc = wrapper::vdup_n(static_cast(0), TagType{}); + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - const int input_y = id.y() * conv_stride_x - conv_pad_left; - const int input_z = id.z() * conv_stride_y - conv_pad_top; - int input_offset = input_y * input_stride_y + input_z * input_stride_z; + auto const base_weights_ptr = weights_it.ptr(); + uint32_t x = run_info.x_start; - auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < weights_height; ++h) + for(; x < run_info.x_leftover_start; x += run_info.x_step) { - int offs = input_offset; - for(size_t w = 0; w < weights_width; ++w) + VectorType acc = zero_vector; + auto weights_ptr = base_weights_ptr; + int64_t input_offset = base_input_offset; + + for(uint32_t h = 0; h < run_info.weights_height; ++h) { - const auto input_vals = wrapper::vload(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), input_max_offset))); - const auto weights_vals = wrapper::vload(reinterpret_cast(weights_ptr + w * weights_stride_y)); + int64_t offs = input_offset + x * sizeof(T); + for(uint32_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = is_valid_region ? 
+ wrapper::vload(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : + zero_vector; + const auto weights_vals = wrapper::vload(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); + acc = wrapper::vmla(acc, weights_vals, input_vals); + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } - acc = wrapper::vmla(acc, weights_vals, input_vals); - offs += dilation.x() * input_stride_y; + if(has_biases) + { + const auto biases_vals = wrapper::vload(reinterpret_cast(biases_it.ptr()) + x); + acc = wrapper::vadd(acc, biases_vals); } - weights_ptr += weights_stride_z; - input_offset += dilation.y() * input_stride_z; + wrapper::vstore(reinterpret_cast(output_it.ptr()) + x, acc); } - if(has_biases) + for(; x < run_info.x_end; ++x) { - const auto biases_vals = wrapper::vload(reinterpret_cast(biases_it.ptr())); - acc = wrapper::vadd(acc, biases_vals); - } + auto acc_scalar = T{ 0 }; + auto weights_ptr = base_weights_ptr; + int64_t input_offset = base_input_offset; + + for(size_t h = 0; h < run_info.weights_height; ++h) + { + int64_t offs = input_offset + x * sizeof(T); + for(size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = is_valid_region ? *reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset)) : 0; + const auto weights_vals = *(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); + + acc_scalar += (input_vals * weights_vals); + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } - wrapper::vstore(reinterpret_cast(output_it.ptr()), acc); + if(has_biases) + { + const auto biases_vals = *(reinterpret_cast(biases_it.ptr()) + x); + acc_scalar += biases_vals; + } + *(reinterpret_cast(output_it.ptr()) + x) = acc_scalar; + } }, input_it, weights_it, biases_it, output_it); } @@ -123,31 +222,28 @@ template void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases) { - const size_t input_stride_y = input->info()->strides_in_bytes().y(); - const size_t input_stride_z = input->info()->strides_in_bytes().z(); - const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) * - input->info()->strides_in_bytes().y(); - const size_t weights_width = weights->info()->dimension(1); - const size_t weights_height = weights->info()->dimension(2); - const size_t weights_stride_y = weights->info()->strides_in_bytes().y(); - const size_t weights_stride_z = weights->info()->strides_in_bytes().z(); - const size_t conv_stride_x = conv_info.stride().first; - const size_t conv_stride_y = conv_info.stride().second; - const size_t conv_pad_left = conv_info.pad_left(); - const size_t conv_pad_top = conv_info.pad_top(); + const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier); - Window win_input = window; - win_input.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_input.set(Window::DimZ, Window::Dimension(0, 0, 0)); + Window execution_window 
= window; + execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); - Window win_weights = win_input; - win_weights.set(3, Window::Dimension(0, 0, 0)); + Window win_input = execution_window; + win_input.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); + win_input.set(Window::DimY, dim_manual_loop); + win_input.set(Window::DimZ, dim_manual_loop); - win_input.set_dimension_step(Window::DimX, 1); + Window win_weights = window; + win_weights.set_dimension_step(Window::DimX, run_info.x_step); + win_weights.set(Window::DimY, dim_manual_loop); + win_weights.set(Window::DimZ, dim_manual_loop); + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set_dimension_step(Window::DimX, run_info.x_step); Iterator input_it(input, win_input); Iterator weights_it(weights, win_weights); - Iterator output_it(output, window); + Iterator output_it(output, win_output); Iterator biases_it{}; if(has_biases) @@ -155,33 +251,34 @@ void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, con biases_it = Iterator(biases, win_weights); } - execute_window_loop(window, [&](const Coordinates & id) + execute_window_loop(execution_window, [&](const Coordinates & id) { std::vector acc(depth_multiplier, static_cast(0)); - const int input_y = id.y() * conv_stride_x - conv_pad_left; - const int input_z = id.z() * conv_stride_y - conv_pad_top; - int input_offset = input_y * input_stride_y + input_z * input_stride_z; + const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < weights_height; ++h) + for(size_t h = 0; h < run_info.weights_height; ++h) { int offs = input_offset; - for(size_t w = 0; w < weights_width; ++w) + for(size_t w = 0; w < run_info.weights_width; ++w) { - const auto input_val = *(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), input_max_offset))); + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = is_valid_region ? 
*(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : T(0); for(size_t m = 0; m < depth_multiplier; ++m) { - const auto weights_val = *(reinterpret_cast(weights_ptr + m * sizeof(T) + w * weights_stride_y)); + const auto weights_val = *(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m)); } - offs += dilation.x() * input_stride_y; + offs += dilation.x() * run_info.input_stride_y; } - weights_ptr += weights_stride_z; - input_offset += dilation.y() * input_stride_z; + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; } if(has_biases) @@ -203,41 +300,43 @@ void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, con input_it, weights_it, biases_it, output_it); } -template +template void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const Size2D &dilation, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) { - using VectorType = typename wrapper::traits::neon_vector::type; - using TagType = typename wrapper::traits::neon_vector::tag_type; - - const size_t input_stride_y = input->info()->strides_in_bytes().y(); - const size_t input_stride_z = input->info()->strides_in_bytes().z(); - const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) * - input->info()->strides_in_bytes().y(); - const size_t weights_width = weights->info()->dimension(1); - const size_t weights_height = weights->info()->dimension(2); - const size_t weights_stride_y = weights->info()->strides_in_bytes().y(); - const size_t weights_stride_z = weights->info()->strides_in_bytes().z(); - const size_t conv_stride_x = conv_info.stride().first; - const size_t conv_stride_y = conv_info.stride().second; - const size_t conv_pad_left = conv_info.pad_left(); - const size_t conv_pad_top = conv_info.pad_top(); + constexpr auto element_per_vector = vector_size / sizeof(T); + using VectorType = typename wrapper::traits::neon_vector::type; + using TagType = typename wrapper::traits::neon_vector::tag_type; + using AccType = int32_t; + using AccArrayType = std::array; + + const auto out_of_bound_value = PixelValue(static_cast(0), input->info()->data_type(), input->info()->quantization_info()).get(); + const auto out_of_bound_vector = wrapper::vdup_n(static_cast(out_of_bound_value), TagType{}); + + const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window); const int32_t input_qoffset = input->info()->quantization_info().uniform().offset; const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset; const int32_t output_qoffset = output->info()->quantization_info().uniform().offset; - const int32_t k_offset = weights_width * weights_height * input_qoffset * weights_qoffset; + const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset; + + Window execution_window = window; + execution_window.set(Window::DimX, dim_single_unit_step); Window win_input = window; - win_input.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_input.set(Window::DimZ, Window::Dimension(0, 0, 0)); + win_input.set(Window::DimX, dim_manual_loop); + win_input.set(Window::DimY, dim_manual_loop); + 
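The quantized loops below accumulate three running sums per output lane: raw products (acc), input sums (in_sum) and weight sums (we_sum). That implements the identity sum((x - x0)(w - w0)) = sum(xw) - w0*sum(x) - x0*sum(w) + N*x0*w0, whose constant last term is the k_offset computed above; the corrected accumulator is then rescaled by a Q31 fixed-point multiplier and shift, and clamped to the output type. A hedged sketch of that requantization arithmetic, assuming the usual gemmlowp-style definitions of the two helpers (the library's internals may differ in detail):

#include <cstdint>
#include <cstdio>
#include <limits>

// Q31 fixed-point multiply: round((a * b * 2) / 2^32), saturating the single
// overflow case a == b == INT32_MIN (gemmlowp-style definition, assumed here).
int32_t saturating_doubling_high_mul(int32_t a, int32_t b)
{
    if(a == std::numeric_limits<int32_t>::min() && b == std::numeric_limits<int32_t>::min())
    {
        return std::numeric_limits<int32_t>::max();
    }
    const int64_t ab    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int64_t nudge = (ab >= 0) ? (1LL << 30) : (1 - (1LL << 30));
    return static_cast<int32_t>((ab + nudge) / (1LL << 31));
}

// Arithmetic shift right by 'exponent' with round-to-nearest.
int32_t rounding_divide_by_exp2(int32_t x, int exponent)
{
    const int32_t mask      = (1 << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
    return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}

int main()
{
    // Requantize one int32 accumulator the way the kernel does per lane;
    // multiplier/shift values are illustrative stand-ins for the per-channel
    // quantities produced by calculate_quantized_multiplier().
    int32_t       acc            = 123456;
    const int32_t out_mul        = 1395864371; // ~0.65 in Q31
    const int32_t out_shift      = 9;
    const int32_t output_qoffset = 128;
    acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;
    // The real kernel then clamps acc to the output type's range (utility::clamp).
    std::printf("%d\n", acc);
}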
win_input.set(Window::DimZ, dim_manual_loop); Window win_weights = win_input; - win_weights.set(3, Window::Dimension(0, 0, 0)); + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set(Window::DimX, dim_manual_loop); Iterator input_it(input, win_input); Iterator weights_it(weights, win_weights); - Iterator output_it(output, window); + Iterator output_it(output, win_output); Iterator biases_it{}; if(has_biases) @@ -245,65 +344,134 @@ void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *w biases_it = Iterator(biases, win_weights); } - execute_window_loop(window, [&](const Coordinates & id) + execute_window_loop(execution_window, [&](const Coordinates & id) { - std::vector acc(S, 0); - std::vector in_sum(S, 0); - std::vector we_sum(S, 0); + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + auto const base_weights_ptr = weights_it.ptr(); + size_t x = run_info.x_start; - const int input_y = id.y() * conv_stride_x - conv_pad_left; - const int input_z = id.z() * conv_stride_y - conv_pad_top; - int input_offset = input_y * input_stride_y + input_z * input_stride_z; - - auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < weights_height; ++h) + for(; x < run_info.x_leftover_start; x += run_info.x_step) { - int offs = input_offset; - for(size_t w = 0; w < weights_width; ++w) + AccArrayType acc{}; + AccArrayType in_sum{}; + AccArrayType we_sum{}; + + auto weights_ptr = base_weights_ptr; + auto input_offset = base_input_offset; + + for(size_t h = 0; h < run_info.weights_height; ++h) + { + int64_t offs = input_offset + x * sizeof(T); + for(size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = is_valid_region ? 
+ wrapper::vload(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : + out_of_bound_vector; + const auto weights_vals = wrapper::vload(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); + + for(size_t i = 0; i < element_per_vector; ++i) + { + acc.at(i) += input_vals[i] * weights_vals[i]; + in_sum.at(i) += input_vals[i]; + we_sum.at(i) += weights_vals[i]; + } + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + VectorType out_vals = wrapper::vdup_n(static_cast(0), TagType{}); + for(size_t i = 0; i < element_per_vector; ++i) { - const auto input_vals = wrapper::vload(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), input_max_offset))); - const auto weights_vals = wrapper::vload(reinterpret_cast(weights_ptr + w * weights_stride_y)); + acc.at(i) -= in_sum.at(i) * weights_qoffset; + acc.at(i) -= we_sum.at(i) * input_qoffset; + acc.at(i) += k_offset; - for(int i = 0; i < S; ++i) + if(has_biases) { - acc.at(i) += input_vals[i] * weights_vals[i]; - in_sum.at(i) += input_vals[i]; - we_sum.at(i) += weights_vals[i]; + acc.at(i) += *(reinterpret_cast(biases_it.ptr() + i * sizeof(int32_t)) + x); } - offs += dilation.x() * input_stride_y; + const int32_t out_mul = output_multiplier.at(x + i); + const int32_t out_shift = output_shift.at(x + i); + if(out_shift < 0) + { + acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset; + } + else + { + acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset; + } + out_vals[i] = static_cast(utility::clamp(acc.at(i))); } - weights_ptr += weights_stride_z; - input_offset += dilation.y() * input_stride_z; + wrapper::vstore(reinterpret_cast(output_it.ptr()) + x, out_vals); } - VectorType out_vals = wrapper::vdup_n(static_cast(0), TagType{}); - for(int i = 0; i < S; ++i) + // left-over + for(; x < run_info.x_end; ++x) { - acc.at(i) -= in_sum.at(i) * weights_qoffset; - acc.at(i) -= we_sum.at(i) * input_qoffset; - acc.at(i) += k_offset; + AccType acc = 0; + AccType in_sum = 0; + AccType we_sum = 0; + + auto weights_ptr = base_weights_ptr; + auto input_offset = base_input_offset; + + for(size_t h = 0; h < run_info.weights_height; ++h) + { + int64_t offs = input_offset + x * sizeof(T); + for(size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = is_valid_region ? 
+ *reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset)) : + out_of_bound_value; + const auto weights_val = *(reinterpret_cast(weights_ptr + w * run_info.weights_stride_y) + x); + + acc += input_val * weights_val; + in_sum += input_val; + we_sum += weights_val; + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + T out_vals{ 0 }; + + acc -= in_sum * weights_qoffset; + acc -= we_sum * input_qoffset; + acc += k_offset; if(has_biases) { - acc.at(i) += *reinterpret_cast(biases_it.ptr() + i * sizeof(int32_t)); + acc += *(reinterpret_cast(biases_it.ptr()) + x); } - const int out_mul = output_multiplier.at(id.x() + i); - const int out_shift = output_shift.at(id.x() + i); + const int32_t out_mul = output_multiplier.at(x); + const int32_t out_shift = output_shift.at(x); + if(out_shift < 0) { - acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset; + acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset; } else { - acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset; + acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset; } - out_vals[i] = static_cast(utility::clamp(acc.at(i))); - } - wrapper::vstore(reinterpret_cast(output_it.ptr()), out_vals); + out_vals = static_cast(utility::clamp(acc)); + *(reinterpret_cast(output_it.ptr()) + x) = out_vals; + } }, input_it, weights_it, biases_it, output_it); } @@ -312,36 +480,36 @@ template void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const Size2D &dilation, unsigned int depth_multiplier, std::vector output_multiplier, std::vector output_shift, const Window &window, bool has_biases) { - const size_t input_stride_y = input->info()->strides_in_bytes().y(); - const size_t input_stride_z = input->info()->strides_in_bytes().z(); - const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) * - input->info()->strides_in_bytes().y(); - const size_t weights_width = weights->info()->dimension(1); - const size_t weights_height = weights->info()->dimension(2); - const size_t weights_stride_y = weights->info()->strides_in_bytes().y(); - const size_t weights_stride_z = weights->info()->strides_in_bytes().z(); - const size_t conv_stride_x = conv_info.stride().first; - const size_t conv_stride_y = conv_info.stride().second; - const size_t conv_pad_left = conv_info.pad_left(); - const size_t conv_pad_top = conv_info.pad_top(); + using AccType = int32_t; + + const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier); + + const auto out_of_bound_value = PixelValue(static_cast(0), input->info()->data_type(), input->info()->quantization_info()).get(); const int32_t input_qoffset = input->info()->quantization_info().uniform().offset; const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset; const int32_t output_qoffset = output->info()->quantization_info().uniform().offset; - const int32_t k_offset = weights_width * weights_height * input_qoffset * weights_qoffset; + const int32_t k_offset = run_info.weights_width * run_info.weights_height * 
input_qoffset * weights_qoffset; - Window win_input = window; - win_input.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_input.set(Window::DimZ, Window::Dimension(0, 0, 0)); + Window execution_window = window; + execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); - Window win_weights = win_input; - win_weights.set(3, Window::Dimension(0, 0, 0)); + Window win_input = execution_window; + win_input.set(Window::DimY, dim_manual_loop); + win_input.set(Window::DimZ, dim_manual_loop); - win_input.set_dimension_step(Window::DimX, 1); + Window win_weights = window; + win_weights.set_dimension_step(Window::DimX, run_info.x_step); + win_weights.set(Window::DimY, dim_manual_loop); + win_weights.set(Window::DimZ, dim_manual_loop); + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set_dimension_step(Window::DimX, run_info.x_step); Iterator input_it(input, win_input); Iterator weights_it(weights, win_weights); - Iterator output_it(output, window); + Iterator output_it(output, win_output); Iterator biases_it{}; if(has_biases) @@ -349,38 +517,39 @@ void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weigh biases_it = Iterator(biases, win_weights); } - execute_window_loop(window, [&](const Coordinates & id) + execute_window_loop(execution_window, [&](const Coordinates & id) { - std::vector acc(depth_multiplier, 0); - std::vector we_sum(depth_multiplier, 0); - int32_t in_sum = 0; + std::vector acc(depth_multiplier, 0); + std::vector we_sum(depth_multiplier, 0); + AccType in_sum = 0; - const int input_y = id.y() * conv_stride_x - conv_pad_left; - const int input_z = id.z() * conv_stride_y - conv_pad_top; - int input_offset = input_y * input_stride_y + input_z * input_stride_z; + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < weights_height; ++h) + for(size_t h = 0; h < run_info.weights_height; ++h) { int offs = input_offset; - for(size_t w = 0; w < weights_width; ++w) + for(size_t w = 0; w < run_info.weights_width; ++w) { - const auto input_val = *(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), input_max_offset))); + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = is_valid_region ? 
*(reinterpret_cast(input_it.ptr() + std::min(static_cast(offs), run_info.input_max_offset))) : out_of_bound_value; for(size_t m = 0; m < depth_multiplier; ++m) { - const auto weights_val = *(reinterpret_cast(weights_ptr + m * sizeof(T) + w * weights_stride_y)); + const auto weights_val = *(reinterpret_cast(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); acc.at(m) += input_val * weights_val; we_sum.at(m) += weights_val; } - offs += dilation.x() * input_stride_y; + offs += dilation.x() * run_info.input_stride_y; in_sum += input_val; } - weights_ptr += weights_stride_z; - input_offset += dilation.y() * input_stride_z; + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; } for(size_t m = 0; m < depth_multiplier; ++m) @@ -394,8 +563,8 @@ void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weigh acc.at(m) += *(reinterpret_cast(biases_it.ptr() + m * sizeof(int32_t))); } - const int out_mul = output_multiplier.at(id.x() + m); - const int out_shift = output_shift.at(id.x() + m); + const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m); + const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m); if(out_shift < 0) { acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset; @@ -404,7 +573,7 @@ void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weigh { acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset; } - *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = static_cast(utility::clamp(acc.at(m))); + *(reinterpret_cast(output_it.ptr() + m * sizeof(T))) = static_cast(utility::clamp(acc.at(m))); } }, input_it, weights_it, biases_it, output_it); @@ -458,54 +627,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, return Status{}; } - -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *biases, - ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const Size2D &dilation) -{ - // Get convolved dimensions - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation); - - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->quantization_info())); - - // Configure kernel window (generic) - const unsigned int num_elems_read_per_iteration = (depth_multiplier == 1) ? 
8 / element_size_from_data_type(input->data_type()) : 1; - const unsigned int num_elems_written_per_iteration = num_elems_read_per_iteration * depth_multiplier; - - // Configure kernel window - Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration)); - - AccessWindowStatic input_access(input, 0, -conv_info.pad_left(), ceil_to_multiple(num_elems_read_per_iteration, input->dimension(0)), - input->dimension(1) + std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top())); - AccessWindowHorizontal weights_access(weights, 0, num_elems_written_per_iteration); - AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration); - - bool window_changed = update_window_and_padding(win, input_access, weights_access, output_access); - - if(biases != nullptr) - { - AccessWindowHorizontal biases_access(biases, 0, num_elems_written_per_iteration); - window_changed |= update_window_and_padding(win, biases_access); - } - - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} } // namespace NEDepthwiseConvolutionLayerNativeKernel::NEDepthwiseConvolutionLayerNativeKernel() - : _func(), _border_size(0), _input(), _weights(), _biases(), _output(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift(), _has_biases() + : _func(), _input(), _weights(), _biases(), _output(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift(), _has_biases() { } -BorderSize NEDepthwiseConvolutionLayerNativeKernel::border_size() const -{ - return _border_size; -} - void NEDepthwiseConvolutionLayerNativeKernel::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation) { @@ -518,7 +646,6 @@ void NEDepthwiseConvolutionLayerNativeKernel::configure(const ITensor *input, co _output = output; _conv_info = conv_info; _depth_multiplier = depth_multiplier; - _border_size = BorderSize(_conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0); _dilation = dilation; _has_biases = (biases != nullptr); @@ -530,17 +657,17 @@ void NEDepthwiseConvolutionLayerNativeKernel::configure(const ITensor *input, co auto weights_scale = weights->info()->quantization_info().scale(); if(!is_data_type_quantized_per_channel(_weights->info()->data_type())) { - for(size_t i = 1; i < _weights->info()->dimension(0); ++i) + for(size_t i = 1; i < _weights->info()->dimension(channel_idx); ++i) { weights_scale.push_back(weights_scale.front()); } } - for(size_t i = 0; i < weights_scale.size(); ++i) + for(const auto &s : weights_scale) { int32_t out_mult = 0; int32_t out_shift = 0; - const float multiplier = input_scale * weights_scale.at(i) / output_scale; + const float multiplier = input_scale * s / output_scale; arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift); _output_multiplier.push_back(out_mult); @@ -551,42 +678,42 @@ void NEDepthwiseConvolutionLayerNativeKernel::configure(const ITensor *input, co switch(_weights->info()->data_type()) { case DataType::QASYMM8: - _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; - pad_vectors(_output_multiplier, _output_shift, 8); + _func = 
&NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; break; case DataType::QASYMM8_SIGNED: - _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; - pad_vectors(_output_multiplier, _output_shift, 8); + _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; break; case DataType::QSYMM8_PER_CHANNEL: if(_input->info()->data_type() == DataType::QASYMM8) { - _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; + _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; } else { - _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; + _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; } - pad_vectors(_output_multiplier, _output_shift, 8); break; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; - pad_vectors(_output_multiplier, _output_shift, 4); + _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; break; #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: - _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; - pad_vectors(_output_multiplier, _output_shift, 2); + _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise; break; default: ARM_COMPUTE_ERROR("Data type not supported"); break; } - auto win_config = validate_and_configure_window(_input->info(), _weights->info(), (biases != nullptr) ? biases->info() : nullptr, _output->info(), _conv_info, _depth_multiplier, dilation); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); + const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation); + auto_init_if_empty(*output->info(), input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info())); + + Window win = calculate_max_window(*output->info(), Steps()); + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + INEKernel::configure(win); } Status NEDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, @@ -594,9 +721,6 @@ Status NEDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *inpu const Size2D &dilation) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, dilation)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), (biases != nullptr) ? 
biases->clone().get() : nullptr, output->clone().get(), conv_info, - depth_multiplier, dilation) - .first); return Status{}; } @@ -609,12 +733,7 @@ void NEDepthwiseConvolutionLayerNativeKernel::run(const Window &window, const Th (this->*_func)(window, _has_biases); } -template < typename T, typename TW, int S, typename std::enable_if < std::is_same::value -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - || std::is_same::value -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - , - int >::type > +template > void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window, bool has_biases) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); @@ -622,7 +741,7 @@ void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window if(_depth_multiplier == 1) { - depthwise_loop_multiplier1_fp(_input, _weights, _biases, _output, _conv_info, _dilation, window, has_biases); + depthwise_loop_multiplier1_fp(_input, _weights, _biases, _output, _conv_info, _dilation, window, has_biases); } else { @@ -630,7 +749,7 @@ void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window } } -template +template > void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window, bool has_biases) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); @@ -638,7 +757,7 @@ void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window if(_depth_multiplier == 1) { - depthwise_loop_multiplier1_quantized(_input, _weights, _biases, _output, _conv_info, _dilation, _output_multiplier, _output_shift, window, has_biases); + depthwise_loop_multiplier1_quantized(_input, _weights, _biases, _output, _conv_info, _dilation, _output_multiplier, _output_shift, window, has_biases); } else { diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h similarity index 89% rename from arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h rename to src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h index 2e29234b6f..713cdcd9d9 100644 --- a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h +++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h @@ -24,8 +24,9 @@ #ifndef ARM_COMPUTE_NEDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H #define ARM_COMPUTE_NEDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/utils/misc/Requires.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/INEKernel.h" +#include "support/Requires.h" #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #include @@ -54,6 +55,8 @@ class NEDepthwiseConvolutionLayerNativeKernel : public INEKernel NEDepthwiseConvolutionLayerNativeKernel(NEDepthwiseConvolutionLayerNativeKernel &&) = default; /** Default move assignment operator */ NEDepthwiseConvolutionLayerNativeKernel &operator=(NEDepthwiseConvolutionLayerNativeKernel &&) = default; + /** Default destructor */ + ~NEDepthwiseConvolutionLayerNativeKernel() = default; /** Initialize the function's source, destination and parameters. 
* * @note Supported data layouts: NHWC @@ -92,18 +95,18 @@ class NEDepthwiseConvolutionLayerNativeKernel : public INEKernel // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; - BorderSize border_size() const override; private: - template < typename T, typename TW, int S, typename std::enable_if < std::is_same::value -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - || std::is_same::value -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - , - int >::type = 0 > + template + using FloatEnalber = typename std::enable_if::value, int>::type; + + template = 0> void run_depthwise(const Window &window, bool has_biases); - template < typename T, typename TW, int S, REQUIRES_TA(std::is_same::value || std::is_same::value) > + template + using Quantized8bitEnalber = typename std::enable_if < std::is_same::value || std::is_same::value, int >::type; + + template = 0> void run_depthwise(const Window &window, bool has_biases); /** Common signature for all the specialised depthwise convolution native functions @@ -113,7 +116,6 @@ class NEDepthwiseConvolutionLayerNativeKernel : public INEKernel using DepthwiseFunctionPtr = void (NEDepthwiseConvolutionLayerNativeKernel::*)(const Window &window, bool has_biases); DepthwiseFunctionPtr _func; - BorderSize _border_size; const ITensor *_input; const ITensor *_weights; const ITensor *_biases; diff --git a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp index fc0933bcd1..36e9c92c56 100644 --- a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,18 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h" +#include "src/core/NEON/kernels/NEDequantizationLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/NESymm.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NESymm.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include diff --git a/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h b/src/core/NEON/kernels/NEDequantizationLayerKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h rename to src/core/NEON/kernels/NEDequantizationLayerKernel.h index 7b97d06e43..9cc71922af 100644 --- a/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h +++ b/src/core/NEON/kernels/NEDequantizationLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEDEQUANTIZATIONLAYERKERNEL_H #define ARM_COMPUTE_NEDEQUANTIZATIONLAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEDerivativeKernel.cpp b/src/core/NEON/kernels/NEDerivativeKernel.cpp index ad590e9f2b..8d641a33b9 100644 --- a/src/core/NEON/kernels/NEDerivativeKernel.cpp +++ b/src/core/NEON/kernels/NEDerivativeKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h" +#include "src/core/NEON/kernels/NEDerivativeKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -29,6 +29,8 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEDerivativeKernel.h b/src/core/NEON/kernels/NEDerivativeKernel.h similarity index 96% rename from arm_compute/core/NEON/kernels/NEDerivativeKernel.h rename to src/core/NEON/kernels/NEDerivativeKernel.h index 7a46a4194e..112b2b0b28 100644 --- a/arm_compute/core/NEON/kernels/NEDerivativeKernel.h +++ b/src/core/NEON/kernels/NEDerivativeKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEDERIVATIVEKERNEL_H #define ARM_COMPUTE_NEDERIVATIVEKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -50,6 +50,8 @@ class NEDerivativeKernel : public INEKernel NEDerivativeKernel(NEDerivativeKernel &&) = default; /** Allow instances of this class to be moved */ NEDerivativeKernel &operator=(NEDerivativeKernel &&) = default; + /** Default destructor */ + ~NEDerivativeKernel() = default; /** Initialise the kernel's sources, destination and border * * @note At least one of output_x or output_y must be set diff --git a/src/core/NEON/kernels/NEDilateKernel.cpp b/src/core/NEON/kernels/NEDilateKernel.cpp index c30dab22c6..dc9ec22c71 100644 --- a/src/core/NEON/kernels/NEDilateKernel.cpp +++ b/src/core/NEON/kernels/NEDilateKernel.cpp @@ -21,13 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEDilateKernel.h" +#include "src/core/NEON/kernels/NEDilateKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include diff --git a/arm_compute/core/NEON/kernels/NEDilateKernel.h b/src/core/NEON/kernels/NEDilateKernel.h similarity index 74% rename from arm_compute/core/NEON/kernels/NEDilateKernel.h rename to src/core/NEON/kernels/NEDilateKernel.h index 424cf549a1..f1d34318ed 100644 --- a/arm_compute/core/NEON/kernels/NEDilateKernel.h +++ b/src/core/NEON/kernels/NEDilateKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEDILATEKERNEL_H #define ARM_COMPUTE_NEDILATEKERNEL_H -#include "arm_compute/core/NEON/INESimpleKernel.h" +#include "src/core/NEON/INESimpleKernel.h" namespace arm_compute { @@ -38,6 +38,18 @@ class NEDilateKernel : public INESimpleKernel { return "NEDilateKernel"; } + /** Default constructor */ + NEDilateKernel() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDilateKernel(const NEDilateKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDilateKernel &operator=(const NEDilateKernel &) = delete; + /** Allow instances of this class to be moved */ + NEDilateKernel(NEDilateKernel &&) = default; + /** Allow instances of this class to be moved */ + NEDilateKernel &operator=(NEDilateKernel &&) = default; + /** Default destructor */ + ~NEDilateKernel() = default; /** Set the source, destination and border mode of the kernel * * @param[in] input Source tensor. Data type supported: U8 diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp index 559b67316f..87b9fb1bf1 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp @@ -21,28 +21,31 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h" -#include "arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" +#include "src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h" + +#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include -#include -using namespace arm_compute; using namespace arm_compute::detail; +namespace arm_compute +{ namespace { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC @@ -472,117 +475,6 @@ inline float32x4x2_t convolve_5x5<3>(const float *in_0, const float *in_1, const return out; } -template -class convolver_nhwc -{ -public: - static void convolve(const Window &window, uint32_t kernel_size, unsigned int num_elems_read_per_iteration, - const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) - { - const int input_width = input->info()->dimension(0); - const int input_depth = input->info()->dimension(2); - const int input_stride_x = input->info()->strides_in_bytes().x(); - const int input_stride_y = input->info()->strides_in_bytes().y(); - const int input_stride_z = input->info()->strides_in_bytes().z(); - const int output_stride_x = output->info()->strides_in_bytes().x(); - const int kernel_stride_x = weights->info()->strides_in_bytes().x(); - const int kernel_stride_y = weights->info()->strides_in_bytes().y(); - const int kernel_stride_z = weights->info()->strides_in_bytes().z(); - const int conv_pad_top = conv_info.pad_top(); - const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - const T1 zero = 0; - - // Setup input window for the input iterator - Window window_in = window; - window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // Setup input window for the output iterator - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Setup input window for the weights iterator - Window window_k = calculate_max_window(*weights->info(), Steps()); - window_k.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_k.set(Window::DimY, Window::Dimension(0, 1, 1)); - window_k.set(Window::DimZ, Window::Dimension(0, 1, 1)); - window_k.set(3, Window::Dimension(0, weights->info()->dimension(3), 1)); - - Iterator in(input, window_in); - Iterator out(output, window_out); - Iterator k(weights, window_k); - - execute_window_loop(window_k, [&](const Coordinates & id_k) - { - execute_window_loop(window_out, [&](const Coordinates & id) - { - const auto in_y = static_cast(id.y() * conv_stride_x - conv_info.pad_left()); - const auto in_z = static_cast(id.z() * conv_stride_y - 
conv_pad_top); - - const uint8_t *in_ptr = in.ptr() + in_y * input_stride_y + in_z * input_stride_z; - uint8_t *out_ptr = out.ptr() + id_k[3] * output_stride_x; - - T1 out_val = 0; - - auto in_addr_base0 = in_ptr; - auto we_addr_base0 = k.ptr(); - - for(uint32_t z = 0; z < kernel_size; ++z, in_addr_base0 += input_stride_z, we_addr_base0 += kernel_stride_z) - { - const int in_z = id.z() * conv_stride_y + z - conv_pad_top; - - if(in_z >= 0 && in_z < input_depth) // If false, pad top/bottom - { - auto in_addr_base1 = in_addr_base0; - auto we_addr_base1 = we_addr_base0; - - for(uint32_t y = 0; y < kernel_size; ++y, in_addr_base1 += input_stride_y, we_addr_base1 += kernel_stride_y) - { - auto out_values = internal_vdupq_n(zero); - - int x = 0; - int no_leftover = input_width - num_elems_read_per_iteration; - - for(; x < no_leftover; x += num_elems_read_per_iteration) - { - const auto in_addr = reinterpret_cast(in_addr_base1 + x * input_stride_x); - const auto in_values = internal_vld1q<1>(in_addr); - - const auto we_addr = reinterpret_cast(we_addr_base1 + x * kernel_stride_x); - const auto we_values = internal_vld1q<1>(we_addr); - - out_values = internal_vmlal(out_values, in_values, we_values); - } - - auto carry_addition = wrapper::vpadd(wrapper::vgethigh(out_values), wrapper::vgetlow(out_values)); - carry_addition = wrapper::vpadd(carry_addition, carry_addition); - out_val += wrapper::vgetlane(carry_addition, 0); - - // Leftover - for(; x < input_width; ++x) - { - const auto in_addr = reinterpret_cast(in_addr_base1 + x * input_stride_x); - const auto in_value = *(in_addr); - - const auto we_addr = reinterpret_cast(we_addr_base1 + x * kernel_stride_x); - const auto we_value = *(we_addr); - - out_val += in_value * we_value; - } - } - } - } - - *(reinterpret_cast(out_ptr)) = out_val; - }, - in, out); - }, - k); - } -}; - template class convolver_3x3 { @@ -815,76 +707,6 @@ class convolver_5x5 } }; -inline void convolve_row1x9_nhwc(const float *row_ptr, const float *weights_ptr, size_t src_stride_y, size_t weights_stride_y, - float32x4_t &acc0, float32x4_t &acc1, float32x4_t &acc2, float32x4_t &acc3) -{ - // Load 4 channels for each of the 12 inputs values along the same X spatial dimension - const float32x4_t src0 = wrapper::vloadq(row_ptr); - const float32x4_t src1 = wrapper::vloadq(row_ptr + 1 * src_stride_y); - const float32x4_t src2 = wrapper::vloadq(row_ptr + 2 * src_stride_y); - const float32x4_t src3 = wrapper::vloadq(row_ptr + 3 * src_stride_y); - const float32x4_t src4 = wrapper::vloadq(row_ptr + 4 * src_stride_y); - const float32x4_t src5 = wrapper::vloadq(row_ptr + 5 * src_stride_y); - const float32x4_t src6 = wrapper::vloadq(row_ptr + 6 * src_stride_y); - const float32x4_t src7 = wrapper::vloadq(row_ptr + 7 * src_stride_y); - const float32x4_t src8 = wrapper::vloadq(row_ptr + 8 * src_stride_y); - const float32x4_t src9 = wrapper::vloadq(row_ptr + 9 * src_stride_y); - const float32x4_t src10 = wrapper::vloadq(row_ptr + 10 * src_stride_y); - const float32x4_t src11 = wrapper::vloadq(row_ptr + 11 * src_stride_y); - - // Load 4 channels for each of the 9 weights values along the same X spatial dimension - const float32x4_t w0 = wrapper::vloadq(weights_ptr); - const float32x4_t w1 = wrapper::vloadq(weights_ptr + 1 * weights_stride_y); - const float32x4_t w2 = wrapper::vloadq(weights_ptr + 2 * weights_stride_y); - const float32x4_t w3 = wrapper::vloadq(weights_ptr + 3 * weights_stride_y); - const float32x4_t w4 = wrapper::vloadq(weights_ptr + 4 * weights_stride_y); - const float32x4_t w5 = 
wrapper::vloadq(weights_ptr + 5 * weights_stride_y); - const float32x4_t w6 = wrapper::vloadq(weights_ptr + 6 * weights_stride_y); - const float32x4_t w7 = wrapper::vloadq(weights_ptr + 7 * weights_stride_y); - const float32x4_t w8 = wrapper::vloadq(weights_ptr + 8 * weights_stride_y); - - // Store 4 channels for each of the 4 output values along the same X spatial dimension - acc0 = wrapper::vmla(acc0, w0, src0); - acc0 = wrapper::vmla(acc0, w1, src1); - acc0 = wrapper::vmla(acc0, w2, src2); - acc0 = wrapper::vmla(acc0, w3, src3); - acc0 = wrapper::vmla(acc0, w4, src4); - acc0 = wrapper::vmla(acc0, w5, src5); - acc0 = wrapper::vmla(acc0, w6, src6); - acc0 = wrapper::vmla(acc0, w7, src7); - acc0 = wrapper::vmla(acc0, w8, src8); - - acc1 = wrapper::vmla(acc1, w0, src1); - acc1 = wrapper::vmla(acc1, w1, src2); - acc1 = wrapper::vmla(acc1, w2, src3); - acc1 = wrapper::vmla(acc1, w3, src4); - acc1 = wrapper::vmla(acc1, w4, src5); - acc1 = wrapper::vmla(acc1, w5, src6); - acc1 = wrapper::vmla(acc1, w6, src7); - acc1 = wrapper::vmla(acc1, w7, src8); - acc1 = wrapper::vmla(acc1, w8, src9); - - acc2 = wrapper::vmla(acc2, w0, src2); - acc2 = wrapper::vmla(acc2, w1, src3); - acc2 = wrapper::vmla(acc2, w2, src4); - acc2 = wrapper::vmla(acc2, w3, src5); - acc2 = wrapper::vmla(acc2, w4, src6); - acc2 = wrapper::vmla(acc2, w5, src7); - acc2 = wrapper::vmla(acc2, w6, src8); - acc2 = wrapper::vmla(acc2, w7, src9); - acc2 = wrapper::vmla(acc2, w8, src10); - - acc3 = wrapper::vmla(acc3, w0, src3); - acc3 = wrapper::vmla(acc3, w1, src4); - acc3 = wrapper::vmla(acc3, w2, src5); - acc3 = wrapper::vmla(acc3, w3, src6); - acc3 = wrapper::vmla(acc3, w4, src7); - acc3 = wrapper::vmla(acc3, w5, src8); - acc3 = wrapper::vmla(acc3, w6, src9); - acc3 = wrapper::vmla(acc3, w7, src10); - acc3 = wrapper::vmla(acc3, w8, src11); -} - float vreduce(const float32x4_t &v) { auto v0 = wrapper::vgethigh(v); @@ -896,175 +718,6 @@ float vreduce(const float32x4_t &v) return a + b; } -template -class convolver_9x9_nhwc -{ -public: - static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, - const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) - { - // Declare useful types - using vector_type = typename V::type; - using scalar_type = typename V::scalar_type; - using tag_type = typename V::tag_type; - - // Scalar quantities - const int element_size = input->info()->element_size(); - const int input_width = input->info()->dimension(0); - const int input_depth = input->info()->dimension(2); - const int input_stride_y = input->info()->strides_in_bytes().y() / element_size; - const int input_stride_z = input->info()->strides_in_bytes().z() / element_size; - const int input_stride_w = input->info()->strides_in_bytes()[3]; - const int output_stride_x = output->info()->strides_in_bytes().x(); - const int output_stride_y = output->info()->strides_in_bytes().y(); - const int kernel_stride_y = weights->info()->strides_in_bytes().y() / element_size; - const int kernel_stride_z = weights->info()->strides_in_bytes().z() / element_size; - const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - const unsigned int conv_pad_top = conv_info.pad_top(); - const unsigned int conv_pad_left = conv_info.pad_left(); - - // Setup input window for the input iterator - Window window_in = window; - window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - // 
Setup input window for the output iterator - Window window_out = window; - window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Setup input window for the weights iterator - Window window_k = calculate_max_window(*weights->info(), Steps()); - window_k.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_k.set(Window::DimY, Window::Dimension(0, 1, 1)); - window_k.set(Window::DimZ, Window::Dimension(0, 1, 1)); - window_k.set(3, Window::Dimension(0, weights->info()->dimension(3), 1)); - - Iterator in(input, window_in); - Iterator out(output, window_out); - Iterator k(weights, window_k); - - // Calculate the max_offset. - // max_offset is the offset for the last NOT valid value in the Z dimension (spatial dimension Y for NHWC) - // |******************| - // | pad_top | - // |******************| - // | | - // | plane0 | - // | batch0 | - // |__________________| - // |******************| Batch 0 - // | pad_bottom | - // | pad_top | - // |******************| - // | | - // | plane1 | - // | batch0 | - // |__________________|-----> max_offset - // |******************| - // | pad_bottom | - // | pad_top | - // |******************| - // | | - // | plane0 | - // | batch1 | - // |__________________| - // |******************| Batch 1 - // | pad_bottom | - // | pad_top | - // |******************| - // | | - // | plane1 | - // | batch1 | - // |__________________| - // | pad_bottom | - // |******************| - const int64_t max_offset = input_stride_z * input_depth - (input->info()->padding().bottom + input->info()->padding().top) * input_stride_y; - execute_window_loop(window_k, [&](const Coordinates & id_k) // loop on the batch size - { - - execute_window_loop(window_out, [&](const Coordinates & id) - { - const auto y_offset = int(id.y() - conv_pad_left) * input_stride_y; - - // Buffer pointers - const scalar_type *in_ptr = reinterpret_cast(input->buffer() + input->info()->offset_first_element_in_bytes() + id[3] * input_stride_w); - const scalar_type *weights_ptr = reinterpret_cast(k.ptr()); - uint8_t *out_ptr = out.ptr() + id_k[3] * output_stride_x; - - // Output elements - vector_type out0 = wrapper::vdup_n(scalar_type(0), tag_type()); - vector_type out1 = wrapper::vdup_n(scalar_type(0), tag_type()); - vector_type out2 = wrapper::vdup_n(scalar_type(0), tag_type()); - vector_type out3 = wrapper::vdup_n(scalar_type(0), tag_type()); - - // Reduce along the feature maps - for(int x = 0; x < input_width; x += num_elems_read_per_iteration) - { - // z == 0 - auto in_z = static_cast(id.z() * conv_stride_y - conv_pad_top); - in_z = std::min(static_cast(in_z), static_cast(input_depth)); - auto offset = y_offset + in_z * input_stride_z; - offset = std::min(offset, max_offset); - convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 0 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3); - - // z == 1 - in_z = static_cast(id.z() * conv_stride_y - conv_pad_top + 1); - in_z = std::min(static_cast(in_z), static_cast(input_depth)); - offset = y_offset + in_z * input_stride_z; - offset = std::min(offset, max_offset); - convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 1 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3); - - // z == 2 - in_z = static_cast(id.z() * conv_stride_y - conv_pad_top + 2); - in_z = std::min(static_cast(in_z), static_cast(input_depth)); - offset = y_offset + in_z * input_stride_z; - offset = std::min(offset, max_offset); - convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 2 * kernel_stride_z + x, 
input_stride_y, kernel_stride_y, out0, out1, out2, out3); - - // z == 3 - in_z = static_cast(id.z() * conv_stride_y - conv_pad_top + 3); - offset = y_offset + in_z * input_stride_z; - offset = std::min(offset, max_offset); - convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 3 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3); - - // z == 4 - in_z = static_cast(id.z() * conv_stride_y - conv_pad_top + 4); - offset = y_offset + in_z * input_stride_z; - convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 4 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3); - - // z == 5 - offset += input_stride_z; - offset = std::min(offset, max_offset); - convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 5 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3); - - // z == 6 - offset += input_stride_z; - offset = std::min(offset, max_offset); - convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 6 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3); - - // z == 7 - offset += input_stride_z; - offset = std::min(offset, max_offset); - convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 7 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3); - - // z == 8 - offset += input_stride_z; - offset = std::min(offset, max_offset); - convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 8 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3); - } - - *(reinterpret_cast(out_ptr + 0 * output_stride_y)) = vreduce(out0); - *(reinterpret_cast(out_ptr + 1 * output_stride_y)) = vreduce(out1); - *(reinterpret_cast(out_ptr + 2 * output_stride_y)) = vreduce(out2); - *(reinterpret_cast(out_ptr + 3 * output_stride_y)) = vreduce(out3); - }, - in, out); - }, - k); - } -}; - template inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) @@ -1169,21 +822,6 @@ inline void convolve_5x5(const Window &window, unsigned int num_elems_read_per_i } } -template -inline void convolve_9x9_nhwc(const Window &window, unsigned int num_elems_read_per_iteration, - const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) -{ - const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - switch(conv_stride_x) - { - case 1: - convolver_9x9_nhwc::convolve(window, num_elems_read_per_iteration, input, weights, output, conv_info); - break; - default: - ARM_COMPUTE_ERROR("Not implemented"); - } -} - Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); @@ -1337,68 +975,248 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } else { - if(kernel_size == 9) - { - border_size.left = 0; - border_size.top = conv_info.pad_left(); + // Configure window NHWC without any padding + win = calculate_max_window(*output, Steps()); + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(ValidRegion(coord, output->tensor_shape())); + } - const int num_elems_read_per_iteration_x = 4; - const int num_elems_written_per_iteration_x = 1; - const int num_elems_read_per_iteration_y = 12; - const int num_elems_written_per_iteration_y = 4; + Status err = (window_changed) 
? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
-            num_elems_read_per_iteration    = num_elems_read_per_iteration_x;
-            num_elems_written_per_iteration = num_elems_written_per_iteration_x;
+bool have_zero_x_internal_padding(ITensorInfo *input, ITensorInfo *weights)
+{
+    return (input->padding().left == 0 && weights->padding().left == 0 && input->padding().right == 0 && weights->padding().right == 0);
+}
-            border_size.right = num_elems_read_per_iteration_x;
-            if((conv_info.pad_bottom() != 0) || (conv_info.pad_top() != 0))
-            {
-                // If bottom or top padding are set, we need to read num_elems_read_per_iteration_y rows to zero.
-                // Since num_elems_read_per_iteration_y is always greater than conv_info.pad_right() we can set
-                // the bottom padding to num_elems_read_per_iteration_y
-                border_size.bottom = num_elems_read_per_iteration_y;
-            }
-            else if(conv_info.pad_right() != 0)
-            {
-                // Convetional border padding. Fill the bottom paddings so that we can read in batch of num_elems_read_per_iteration_y
-                border_size.bottom = ceil_to_multiple(input->dimension(1) + conv_info.pad_right(), num_elems_read_per_iteration_y) - input->dimension(1);
-            }
-            else
+} // namespace
+
+template <typename T>
+void NEDirectConvolutionLayerKernel::convolve_nhwc_optimized(const Window &window)
+{
+    // This function assumes that input and weights have no padding in the channel dimension
+
+    // Declare useful types
+    using vtype       = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
+    using vector_type = typename vtype::type;
+    using tag_type    = typename vtype::tag_type;
+
+    // Scalar quantities
+    const int element_size   = _input->info()->element_size();
+    const int input_stride_w = _input->info()->strides_in_bytes().y() / element_size;
+    const int input_stride_h = _input->info()->strides_in_bytes().z() / element_size;
+    const int input_stride_n = _input->info()->strides_in_bytes()[3] / element_size;
+    const int input_dim_w    = _input->info()->dimension(1);
+    const int input_dim_h    = _input->info()->dimension(2);
+
+    const int output_stride_c = _output->info()->strides_in_bytes().x();
+
+    const unsigned int kernel_stride_w = _weights->info()->strides_in_bytes().y() / element_size;
+    const unsigned int kernel_stride_h = _weights->info()->strides_in_bytes().z() / element_size;
+    const int          kernel_dim_w    = _weights->info()->dimension(1);
+    const int          kernel_dim_h    = _weights->info()->dimension(2);
+
+    const int conv_pad_top  = _conv_info.pad_top();
+    const int conv_pad_left = _conv_info.pad_left();
+    const int conv_stride_w = std::get<0>(_conv_info.stride());
+    const int conv_stride_h = std::get<1>(_conv_info.stride());
+
+    // Set up the window for the output iterator
+    Window window_out = window;
+    window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    // Set up the window for the weights iterator
+    Window window_w = calculate_max_window(*_weights->info(), Steps());
+    window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
+    window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
+    window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    Iterator out(_output, window_out);
+    Iterator wei(_weights, window_w);
+
+    constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
+    /*
+     * This implementation parallelizes the full WC plane of the input and the weights by
+     * treating them as a series of elements. So, for example, with a 3x3 kernel and
+     * floating point vector operations of 4 elements at a time, the first 3
+     * channel elements of the first row would be taken together with the first
+     * element of the second row. The 9 elements in each single WC weight plane
+     * would then require two 4-element vector operations plus one final single-element operation.
+     *
+     * This works because, when the input vector to multiply with the weights is created,
+     * the exact required elements are loaded in the same order. Therefore the
+     * multiplication works on the correct input/weight elements.
+     */
+    execute_window_loop(window_out, [&](const Coordinates & id)
+    {
+        /*
+         * Here we compute theoretical indexes, which are then validated for both
+         * the inputs and the weights.
+         * As a reminder, this loop takes each output point in NHW; C is handled
+         * in the weights loop.
+         */
+        // Compute the theoretical input starting points
+        const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
+        const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
+        const int in_w_end_t   = in_w_start_t + kernel_dim_w;
+        const int in_h_end_t   = in_h_start_t + kernel_dim_h;
+
+        // Compute the valid initial and ending input points by checking the borders
+        const int in_w_start = std::max(in_w_start_t, 0);
+        const int in_h_start = std::max(in_h_start_t, 0);
+        const int in_w_end   = std::min(in_w_end_t, input_dim_w);
+        const int in_h_end   = std::min(in_h_end_t, input_dim_h);
+
+        // Use the input points to select the valid weight points
+        const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w;
+        const int index_h_start  = in_h_start - in_h_start_t;
+        const int index_wc_end   = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w;
+        const int index_h_end    = kernel_dim_h - (in_h_end_t - in_h_end);
+
+        execute_window_loop(window_w, [&](const Coordinates & id_w)
+        {
+            /*
+             * This loop runs over the weights and goes along N (the batches).
+             * As a reminder, the batches of the weights are translated into the
+             * channels of the output.
+             */
+            const T *in_ptr_row = reinterpret_cast<const T *>(_input->buffer() + _input->info()->offset_first_element_in_bytes())
+                                  + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h;
+            const T *weights_ptr_row = reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h;
+            uint8_t *out_ptr         = out.ptr() + id_w[3] * output_stride_c;
+
+            T out_temp = static_cast<T>(0);
+            for(int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h)
             {
-                // No padding
-                border_size.bottom = 0;
+                const T    *in_ptr_mover = in_ptr_row;
+                int         index_wc     = index_wc_start;
+                vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
+                for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
+                {
+                    const auto src_vec = wrapper::vloadq(in_ptr_mover);
+                    const auto w_vec   = wrapper::vloadq(weights_ptr_row + index_wc);
+                    out_temp_vec       = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+                }
+                out_temp += vreduce(out_temp_vec);
+                for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover)
+                {
+                    const auto src_val = *(in_ptr_mover);
+                    const auto w_val   = *(weights_ptr_row + index_wc);
+                    out_temp += src_val * w_val;
+                }
             }
+            *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+        },
+        wei);
+    },
+    out);
+}
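
The comment above captures the key trick: with no x padding, each WC slice of input and weights is contiguous in memory, so the kernel taps can be consumed as one flat span using a vector body plus a scalar tail. A minimal standalone sketch of that access pattern, using plain NEON intrinsics rather than the library's wrapper:: layer (the helper name flat_dot is hypothetical):

    // Dot product over a contiguous span, 4 floats at a time, mirroring the
    // out_temp_vec / vreduce / leftover-loop structure of the kernel above.
    #include <arm_neon.h>

    float flat_dot(const float *src, const float *wei, int len)
    {
        float32x4_t acc = vdupq_n_f32(0.f);
        int         i   = 0;
        for(; i <= len - 4; i += 4)
        {
            acc = vmlaq_f32(acc, vld1q_f32(src + i), vld1q_f32(wei + i)); // acc += src * wei
        }
        // Horizontal reduction of the four accumulator lanes
        float sum = vgetq_lane_f32(acc, 0) + vgetq_lane_f32(acc, 1) + vgetq_lane_f32(acc, 2) + vgetq_lane_f32(acc, 3);
        for(; i < len; ++i) // leftover elements
        {
            sum += src[i] * wei[i];
        }
        return sum;
    }
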
-            win = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x,
-                                                      num_elems_written_per_iteration_y));
-
-            AccessWindowStatic input_access(input, 0, -border_size.top,
-                                            ceil_to_multiple(input->dimension(0), num_elems_read_per_iteration_x),
-                                            input->dimension(1) + border_size.bottom);
-
-            AccessWindowStatic    weights_access(weights, 0, 0, ceil_to_multiple(weights->dimension(0), num_elems_read_per_iteration_x), weights->dimension(1));
-            AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
-            window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
-            output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-        }
-        else
+template <typename T>
+void NEDirectConvolutionLayerKernel::convolve_nhwc(const Window &window)
+{
+    // Declare useful types
+    using vtype       = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
+    using vector_type = typename vtype::type;
+    using tag_type    = typename vtype::tag_type;
+
+    // Scalar quantities
+    const int element_size   = _input->info()->element_size();
+    const int input_stride_w = _input->info()->strides_in_bytes().y() / element_size;
+    const int input_stride_h = _input->info()->strides_in_bytes().z() / element_size;
+    const int input_stride_n = _input->info()->strides_in_bytes()[3] / element_size;
+    const int input_dim_w    = _input->info()->dimension(1);
+    const int input_dim_h    = _input->info()->dimension(2);
+
+    const int output_stride_c = _output->info()->strides_in_bytes().x();
+
+    const unsigned int kernel_stride_w = _weights->info()->strides_in_bytes().y() / element_size;
+    const unsigned int kernel_stride_h = _weights->info()->strides_in_bytes().z() / element_size;
+    const int          kernel_dim_w    = _weights->info()->dimension(1);
+    const int          kernel_dim_h    = _weights->info()->dimension(2);
+
+    const int conv_pad_top  = _conv_info.pad_top();
+    const int conv_pad_left = _conv_info.pad_left();
+    const int conv_stride_w = std::get<0>(_conv_info.stride());
+    const int conv_stride_h = std::get<1>(_conv_info.stride());
+
+    // Set up the window for the output iterator
+    Window window_out = window;
+    window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    // Set up the window for the weights iterator
+    Window window_w = calculate_max_window(*_weights->info(), Steps());
+    window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
+    window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
+    window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    Iterator out(_output, window_out);
+    Iterator wei(_weights, window_w);
+
+    constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
+
+    execute_window_loop(window_out, [&](const Coordinates & id)
+    {
+        // Compute the theoretical input starting points
+        const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
+        const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
+        const int in_w_end_t   = in_w_start_t + kernel_dim_w;
+        const int in_h_end_t   = in_h_start_t + kernel_dim_h;
+
+        // Compute the valid initial and ending input points by checking the borders
+        const int in_w_start = std::max(in_w_start_t, 0);
+        const int in_h_start = std::max(in_h_start_t, 0);
+        const int in_w_end   = std::min(in_w_end_t, input_dim_w);
+        const int in_h_end   = std::min(in_h_end_t, input_dim_h);
+
+        // Use the input points to select the valid weight points
+        const int wei_w_start = in_w_start - in_w_start_t;
+        const int wei_h_start = in_h_start - in_h_start_t;
+        const int wei_w_end   = kernel_dim_w - (in_w_end_t - in_w_end);
+        const int wei_h_end   = kernel_dim_h - (in_h_end_t - in_h_end);
+
+        const int      index_c_end  = _weights->info()->dimension(0);
+        const T *const in_ptr_start = reinterpret_cast<const T *>(_input->buffer() + _input->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n;
+
+        execute_window_loop(window_w, [&](const Coordinates & id_w)
         {
-            border_size.left   = 0;
-            border_size.top    = conv_info.pad_left();
-            border_size.right  = 0;
-            border_size.bottom = conv_info.pad_right();
-            num_elems_read_per_iteration = 16 / element_size_from_data_type(input->data_type());
-            win = calculate_max_window(*output, Steps());
-
-            AccessWindowRectangle input_access(input, 0, -border_size.top, num_elems_read_per_iteration, kernel_size, 1.f, conv_stride_x);
-            AccessWindowRectangle weights_access(weights, 0, 0, num_elems_read_per_iteration, kernel_size);
-            window_changed = update_window_and_padding(win, input_access, weights_access);
-        }
-    }
+            const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
+            uint8_t       *out_ptr           = out.ptr() + id_w[3] * output_stride_c;
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+            T out_temp = static_cast<T>(0);
+            for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
+            {
+                const T *const in_ptr_row      = in_ptr_start + index_in_h * input_stride_h;
+                const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h;
+                for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w)
+                {
+                    const T    *in_ptr_mover      = in_ptr_row + index_in_w * input_stride_w;
+                    const T    *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
+                    int         index_c           = 0;
+                    vector_type out_temp_vec      = wrapper::vdup_n(static_cast<T>(0), tag_type());
+                    for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration)
+                    {
+                        const auto src_vec = wrapper::vloadq(in_ptr_mover);
+                        const auto w_vec   = wrapper::vloadq(weights_ptr_mover);
+                        out_temp_vec       = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+                    }
+                    out_temp += vreduce(out_temp_vec);
+                    for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover)
+                    {
+                        const auto src_val = *(in_ptr_mover);
+                        const auto w_val   = *(weights_ptr_mover);
+                        out_temp += src_val * w_val;
+                    }
+                }
+            }
+            *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+        },
+        wei);
+    },
+    out);
 }
-} // namespace
 
 NEDirectConvolutionLayerKernel::NEDirectConvolutionLayerKernel()
     : _input(nullptr), _weights(nullptr), _output(nullptr), _conv_info(), _border_size(0), _kernel_size(0), _num_weight_elems_read_per_row(0), _num_elems_read_per_iteration(0),
@@ -1425,7 +1243,14 @@ void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITens
     const unsigned int conv_pad_top    = conv_info.pad_top();
     const unsigned int conv_pad_right  = conv_info.pad_right();
     const unsigned int conv_pad_bottom = conv_info.pad_bottom();
-    _border_size = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left);
+    if(_input->info()->data_layout() == DataLayout::NCHW)
+    {
+        _border_size = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left);
+    }
+    else
+    {
+        _border_size = BorderSize(0);
+    }
 
     // Get convolved dimensions
     TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input->info(), *weights->info(), conv_info);
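
Both new paths share the same border handling: the theoretical input window of each output point is clamped to the tensor, and the clamped amounts select which kernel taps are read, so no implicit input padding is needed. A small self-contained sketch of that index arithmetic (hypothetical values, printing only for illustration):

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        const int input_dim_w = 8, kernel_dim_w = 3, conv_stride_w = 1, conv_pad_left = 1;
        for(int out_x = 0; out_x < 8; ++out_x)
        {
            const int in_w_start_t = out_x * conv_stride_w - conv_pad_left; // may be negative
            const int in_w_end_t   = in_w_start_t + kernel_dim_w;           // may exceed input_dim_w
            const int in_w_start   = std::max(in_w_start_t, 0);
            const int in_w_end     = std::min(in_w_end_t, input_dim_w);
            const int wei_w_start  = in_w_start - in_w_start_t;              // taps skipped at the left border
            const int wei_w_end    = kernel_dim_w - (in_w_end_t - in_w_end); // taps kept before the right border
            std::printf("out_x=%d reads input [%d,%d) with kernel taps [%d,%d)\n", out_x, in_w_start, in_w_end, wei_w_start, wei_w_end);
        }
        return 0;
    }
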
@@ -1536,22 +1361,17 @@ void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo
     }
     else
     {
-        const int kernel_size = _weights->info()->dimension(get_data_layout_dimension_index(_weights->info()->data_layout(), DataLayoutDimension::WIDTH));
-        const int stride_x    = std::get<0>(_conv_info.stride());
-        const int stride_y    = std::get<1>(_conv_info.stride());
-
         switch(_input->info()->data_type())
         {
             case DataType::F32:
             {
-                if(kernel_size == 9 && stride_x == 1 && stride_y == 1)
+                if(have_zero_x_internal_padding(_input->info(), _weights->info()))
                 {
-                    using vtype = wrapper::traits::neon_vector<float, 4>;
-                    convolve_9x9_nhwc<vtype>(window, _num_elems_read_per_iteration, _input, _weights, _output, _conv_info);
+                    convolve_nhwc_optimized<float>(window);
                 }
                 else
                 {
-                    convolver_nhwc<float>::convolve(window, kernel_size, _num_elems_read_per_iteration, _input, _weights, _output, _conv_info);
+                    convolve_nhwc<float>(window);
                 }
                 break;
             }
@@ -1561,3 +1381,4 @@ void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo
         }
     }
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
similarity index 93%
rename from arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
rename to src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
index 4cb9c90a1a..94c97cf521 100644
--- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
 *
  * SPDX-License-Identifier: MIT
 *
@@ -24,7 +24,7 @@
 #ifndef ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H
 #define ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H
 
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
 
 namespace arm_compute
 {
@@ -86,6 +86,14 @@ class NEDirectConvolutionLayerKernel : public INEKernel
     BorderSize border_size() const override;
 
 private:
+    /* Template function for the optimized NHWC convolution */
+    template <typename T>
+    void convolve_nhwc_optimized(const Window &window);
+
+    /* Template function for the generic NHWC convolution */
+    template <typename T>
+    void convolve_nhwc(const Window &window);
+
     const ITensor *_input;
     const ITensor *_weights;
     ITensor       *_output;
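
The output-stage changes that follow replace the has_bias template parameter with a runtime argument, halving the number of template instantiations at the cost of a branch that is cheap once it is hoisted out of the inner loop. A schematic sketch of that trade-off (hypothetical names, not the library's API):

    // Before: compile-time flag, one instantiation per (type, flag) pair.
    template <typename T, bool has_bias>
    void output_stage_fixed(const T *in, const T *bias, T *out, int n)
    {
        for(int i = 0; i < n; ++i)
        {
            out[i] = has_bias ? in[i] + bias[i] : in[i]; // branch folded away at compile time
        }
    }

    // After: runtime flag, half as many instantiations; the predictable
    // branch costs little when tested once per row rather than per element.
    template <typename T>
    void output_stage_runtime(const T *in, const T *bias, T *out, int n, bool has_bias)
    {
        if(has_bias)
        {
            for(int i = 0; i < n; ++i) { out[i] = in[i] + bias[i]; }
        }
        else
        {
            for(int i = 0; i < n; ++i) { out[i] = in[i]; }
        }
    }
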
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index 2814c67f70..de5a88e812 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -21,20 +21,22 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
 */
-#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
+#include "src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/Traits.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstddef>
@@ -86,82 +88,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output,
-                                                        const DirectConvolutionLayerOutputStageKernelInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
-
-    const DataType data_type = input->data_type();
-
-    // Auto-initialize output output if required
-    if(output != nullptr)
-    {
-        // Work out expected output data type
-        const DataType output_dt = (data_type == DataType::S32) ? info.output_data_type : data_type;
-        // Output tensor auto initialization if not yet initialized
-        auto_init_if_empty(*output, input->clone()->set_data_type(output_dt));
-    }
-
-    bool         window_changed                    = false;
-    unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(data_type);
-
-    // Update processed elements when input is S32 (comes from quantization input)
-    if(data_type == DataType::S32)
-    {
-        num_elems_processed_per_iteration = 16;
-    }
-
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-
-    if(output != nullptr && (output->total_size() != 0))
-    {
-        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-        if(bias == nullptr)
-        {
-            window_changed = update_window_and_padding(win, input_access, output_access);
-        }
-        else
-        {
-            AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
-            window_changed = update_window_and_padding(win, input_access, output_access, bias_access);
-        }
-
-        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-    }
-    else
-    {
-        if(bias == nullptr)
-        {
-            window_changed = update_window_and_padding(win, input_access);
-        }
-        else
-        {
-            if(input->data_layout() == DataLayout::NCHW)
-            {
-                AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
-                window_changed = update_window_and_padding(win, input_access, bias_access);
-            }
-            else
-            {
-                AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration);
-                window_changed = update_window_and_padding(win, input_access, bias_access);
-            }
-        }
-
-        input_access.set_valid_region(win, ValidRegion(Coordinates(), input->tensor_shape()));
-    }
-
-    Status err = (window_changed) ?
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} - -template +template typename std::enable_if::value, void>::type output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, bool has_bias) { /** NEON vector tag type. */ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; @@ -171,68 +101,123 @@ output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITe ARM_COMPUTE_UNUSED(result_shift); ARM_COMPUTE_UNUSED(result_offset_after_shift); - Iterator in(input, window); - Iterator out(output, window); - execute_window_loop(window, [&](const Coordinates & id) + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16 / input->info()->element_size(); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(input, win); + Iterator out(output, win); + execute_window_loop(win, [&](const Coordinates & id) { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()); - auto v_in = wrapper::vloadq(in_ptr); + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()) + x; + auto v_in = wrapper::vloadq(in_ptr); + + // Accumulate bias + if(has_bias) + { + const auto vb = wrapper::vdup_n(*reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{}); + v_in = wrapper::vadd(v_in, vb); + } + + const auto out_ptr = reinterpret_cast(out.ptr()) + x; + wrapper::vstore(out_ptr, v_in); + } - // Accumulate bias - if(has_bias) + // Left-overs loop + for(; x < window_end_x; ++x) { - const auto vb = wrapper::vdup_n(*reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{}); - v_in = wrapper::vadd(v_in, vb); + // Get bias and pointer to input + auto s_in = *(reinterpret_cast(in.ptr()) + x); + + // Accumulate bias + if(has_bias) + { + const auto b = *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))); + s_in += b; + } + + *(reinterpret_cast(out.ptr()) + x) = s_in; } - const auto out_ptr = reinterpret_cast(out.ptr()); - wrapper::vstore(out_ptr, v_in); }, in, out); } -template +template typename std::enable_if::value, void>::type output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window, ITensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, bool has_bias) { ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); ARM_COMPUTE_UNUSED(result_shift); ARM_COMPUTE_UNUSED(result_offset_after_shift); Window window_bias = window; + window_bias.set(Window::DimX, Window::Dimension(0, 1, 1)); window_bias.set(Window::DimY, Window::Dimension(0, 0, 0)); window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0)); window_bias.set(3, Window::Dimension(0, 0, 0)); - Iterator in(input, window); + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16 / input->info()->element_size(); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(input, win); Iterator bi(bias, window_bias); - Iterator out(output, 
window); - execute_window_loop(window, [&](const Coordinates &) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()); - auto v_in = wrapper::vloadq(in_ptr); + Iterator out(output, win); - // Accumulate bias - if(has_bias) + execute_window_loop(win, [&](const Coordinates &) + { + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { - const auto bias_ptr = reinterpret_cast(bi.ptr()); - v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr)); + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()); + auto v_in = wrapper::vloadq(in_ptr + x); + + // Accumulate bias + if(has_bias) + { + const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; + v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr)); + } + + const auto out_ptr = reinterpret_cast(out.ptr()); + wrapper::vstore(out_ptr + x, v_in); } - const auto out_ptr = reinterpret_cast(out.ptr()); - wrapper::vstore(out_ptr, v_in); + // Left-overs loop + for(; x < window_end_x; ++x) + { + // Get bias and pointer to input + auto s_in = *(reinterpret_cast(in.ptr()) + x); + // Accumulate bias + if(has_bias) + { + const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; + s_in += *bias_ptr; + } + + const auto out_ptr = reinterpret_cast(out.ptr()); + *(out_ptr + x) = s_in; + } }, in, bi, out); } // Quantized case -template < typename TOut, bool has_bias, typename std::enable_if < std::is_same::value || std::is_same::value, int >::type = 0 > +template < typename TOut, typename std::enable_if < std::is_same::value || std::is_same::value, int >::type = 0 > void output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, bool has_bias) { using VectorType = typename wrapper::traits::neon_bitvector_t; using TagType = typename wrapper::traits::neon_bitvector_tag_t; @@ -242,46 +227,76 @@ void output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window const VectorType min = wrapper::vdup_n(std::numeric_limits::lowest(), TagType{}); const VectorType max = wrapper::vdup_n(std::numeric_limits::max(), TagType{}); - Iterator in(input, window); - Iterator out(output, window); + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16 / input->info()->element_size(); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); - execute_window_loop(window, [&](const Coordinates & id) + Iterator in(input, win); + Iterator out(output, win); + + execute_window_loop(win, [&](const Coordinates & id) { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()); - int32x4x4_t v_in = - { - { - wrapper::vloadq(in_ptr), - wrapper::vloadq(in_ptr + 4), - wrapper::vloadq(in_ptr + 8), - wrapper::vloadq(in_ptr + 12) - } - }; - // Accumulate bias - if(has_bias) + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { - const auto vb = wrapper::vdup_n(*reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))), TagType{}); - v_in = + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()) + x; + int32x4x4_t v_in = { { - wrapper::vadd(v_in.val[0], vb), - wrapper::vadd(v_in.val[1], vb), - wrapper::vadd(v_in.val[2], vb), - wrapper::vadd(v_in.val[3], vb) + wrapper::vloadq(in_ptr), + wrapper::vloadq(in_ptr + 4), + 
wrapper::vloadq(in_ptr + 8), + wrapper::vloadq(in_ptr + 12) } }; + + // Accumulate bias + if(has_bias) + { + const auto vb = wrapper::vdup_n(*reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))), TagType{}); + v_in = + { + { + wrapper::vadd(v_in.val[0], vb), + wrapper::vadd(v_in.val[1], vb), + wrapper::vadd(v_in.val[2], vb), + wrapper::vadd(v_in.val[3], vb) + } + }; + } + + const auto out_ptr = reinterpret_cast(out.ptr()) + x; + wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, + min, max, false)); } - const auto out_ptr = reinterpret_cast(out.ptr()); - wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false)); + // Left-overs loop + for(; x < window_end_x; ++x) + { + // Get bias and pointer to input + int32_t s_in = *(reinterpret_cast(in.ptr()) + x); + + // Accumulate bias + if(has_bias) + { + const auto b = *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))); + s_in += b; + } + + const auto out_ptr = reinterpret_cast(out.ptr()) + x; + *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, + std::numeric_limits::lowest(), std::numeric_limits::max(), false); + } }, in, out); } -template < typename TOut, bool has_bias, typename std::enable_if < std::is_same::value || std::is_same::value, int >::type = 0 > +template < typename TOut, typename std::enable_if < std::is_same::value || std::is_same::value, int >::type = 0 > void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window, ITensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, bool has_bias) { using VectorType = typename wrapper::traits::neon_bitvector_t; using TagType = typename wrapper::traits::neon_bitvector_tag_t; @@ -292,41 +307,71 @@ void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window const VectorType max = wrapper::vdup_n(std::numeric_limits::max(), TagType{}); Window window_bias = window; + window_bias.set(Window::DimX, Window::Dimension(0, 1, 1)); window_bias.set(Window::DimY, Window::Dimension(0, 0, 0)); window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0)); window_bias.set(3, Window::Dimension(0, 0, 0)); - Iterator in(input, window); + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16 / input->info()->element_size(); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(input, win); Iterator bi(bias, window_bias); + Iterator out(output, win); - Iterator out(output, window); - execute_window_loop(window, [&](const Coordinates &) + execute_window_loop(win, [&](const Coordinates &) { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()); - int32x4x4_t v_in = + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()) + x; + int32x4x4_t v_in = { - wrapper::vloadq(in_ptr), - wrapper::vloadq(in_ptr + 4), - wrapper::vloadq(in_ptr + 8), - wrapper::vloadq(in_ptr + 12), + { + wrapper::vloadq(in_ptr), + wrapper::vloadq(in_ptr + 4), + wrapper::vloadq(in_ptr + 8), + wrapper::vloadq(in_ptr + 12), + } + }; + + // Accumulate bias + if(has_bias) + { + const auto bias_ptr = 
reinterpret_cast<int32_t *>(bi.ptr()) + x;
+
+                v_in.val[0] = wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr));
+                v_in.val[1] = wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4));
+                v_in.val[2] = wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8));
+                v_in.val[3] = wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12));
             }
-    };
-    // Accumulate bias
-    if(has_bias)
+            const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+            wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false));
+        }
+
+        // Left-overs loop
+        for(; x < window_end_x; ++x)
         {
-        const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr());
+            // Get bias and pointer to input
+            const auto in_ptr = reinterpret_cast<const int32_t *>(in.ptr()) + x;
+            int32_t    s_in   = *in_ptr;
 
-        wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr));
-        wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4));
-        wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8));
-        wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12));
-    }
+            // Accumulate bias
+            if(has_bias)
+            {
+                const auto bias_ptr = reinterpret_cast<const int32_t *>(bi.ptr()) + x;
+                s_in += *bias_ptr;
+            }
 
-    const auto out_ptr = reinterpret_cast<TOut *>(out.ptr());
-    wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false));
+            const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+            *out_ptr           = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
+                                                       std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
+        }
     },
     in, bi, out);
 }
@@ -352,12 +397,30 @@ void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const
     _result_shift              = info.result_shift;
     _result_offset_after_shift = info.result_offset_after_shift;
 
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), (bias == nullptr) ? nullptr : bias->info(), (output == nullptr) ? nullptr : output->info(), info);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    INEKernel::configure(win_config.second);
+    // Auto-initialize output if required
+    if(output != nullptr && output->info() != nullptr)
+    {
+        // Work out expected output data type
+        const DataType output_dt = (input->info()->data_type() == DataType::S32) ? info.output_data_type : DataType::S32;
+        // Output tensor auto initialization if not yet initialized
+        auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_dt));
+    }
+
+    Window      win = calculate_max_window(*input->info(), Steps());
+    Coordinates coord;
+    coord.set_num_dimensions(input->info()->num_dimensions());
+
+    if(output != nullptr && (output->info()->total_size() != 0))
+    {
+        output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+    }
+    else
+    {
+        input->info()->set_valid_region(ValidRegion(coord, input->info()->tensor_shape()));
+    }
+
+    INEKernel::configure(win);
 
-    const bool has_bias          = bias != nullptr;
     const bool is_qasymm8_signed = (output != nullptr) ? is_data_type_quantized_asymmetric_signed(output->info()->data_type()) : false;
 
     // Set appropriate function
@@ -369,24 +432,24 @@ void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const
         {
             if(is_qasymm8_signed)
             {
-                _func = (has_bias) ?
&output_stage_nchw : &output_stage_nchw; + _func = &output_stage_nchw; break; } #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::F32: { - _func = (has_bias) ? &output_stage_nchw : &output_stage_nchw; + _func = &output_stage_nchw; break; } default: @@ -403,24 +466,24 @@ void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const { if(is_qasymm8_signed) { - _func = (has_bias) ? &output_stage_nhwc : &output_stage_nhwc; + _func = &output_stage_nhwc; } else { - _func = (has_bias) ? &output_stage_nhwc : &output_stage_nhwc; + _func = &output_stage_nhwc; } break; } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: { - _func = (has_bias) ? &output_stage_nhwc : &output_stage_nhwc; + _func = &output_stage_nhwc; break; } #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::F32: { - _func = (has_bias) ? &output_stage_nhwc : &output_stage_nhwc; + _func = &output_stage_nhwc; break; } default: @@ -435,11 +498,6 @@ Status NEDirectConvolutionLayerOutputStageKernel::validate(const ITensorInfo *in const DirectConvolutionLayerOutputStageKernelInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), - bias == nullptr ? nullptr : bias->clone().get(), - output == nullptr ? nullptr : output->clone().get(), - info) - .first); return Status{}; } @@ -451,6 +509,7 @@ void NEDirectConvolutionLayerOutputStageKernel::run(const Window &window, const ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); ARM_COMPUTE_ERROR_ON(_func == nullptr); - (*_func)(_input, _bias, window, _output, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift); + const bool has_bias = _bias != nullptr; + (*_func)(_input, _bias, window, _output, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, has_bias); } } // namespace arm_compute diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h rename to src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h index 165f5bd133..b1b88103bf 100644 --- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h @@ -25,7 +25,7 @@ #define ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYEROUTPUTSTAGEKERNEL_H #include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -87,7 +87,7 @@ class NEDirectConvolutionLayerOutputStageKernel : public INEKernel private: using OutputStageKernel = void(ITensor *input, const ITensor *bias, const Window &window, ITensor *output, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift); + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, bool has_bias); private: OutputStageKernel *_func; diff --git a/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp b/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp index 014a564bf1..412ae247cb 100644 --- a/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp +++ b/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp @@ -21,14 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h" +#include "src/core/NEON/kernels/NEElementwiseOperationKernel.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include @@ -142,6 +144,14 @@ inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const Scalar case ArithmeticOperation::DIV: { res = a / b; + if(std::is_integral::value) + { + res = (b == 0) ? 0 : res; + if(static_cast(a) % static_cast(b) != 0 && ((a < 0) != (b < 0))) + { + --res; + } + } break; } case ArithmeticOperation::POWER: @@ -207,6 +217,12 @@ inline typename VectorType::type elementwise_arithm_op(const typename VectorType return res; } +template <> +inline int32x4_t elementwise_arithm_op>(const int32x4_t &a, const int32x4_t &b) +{ + return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b)))); +} + template <> inline float32x4_t elementwise_arithm_op>(const float32x4_t &a, const float32x4_t &b) { @@ -445,6 +461,21 @@ inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_star return x; } +template +inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int window_step_x, + const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + const auto res = elementwise_comp_op(a, b); + wrapper::vstore(output_ptr + x, res); + } + return x; +} + template inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int window_step_x, const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) @@ -525,6 +556,19 @@ inline int elementwise_comp_op_quantized_signed_loop(int window_start_x, int win return x; } +template +inline int elementwise_comp_op_broadcast_8_loop(int window_start_x, int window_end_x, int window_step_x, + const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = elementwise_comp_op_broadcast(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); + wrapper::vstore(output_ptr + x, a); + } + return x; +} + template inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, int window_end_x, int window_step_x, const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) @@ -612,7 +656,7 @@ void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const const int window_step_x = std::min(16 / static_cast(sizeof(OutputScalarType)), 8); const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + const bool 
is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); if(is_broadcast_across_x) { @@ -691,7 +735,7 @@ void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *o const int window_step_x = 16; const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); @@ -799,7 +843,7 @@ void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor *in2, I const int window_step_x = 16; const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); @@ -906,7 +950,7 @@ void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITe const int window_step_x = 16; const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); @@ -994,6 +1038,15 @@ void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITe } } +template +void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_op(in1, in2, out, window, + &elementwise_comp_op_scalar, + &elementwise_comp_op_broadcast_8_loop, + &elementwise_comp_op_8_loop); +} + template void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { @@ -1101,6 +1154,7 @@ configure_comp_func(const ITensorInfo *input1, const ITensorInfo *input2, ITenso { static std::map map_function = { + { "op_U8_U8_U8", &elementwise_comp_op_8 }, { "op_F32_F32_U8", &elementwise_comp_op_32 }, { "op_S16_S16_U8", &elementwise_comp_op_16 }, { "op_S32_S32_U8", &elementwise_comp_op_32 }, @@ -1122,7 +1176,6 @@ NEElementwiseOperationKernel::NEElementwiseOperationKernel() Status NEElementwiseOperationKernel::validate_arguments_common(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2); @@ -1194,6 +1247,7 @@ void NEArithmeticOperationKernel::configure(ArithmeticOperation op, const ITenso Status NEArithmeticOperationKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32); // Validate in case of configured output 
if(output.total_size() > 0) { @@ -1221,7 +1275,7 @@ void NEDivisionOperationKernel::configure(const ITensorInfo *input1, const ITens Status NEDivisionOperationKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::S32, DataType::F16, DataType::F32); return NEArithmeticOperationKernel::validate_arguments(input1, input2, output); } @@ -1285,6 +1339,7 @@ void NEComparisonOperationKernel::configure(ComparisonOperation op, const ITenso Status NEComparisonOperationKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32); // Validate in case of configured output if(output.total_size() > 0) { diff --git a/arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h b/src/core/NEON/kernels/NEElementwiseOperationKernel.h similarity index 93% rename from arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h rename to src/core/NEON/kernels/NEElementwiseOperationKernel.h index 47b8c3b7c8..b0037d357f 100644 --- a/arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h +++ b/src/core/NEON/kernels/NEElementwiseOperationKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_NEELEMENTWISEOPERATIONKERNEL_H #define ARM_COMPUTE_NEELEMENTWISEOPERATIONKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -97,7 +97,7 @@ class NEArithmeticOperationKernel : public NEElementwiseOperationKernel /** Default constructor */ NEArithmeticOperationKernel() = default; - /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel + /** Configure kernel * * @param[in] op Arithmetic operation to be executed. * @param[in] input1 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32. @@ -128,17 +128,17 @@ class NEDivisionOperationKernel : public NEArithmeticOperationKernel /** Default constructor */ NEDivisionOperationKernel() = default; - /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel + /** Configure kernel * - * @param[in] input1 First tensor input info. Data types supported: F16/F32. + * @param[in] input1 First tensor input info. Data types supported: S32/F16/F32. * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. * @param[out] output Output tensor info. Data types supported: Same as @p input1. */ void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output); - /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel + /** Static function to check if given info will lead to a valid configuration of @ref NEDivisionOperationKernel * - * @param[in] input1 First tensor input info. Data types supported: F16/F32. + * @param[in] input1 First tensor input info. Data types supported: S32/F16/F32. * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. * @param[in] output Output tensor info. Data types supported: Same as @p input1. 
* @@ -157,7 +157,7 @@ class NEPowerOperationKernel : public NEArithmeticOperationKernel /** Default constructor */ NEPowerOperationKernel() = default; - /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel + /** Configure kernel * * @param[in] input1 First tensor input info. Data types supported: F16/F32. * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. @@ -165,7 +165,7 @@ class NEPowerOperationKernel : public NEArithmeticOperationKernel */ void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output); - /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel + /** Static function to check if given info will lead to a valid configuration of @ref NEPowerOperationKernel * * @param[in] input1 First tensor input info. Data types supported: F16/F32. * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. @@ -186,7 +186,7 @@ class NEComparisonOperationKernel : public NEElementwiseOperationKernel /** Default constructor */ NEComparisonOperationKernel() = default; - /** Static function to check if given info will lead to a valid configuration of @ref NEComparisonOperationKernel + /** Configure kernel * * @param[in] op Comparison operation to be executed. * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. diff --git a/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp b/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp index 747bd41dc0..d899643fdc 100644 --- a/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp +++ b/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp @@ -21,14 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h" +#include "src/core/NEON/kernels/NEElementwiseUnaryKernel.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Validate.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/ToolchainSupport.h" namespace arm_compute diff --git a/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h b/src/core/NEON/kernels/NEElementwiseUnaryKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h rename to src/core/NEON/kernels/NEElementwiseUnaryKernel.h index 7f9d7ad114..fcf0aa51c5 100644 --- a/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h +++ b/src/core/NEON/kernels/NEElementwiseUnaryKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_NEELEMENTWISEUNARYKERNEL_H #define ARM_COMPUTE_NEELEMENTWISEUNARYKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEErodeKernel.cpp b/src/core/NEON/kernels/NEErodeKernel.cpp index 4b93c3b4d1..171a6c828f 100644 --- a/src/core/NEON/kernels/NEErodeKernel.cpp +++ b/src/core/NEON/kernels/NEErodeKernel.cpp @@ -21,13 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEErodeKernel.h" +#include "src/core/NEON/kernels/NEErodeKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include diff --git a/arm_compute/core/NEON/kernels/NEErodeKernel.h b/src/core/NEON/kernels/NEErodeKernel.h similarity index 74% rename from arm_compute/core/NEON/kernels/NEErodeKernel.h rename to src/core/NEON/kernels/NEErodeKernel.h index 140481df17..54f286780b 100644 --- a/arm_compute/core/NEON/kernels/NEErodeKernel.h +++ b/src/core/NEON/kernels/NEErodeKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEERODEKERNEL_H #define ARM_COMPUTE_NEERODEKERNEL_H -#include "arm_compute/core/NEON/INESimpleKernel.h" +#include "src/core/NEON/INESimpleKernel.h" namespace arm_compute { @@ -38,6 +38,18 @@ class NEErodeKernel : public INESimpleKernel { return "NEErodeKernel"; } + /** Default constructor */ + NEErodeKernel() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEErodeKernel(const NEErodeKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEErodeKernel &operator=(const NEErodeKernel &) = delete; + /** Allow instances of this class to be moved */ + NEErodeKernel(NEErodeKernel &&) = default; + /** Allow instances of this class to be moved */ + NEErodeKernel &operator=(NEErodeKernel &&) = default; + /** Default destructor */ + ~NEErodeKernel() = default; /** Set the source, destination and border mode of the kernel * * @param[in] input Source tensor. Data type supported: U8 diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp index d5b20d278d..200ee6bf88 100644 --- a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp +++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,13 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h" +#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include diff --git a/arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h b/src/core/NEON/kernels/NEFFTDigitReverseKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h rename to src/core/NEON/kernels/NEFFTDigitReverseKernel.h index f7dc0b1d16..f436c364b2 100644 --- a/arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h +++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -25,7 +25,7 @@ #define ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H #include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp index c041b4c56a..cb1391ab4e 100644 --- a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp +++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,22 +21,23 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h" +#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/wrapper/traits.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include #include #include -#include "arm_compute/core/NEON/wrapper/traits.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" - namespace arm_compute { namespace diff --git a/arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h b/src/core/NEON/kernels/NEFFTRadixStageKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h rename to src/core/NEON/kernels/NEFFTRadixStageKernel.h index 15663e7490..8a695b790f 100644 --- a/arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h +++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,7 +25,7 @@ #define ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H #include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include #include diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.cpp b/src/core/NEON/kernels/NEFFTScaleKernel.cpp index ea2831f88d..6dc5541e94 100644 --- a/src/core/NEON/kernels/NEFFTScaleKernel.cpp +++ b/src/core/NEON/kernels/NEFFTScaleKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,14 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEFFTScaleKernel.h" +#include "src/core/NEON/kernels/NEFFTScaleKernel.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include diff --git a/arm_compute/core/NEON/kernels/NEFFTScaleKernel.h b/src/core/NEON/kernels/NEFFTScaleKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NEFFTScaleKernel.h rename to src/core/NEON/kernels/NEFFTScaleKernel.h index c25ba323ab..24a19f98ba 100644 --- a/arm_compute/core/NEON/kernels/NEFFTScaleKernel.h +++ b/src/core/NEON/kernels/NEFFTScaleKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEFFTSCALEKERNEL_H #define ARM_COMPUTE_NEFFTSCALEKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include "arm_compute/core/KernelDescriptors.h" diff --git a/src/core/NEON/kernels/NEFastCornersKernel.cpp b/src/core/NEON/kernels/NEFastCornersKernel.cpp index 7b1d81e12c..c9280d8dc0 100644 --- a/src/core/NEON/kernels/NEFastCornersKernel.cpp +++ b/src/core/NEON/kernels/NEFastCornersKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,12 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h" +#include "src/core/NEON/kernels/NEFastCornersKernel.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEFastCornersKernel.h b/src/core/NEON/kernels/NEFastCornersKernel.h similarity index 95% rename from arm_compute/core/NEON/kernels/NEFastCornersKernel.h rename to src/core/NEON/kernels/NEFastCornersKernel.h index e4e87c032f..a4086afb0c 100644 --- a/arm_compute/core/NEON/kernels/NEFastCornersKernel.h +++ b/src/core/NEON/kernels/NEFastCornersKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_NEFASTCORNERSKERNEL_H #define ARM_COMPUTE_NEFASTCORNERSKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" #include @@ -52,6 +52,8 @@ class NEFastCornersKernel : public INEKernel NEFastCornersKernel(NEFastCornersKernel &&) = default; /** Allow instances of this class to be moved */ NEFastCornersKernel &operator=(NEFastCornersKernel &&) = default; + /** Default destructor */ + ~NEFastCornersKernel() = default; /** Initialise the kernel. * * @param[in] input Source image. Data type supported: U8. 
diff --git a/src/core/NEON/kernels/NEFillArrayKernel.cpp b/src/core/NEON/kernels/NEFillArrayKernel.cpp index 6b22dadd08..e8ae926fbf 100644 --- a/src/core/NEON/kernels/NEFillArrayKernel.cpp +++ b/src/core/NEON/kernels/NEFillArrayKernel.cpp @@ -21,13 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h" +#include "src/core/NEON/kernels/NEFillArrayKernel.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/WindowHelpers.h" using namespace arm_compute; diff --git a/arm_compute/core/NEON/kernels/NEFillArrayKernel.h b/src/core/NEON/kernels/NEFillArrayKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NEFillArrayKernel.h rename to src/core/NEON/kernels/NEFillArrayKernel.h index 99df8795ae..c9841679d1 100644 --- a/arm_compute/core/NEON/kernels/NEFillArrayKernel.h +++ b/src/core/NEON/kernels/NEFillArrayKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,8 +25,8 @@ #define ARM_COMPUTE_NEFILLARRAYKERNEL_H #include "arm_compute/core/IArray.h" -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" #include diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp index dbaec83d04..488079062b 100644 --- a/src/core/NEON/kernels/NEFillBorderKernel.cpp +++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -30,6 +30,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h b/src/core/NEON/kernels/NEFillBorderKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NEFillBorderKernel.h rename to src/core/NEON/kernels/NEFillBorderKernel.h index 071843d114..65908bebee 100644 --- a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h +++ b/src/core/NEON/kernels/NEFillBorderKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,9 +24,9 @@ #ifndef ARM_COMPUTE_NEFILLBORDERKERNEL_H #define ARM_COMPUTE_NEFILLBORDERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEFlattenLayerKernel.cpp b/src/core/NEON/kernels/NEFlattenLayerKernel.cpp index 35ebc5b70b..8c0dc10ee8 100644 --- a/src/core/NEON/kernels/NEFlattenLayerKernel.cpp +++ b/src/core/NEON/kernels/NEFlattenLayerKernel.cpp @@ -21,15 +21,17 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h" +#include "src/core/NEON/kernels/NEFlattenLayerKernel.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" diff --git a/arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h b/src/core/NEON/kernels/NEFlattenLayerKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h rename to src/core/NEON/kernels/NEFlattenLayerKernel.h index dbd24129f1..5fd5f436b2 100644 --- a/arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h +++ b/src/core/NEON/kernels/NEFlattenLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEFLATTENLAYERKERNEL_H #define ARM_COMPUTE_NEFLATTENLAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEFloorKernel.cpp b/src/core/NEON/kernels/NEFloorKernel.cpp index f0781341d5..2750acdda7 100644 --- a/src/core/NEON/kernels/NEFloorKernel.cpp +++ b/src/core/NEON/kernels/NEFloorKernel.cpp @@ -21,28 +21,70 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEFloorKernel.h" +#include "src/core/NEON/kernels/NEFloorKernel.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/NEON/NEMath.h" #include "arm_compute/core/Validate.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" -#include <arm_neon.h> +#include "src/core/NEON/kernels/floor/impl/list.h" +#include "src/core/common/Registrars.h" namespace arm_compute { namespace { +struct FloorSelectorData +{ + DataType dt; +}; +using FloorSelectorPtr = std::add_pointer<bool(const FloorSelectorData &data)>::type; +using FloorUKernelPtr = std::add_pointer<void(const void *, void *, int)>::type; + +struct FloorKernel +{ + const char *name; + const FloorSelectorPtr is_selected; + FloorUKernelPtr ukernel; +}; + +static const FloorKernel available_kernels[] = +{ + { + "fp16_neon_floor", + [](const FloorSelectorData & data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_floor) + }, + { + "f32_neon_floor", + [](const FloorSelectorData & data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_floor) + }, +}; + +const FloorKernel *get_implementation(const FloorSelectorData &data) +{ + for(const auto &uk : available_kernels) + { + if(uk.is_selected(data)) + { + return &uk; + } + } + return nullptr; +} + Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + + const auto *uk = get_implementation(FloorSelectorData{ input->data_type() }); + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); // Validate in case of configured output if(output->total_size() > 0) @@ -90,66 +132,19 @@ void NEFloorKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - const DataType data_type = _input->info()->data_type(); - - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - const int window_step_x = 16 / _input->info()->element_size(); - Window win{ window }; win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const auto len = static_cast<int>(window.x().end()) - static_cast<int>(window.x().start()); + const auto *uk = get_implementation(FloorSelectorData{ _input->info()->data_type() }); + Iterator input(_input, win); Iterator output(_output, win); - if(data_type == DataType::F32) + execute_window_loop(win, [&](const Coordinates &) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<const float *>(input.ptr()); - const auto output_ptr = reinterpret_cast<float *>(output.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4_t res = vfloorq_f32(vld1q_f32(input_ptr + x)); - vst1q_f32(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(output_ptr + x) = std::floor(*(input_ptr + x)); - } - }, - input, output); - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - else if(data_type ==
DataType::F16) - { - execute_window_loop(win, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8_t res = vfloorq_f16(vld1q_f16(input_ptr + x)); - vst1q_f16(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(output_ptr + x) = std::floor(*(input_ptr + x)); - } - }, - input, output); - } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - else - { - ARM_COMPUTE_ERROR("Invalid data type!"); - } + uk->ukernel(input.ptr(), output.ptr(), len); + }, + input, output); } } // namespace arm_compute diff --git a/arm_compute/core/NEON/kernels/NEFloorKernel.h b/src/core/NEON/kernels/NEFloorKernel.h similarity index 75% rename from arm_compute/core/NEON/kernels/NEFloorKernel.h rename to src/core/NEON/kernels/NEFloorKernel.h index 255b0d4fb9..99c016bac5 100644 --- a/arm_compute/core/NEON/kernels/NEFloorKernel.h +++ b/src/core/NEON/kernels/NEFloorKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEFLOORKERNEL_H #define ARM_COMPUTE_NEFLOORKERNEL_H -#include "arm_compute/core/NEON/INESimpleKernel.h" +#include "src/core/NEON/INESimpleKernel.h" namespace arm_compute { @@ -38,6 +38,18 @@ class NEFloorKernel : public INESimpleKernel { return "NEFloorKernel"; } + /** Constructor */ + NEFloorKernel() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFloorKernel(const NEFloorKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFloorKernel &operator=(const NEFloorKernel &) = delete; + /** Allow instances of this class to be moved */ + NEFloorKernel(NEFloorKernel &&) = default; + /** Allow instances of this class to be moved */ + NEFloorKernel &operator=(NEFloorKernel &&) = default; + /** Default destructor */ + ~NEFloorKernel() = default; /** Set the source, destination of the kernel * * @param[in] input Source tensor. Data type supported: F16/F32. diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp index 282b1a6f4d..99f830fe06 100644 --- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp +++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp @@ -21,16 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
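// --- Illustrative aside (not part of the patch; stand-alone, hypothetical names) ---
// The NEFloorKernel rewrite above replaces a hard-coded if/else over data types
// with a table of micro-kernels plus a selector. Reduced to its essentials, the
// mechanism looks like this:
#include <cmath>

enum class DT { F16, F32 };

struct SelectorData { DT dt; };

struct Ukernel
{
    const char *name;
    bool (*is_selected)(const SelectorData &);
    void (*run)(const float *src, float *dst, int len);
};

void fp32_floor(const float *src, float *dst, int len)
{
    for(int i = 0; i < len; ++i) { dst[i] = std::floor(src[i]); }
}

static const Ukernel available_ukernels[] =
{
    { "fp32_floor", [](const SelectorData &d) { return d.dt == DT::F32; }, fp32_floor },
};

// validate_arguments() turns a nullptr result into an error Status; run()
// assumes validation already succeeded and dispatches unconditionally.
const Ukernel *select_ukernel(const SelectorData &d)
{
    for(const auto &uk : available_ukernels)
    {
        if(uk.is_selected(d)) { return &uk; }
    }
    return nullptr;
}
// --- end aside ---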
*/ -#include "arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h" +#include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include diff --git a/arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h rename to src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h index ecb17f87a2..ee767b01c8 100644 --- a/arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h +++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEFUSEBATCHNORMALIZATIONKERNEL_H #define ARM_COMPUTE_NEFUSEBATCHNORMALIZATIONKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp index 3d178316c6..5d178ea85b 100644 --- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp +++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,16 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" +#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include @@ -61,113 +63,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) return Status{}; } - -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) -{ - unsigned int num_elems_processed_per_iteration_x = (input->element_size() == 1) ? 
8 : 4; - constexpr unsigned int num_elems_processed_per_iteration_y = 4; - bool window_changed = false; - - // Configure kernel window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - window_changed = window_changed || update_window_and_padding(win, input_access); - - // Configure window in case of configured output - if(output->total_size() != 0) - { - AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, 4.0f, 0.25f); - window_changed = window_changed || update_window_and_padding(win, output_access); - output_access.set_valid_region(win, input->valid_region()); - } - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} - -void gemm_interleave_8bit_elements(const ITensor *input, ITensor *output, const Window &window) -{ - const size_t in_stride = input->info()->strides_in_bytes()[1]; - - // Set window for output tensor - Window win_out(window); - win_out.scale(Window::DimY, 0.25f); - Iterator in(input, window); - - win_out.set_dimension_step(Window::DimX, 32); - Iterator out(output, win_out); - - execute_window_loop(window, [&](const Coordinates &) - { - const uint8x8x4_t data = - { - { - vld1_u8(in.ptr() + 0 * in_stride), - vld1_u8(in.ptr() + 1 * in_stride), - vld1_u8(in.ptr() + 2 * in_stride), - vld1_u8(in.ptr() + 3 * in_stride), - } - }; - vst4_u8(out.ptr(), data); - }, - in, out); -} - -void gemm_interleave_16bit_elements(const ITensor *input, ITensor *output, const Window &window) -{ - const size_t in_stride = input->info()->strides_in_bytes()[1]; - - // Set window for output tensor - Window win_out(window); - win_out.scale(Window::DimY, 0.25f); - Iterator in(input, window); - - win_out.set_dimension_step(Window::DimX, 16); - Iterator out(output, win_out); - - execute_window_loop(window, [&](const Coordinates &) - { - const uint16x4x4_t data = - { - { - vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 0 * in_stride)), - vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 1 * in_stride)), - vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 2 * in_stride)), - vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 3 * in_stride)), - } - }; - vst4_u16(reinterpret_cast<uint16_t *>(out.ptr()), data); - }, - in, out); -} - -void gemm_interleave_32bit_elements(const ITensor *input, ITensor *output, const Window &window) -{ - const size_t in_stride = input->info()->strides_in_bytes()[1]; - - // Set window for output tensor - Window win_out(window); - win_out.scale(Window::DimY, 0.25f); - Iterator in(input, window); - - win_out.set_dimension_step(Window::DimX, 16); - Iterator out(output, win_out); - - execute_window_loop(window, [&](const Coordinates &) - { - const uint32x4x4_t data = - { - { - vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 0 * in_stride)), - vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 1 * in_stride)), - vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 2 * in_stride)), - vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 3 * in_stride)) - } - }; - vst4q_u32(reinterpret_cast<uint32_t *>(out.ptr()), data); - }, - in, out); -} } // namespace NEGEMMInterleave4x4Kernel::NEGEMMInterleave4x4Kernel() @@ -191,33 +86,92 @@ void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output) switch(input->info()->element_size()) { case 1: - _func = &gemm_interleave_8bit_elements; + _func =
&NEGEMMInterleave4x4Kernel::gemm_interleave4x4<uint8_t>; break; case 2: - _func = &gemm_interleave_16bit_elements; + _func = &NEGEMMInterleave4x4Kernel::gemm_interleave4x4<uint16_t>; break; case 4: - _func = &gemm_interleave_32bit_elements; + _func = &NEGEMMInterleave4x4Kernel::gemm_interleave4x4<uint32_t>; break; default: ARM_COMPUTE_ERROR_ON("Element size not supported"); break; } - // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); + Window win = calculate_max_window(*input->info(), Steps(1, 4)); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + INEKernel::configure(win); } Status NEGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first); return Status{}; } +template <typename ScalarType> +void NEGEMMInterleave4x4Kernel::gemm_interleave4x4(const ITensor *input, ITensor *output, const Window &window) +{ + const size_t window_start_x = window.x().start(); + const size_t window_end_x = window.x().end(); + + const size_t in_height = input->info()->dimension(1); + const size_t in_stride = input->info()->strides_in_bytes()[1]; + + const size_t partial_y = in_height % 4; + + // Set window for the input tensor + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Set window for the output tensor + Window win_out(window); + win_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_out.scale(Window::DimY, 0.25f); + + Iterator in(input, win); + Iterator out(output, win_out); + + execute_window_loop(win, [&](const Coordinates & id) + { + if(id.y() + 4 <= static_cast<int>(in_height)) + { + for(size_t x = window_start_x; x < window_end_x; ++x) + { + const ScalarType data[4] = + { + *(reinterpret_cast<const ScalarType *>(in.ptr() + 0 * in_stride) + x), + *(reinterpret_cast<const ScalarType *>(in.ptr() + 1 * in_stride) + x), + *(reinterpret_cast<const ScalarType *>(in.ptr() + 2 * in_stride) + x), + *(reinterpret_cast<const ScalarType *>(in.ptr() + 3 * in_stride) + x), + }; + std::memcpy(out.ptr() + x * 4 * sizeof(ScalarType), data, 4 * sizeof(ScalarType)); + } + } + else + { + for(size_t x = window_start_x; x < window_end_x; ++x) + { + ScalarType data[4] = { 0, 0, 0, 0 }; + + for(size_t y = 0; y < partial_y; ++y) + { + data[y] = *(reinterpret_cast<const ScalarType *>(in.ptr() + y * in_stride) + x); + } + + std::memcpy(out.ptr() + x * 4 * sizeof(ScalarType), data, 4 * sizeof(ScalarType)); + } + } + }, + in, out); +} + void NEGEMMInterleave4x4Kernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); @@ -233,5 +187,5 @@ void NEGEMMInterleave4x4Kernel::run(const Window &window, const ThreadInfo &info * * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ] */ - (*_func)(_input, _output, window); + (this->*_func)(_input, _output, window); } diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h similarity index 62% rename from arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h rename to src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h @@ -1,5
+1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H #define ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H -#include "arm_compute/core/NEON/INESimpleKernel.h" +#include "src/core/NEON/INESimpleKernel.h" namespace arm_compute { @@ -56,8 +56,18 @@ class NEGEMMInterleave4x4Kernel : public INESimpleKernel { return "NEGEMMInterleave4x4Kernel"; } - /* Constructor */ + /** Constructor */ NEGEMMInterleave4x4Kernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMInterleave4x4Kernel(const NEGEMMInterleave4x4Kernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMInterleave4x4Kernel &operator=(const NEGEMMInterleave4x4Kernel &) = delete; + /** Allow instances of this class to be moved */ + NEGEMMInterleave4x4Kernel(NEGEMMInterleave4x4Kernel &&) = default; + /** Allow instances of this class to be moved */ + NEGEMMInterleave4x4Kernel &operator=(NEGEMMInterleave4x4Kernel &&) = default; + /** Default destructor */ + ~NEGEMMInterleave4x4Kernel() = default; /** Initialise the kernel's input and output. * * @param[in] input Input tensor. Data types supported: All @@ -77,15 +87,26 @@ class NEGEMMInterleave4x4Kernel : public INESimpleKernel void run(const Window &window, const ThreadInfo &info) override; private: - /** Common signature for all the transpose functions + /** Template function to run gemm interleave 4x4 * - * @param[in] input An input tensor. Data types supported: All - * @param[out] output The output tensor. Data type supported: same as @p input - * @param[in] window Region on which to execute the kernel. + * @tparam ScalarType Scalar datatype + * + * @param[in] input Input tensor. Data types supported: uint32_t, uint16_t and uint8_t + * @param[out] output Output tensor. Data types supported: uint32_t, uint16_t and uint8_t + * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). + */ + template <typename ScalarType> + void gemm_interleave4x4(const ITensor *input, ITensor *output, const Window &window); + + /** Common signature for all the specialised gemm interleave 4x4 functions + * + * @param[in] input Input tensor. Data types supported: uint32_t, uint16_t and uint8_t + * @param[out] output Output tensor. Data types supported: uint32_t, uint16_t and uint8_t + * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). */ - using GEMMInterleaveFunction = void(const ITensor *input, ITensor *output, const Window &window); + using GEMMInterleaveFunctionFuncPtr = void (NEGEMMInterleave4x4Kernel::*)(const ITensor *input, ITensor *output, const Window &window); - GEMMInterleaveFunction *_func; /**< GEMM interleave function to use for the particular tensor types passed to configure() */ + GEMMInterleaveFunctionFuncPtr _func; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H*/ diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp index c5d7f10e55..4dbfc3b022 100644 --- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp @@ -21,9 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
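// --- Illustrative aside (not part of the patch; stand-alone, hypothetical names) ---
// The interleave kernel above now selects one templated member function by element
// size and calls it through a pointer-to-member, which is why run() uses the
// (this->*_func)(...) syntax. The scheme in miniature:
#include <cstddef>
#include <cstdint>
#include <cstdio>

class InterleaveExample
{
public:
    void configure(size_t element_size)
    {
        switch(element_size)
        {
            case 1: _func = &InterleaveExample::interleave<uint8_t>;  break;
            case 2: _func = &InterleaveExample::interleave<uint16_t>; break;
            case 4: _func = &InterleaveExample::interleave<uint32_t>; break;
            default: _func = nullptr; break;
        }
    }

    void run()
    {
        if(_func != nullptr)
        {
            (this->*_func)(); // pointer-to-member call, as in the kernel's run()
        }
    }

private:
    template <typename ScalarType>
    void interleave()
    {
        std::printf("interleaving %zu-byte elements\n", sizeof(ScalarType));
    }

    using FuncPtr = void (InterleaveExample::*)();
    FuncPtr _func{ nullptr };
};
// --- end aside ---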
*/ -#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" @@ -32,11 +31,11 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include -#include -#include -#include using namespace arm_compute; @@ -44,7 +43,7 @@ namespace arm_compute { namespace { -void inline vector_matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, size_t stride_b, const Window &window) +void inline vector_matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window) { execute_window_loop(window, [&](const Coordinates & id) { @@ -253,15 +252,29 @@ void inline vector_matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &ou } auto vec_out = reinterpret_cast<int32_t *>(out.ptr()); - vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0])); - vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1])); - vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2])); - vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3])); + if(id.x() < (width_out - 16)) + { + vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0])); + vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1])); + vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2])); + vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3])); + } + else + { + auto left_over = width_out - id.x(); + for(auto k = 0; k < 4 && left_over; ++k) + { + for(auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(vec_out + k * 4 + j) = c0.val[k][j]; + } + } + } }, ina, inb, out); } -void inline vector_matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, size_t stride_b, const Window &window) +void inline vector_matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window) { execute_window_loop(window, [&](const Coordinates & id) { @@ -469,17 +482,34 @@ void inline vector_matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &ou } auto vec_out = reinterpret_cast<int32_t *>(out.ptr()); - vst1q_s32(vec_out + 0, c0.val[0]); - vst1q_s32(vec_out + 4, c0.val[1]); - vst1q_s32(vec_out + 8, c0.val[2]); - vst1q_s32(vec_out + 12, c0.val[3]); + if(id.x() < (width_out - 16)) + { + vst1q_s32(vec_out + 0, c0.val[0]); + vst1q_s32(vec_out + 4, c0.val[1]); + vst1q_s32(vec_out + 8, c0.val[2]); + vst1q_s32(vec_out + 12, c0.val[3]); + } + else + { + auto left_over = width_out - id.x(); + for(auto k = 0; k < 4 && left_over; ++k) + { + for(auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(vec_out + k * 4 + j) = c0.val[k][j]; + } + } + } }, ina, inb, out); } -void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, size_t out_stride, const Window &window) +void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window) { - execute_window_loop(window, [&](const Coordinates &) + const auto width_out = static_cast<int>(out_info.dimension(0)); + const auto height_out = static_cast<int>(out_info.dimension(1)); + const size_t out_stride =
out_info.strides_in_bytes()[1] / out_info.element_size(); + execute_window_loop(window, [&](const Coordinates & id) { const uint8_t *mtx_a0 = ina.ptr(); const uint8_t *mtx_b0 = inb.ptr(); @@ -574,32 +604,93 @@ void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int } auto mtx_out = reinterpret_cast<int32_t *>(out.ptr()); - vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0])); - vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1])); - vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2])); - vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3])); - vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0])); - vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1])); - vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2])); - vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3])); - vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0])); - vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1])); - vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2])); - vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3])); - vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0])); - vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1])); - vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2])); - vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3])); + + if(id.y() < height_out && id.x() < (width_out - 16)) + { + vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0])); + vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1])); + vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2])); + vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3])); + if(id.y() + 1 < height_out) + { + vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0])); + vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1])); + vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2])); + vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3])); + if(id.y() + 2 < height_out) + { + vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0])); + vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1])); + vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2])); + vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3])); + if(id.y() + 3 < height_out) + { + vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0])); + vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1])); + vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2])); + vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3])); + } + } + } + } + else + { + const auto left_over_value = width_out - id.x(); + auto left_over = left_over_value; + for(auto k = 0; k < 4 && left_over; ++k) + { + for(auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + k * 4 + j) = c0.val[k][j]; + } + } + if(id.y() + 1 < height_out) + { + left_over = left_over_value; + for(auto k = 0; k < 4 && left_over; ++k) + { + for(auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j]; + } + } + if(id.y() + 2 < height_out) + { + left_over = left_over_value; + for(auto k =
0; k < 4 && left_over; ++k) + { + for(auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j]; + } + } + if(id.y() + 3 < height_out) + { + left_over = left_over_value; + for(auto k = 0; k < 4 && left_over; ++k) + { + for(auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j]; + } + } + } + } + } + } }, ina, inb, out); } -void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, size_t out_stride, const Window &window) +void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window) { + const auto width_out = static_cast<int>(out_info.dimension(0)); + const auto height_out = static_cast<int>(out_info.dimension(1)); + const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size(); // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with NEGEMMInterleave4x4 and NEGEMMTranspose1xW // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration // All the values needed for computing a single 4x4 block will be read from consecutive memory positions - execute_window_loop(window, [&](const Coordinates &) + execute_window_loop(window, [&](const Coordinates & id) { auto *mtx_a0 = reinterpret_cast<const int8_t *>(ina.ptr()); auto *mtx_b0 = reinterpret_cast<const int8_t *>(inb.ptr()); @@ -692,32 +783,86 @@ void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3); c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3); } - auto mtx_out = reinterpret_cast<int32_t *>(out.ptr()); - vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]); - vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]); - vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]); - vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]); - vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]); - vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]); - vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]); - vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]); - vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]); - vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]); - vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]); - vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]); - vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]); - vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]); - vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]); - vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]); + if(id.y() < height_out && id.x() < (width_out - 16)) + { + vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]); + vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]); + vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]); + vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]); + if(id.y() + 1 < height_out) + { + vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]); + vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]); + vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]); + vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]); + if(id.y() + 2 < height_out) + { + vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]); + vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]); + vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]); + vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]); + if(id.y() + 3 < height_out) + { + vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]);
+ vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]); + vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]); + vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]); + } + } + } + } + else if(id.y() < height_out) + { + const auto left_over_value = width_out - id.x(); + auto left_over = left_over_value; + for(auto k = 0; k < 4 && left_over; ++k) + { + for(auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + k * 4 + j) = c0.val[k][j]; + } + } + if(id.y() + 1 < height_out) + { + left_over = left_over_value; + for(auto k = 0; k < 4 && left_over; ++k) + { + for(auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j]; + } + } + if(id.y() + 2 < height_out) + { + left_over = left_over_value; + for(auto k = 0; k < 4 && left_over; ++k) + { + for(auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j]; + } + } + if(id.y() + 3 < height_out) + { + left_over = left_over_value; + for(auto k = 0; k < 4 && left_over; ++k) + { + for(auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j]; + } + } + } + } + } + } + }, ina, inb, out); } } // namespace -class Coordinates; -} // namespace arm_compute - namespace { Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) @@ -748,50 +893,6 @@ Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, return Status{}; } - -std::pair validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output) -{ - constexpr unsigned int num_elems_processed_per_iteration_x = 16; - constexpr unsigned int num_elems_processed_per_iteration_y = 4; - - Window win; - bool window_changed = false; - - // Check if the output tensor is a vector. 
If so, the kernel runs the vector-matrix multiplication - if((output->dimension(1) == 1)) - { - // Configure kernel window - win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x)); - - // We cannot read out-of-bound elements from matrix A as we use the left-over for loop - AccessWindowStatic in0_access(input0, 0, 0, input0->tensor_shape().x(), 1); - AccessWindowHorizontal in1_access(input1, 0, num_elems_processed_per_iteration_x); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_x); - - window_changed = update_window_and_padding(win, in0_access, in1_access, output_access); - - Coordinates coord; - coord.set_num_dimensions(output->num_dimensions()); - output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape())); - } - else - { - win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - unsigned int num_k_iterations = ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x) / 16; - // For each iteration of "k" we increment the input pointer by 4, and we load 8 elements at a time: - AccessWindowStatic in0_access(input0, 0, 0, (num_k_iterations - 1) * 4 + 8, input0->dimension(1)); - AccessWindowStatic in1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1)); - AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - - window_changed = update_window_and_padding(win, in0_access, in1_access, output_access); - - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - } - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} } // namespace NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel() @@ -812,16 +913,33 @@ void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITen _output = output; _slide_matrix_b = in1_shape[2] != 1; - // Configure kernel window - auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); + constexpr unsigned int num_elems_processed_per_iteration_x = 16; + constexpr unsigned int num_elems_processed_per_iteration_y = 4; + + Window win; + + // Check if the output tensor is a vector.
If so, the kernel runs the vector-matrix multiplication + if((output->info()->dimension(1) == 1)) + { + // Configure kernel window + win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x)); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + } + else + { + win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + } + + INEKernel::configure(win); } Status NEGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first); return Status{}; } @@ -837,6 +955,7 @@ void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo { const auto width_matrix_a = static_cast<int>(_input0->info()->dimension(0)); const auto width_matrix_b = static_cast<int>(_input1->info()->dimension(0)); + const auto width_out = static_cast<int>(_output->info()->dimension(0)); const auto in_b_stride = static_cast<int>(_input1->info()->strides_in_bytes()[1] / data_size_from_type(_input1->info()->data_type())); // The implementation computes 16 elements per iteration @@ -872,13 +991,13 @@ void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo case DataType::S8: case DataType::QASYMM8_SIGNED: { - vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, in_b_stride, window); + vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window); break; } case DataType::U8: case DataType::QASYMM8: { - vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, in_b_stride, window); + vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window); break; } default: @@ -891,7 +1010,7 @@ void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo else { const size_t in_b_stride = _input1->info()->strides_in_bytes()[1]; - const size_t out_stride = _output->info()->strides_in_bytes()[1] / _output->info()->element_size(); + const int width_b = _input1->info()->dimension(0); // Set step_x and step_y for matrix A.
Scale the Y range by a factor of 4, as the input interleaved matrix A has a quarter of the rows of the output matrix Window win_a(window); @@ -914,19 +1033,18 @@ void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo Iterator inb(_input1, win_b); Iterator out(_output, window); - const int width_b = _input1->info()->dimension(0); switch(_input0->info()->data_type()) { case DataType::S8: case DataType::QASYMM8_SIGNED: { - matrix_multiply_s8(ina, inb, out, width_b, out_stride, window); + matrix_multiply_s8(ina, inb, out, width_b, *_output->info(), window); break; } case DataType::U8: case DataType::QASYMM8: { - matrix_multiply_u8(ina, inb, out, width_b, out_stride, window); + matrix_multiply_u8(ina, inb, out, width_b, *_output->info(), window); break; } default: @@ -937,3 +1055,4 @@ void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo } } } +} // namespace arm_compute diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h rename to src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h index 856cdf42e7..14d03fe3eb 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h +++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H #define ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -57,6 +57,8 @@ class NEGEMMLowpMatrixMultiplyKernel : public INEKernel NEGEMMLowpMatrixMultiplyKernel(NEGEMMLowpMatrixMultiplyKernel &&) = default; /** Allow instances of this class to be moved */ NEGEMMLowpMatrixMultiplyKernel &operator=(NEGEMMLowpMatrixMultiplyKernel &&) = default; + /** Default destructor */ + ~NEGEMMLowpMatrixMultiplyKernel() = default; /** Initialise the kernel's input and output. * * The input matrices @p input0 and @p input1 must be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel. These two diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp index db6cb10995..174a06955f 100644 --- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp @@ -21,9 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
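// --- Illustrative aside (not part of the patch; stand-alone, hypothetical names) ---
// The stores in the GEMM-lowp kernels above (and in the offset-contribution kernel
// that follows) all share one shape: write full 16-lane vector blocks while a whole
// block fits, then fall back to a scalar left-over loop so no padding has to be
// allocated past the real output width. The essence of that pattern:
#include <cstdint>
#include <cstring>

void store_block_row(const int32_t *acc, int32_t *dst_row, int width_out, int x0)
{
    // acc holds the 16 accumulator lanes computed for columns x0..x0+15.
    if(x0 + 16 <= width_out)
    {
        std::memcpy(dst_row + x0, acc, 16 * sizeof(int32_t)); // stands in for four vst1q_s32
    }
    else
    {
        for(int j = 0; x0 + j < width_out; ++j) // scalar left-overs, lane by lane
        {
            dst_row[x0 + j] = acc[j];
        }
    }
}
// --- end aside ---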
*/ -#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" @@ -32,18 +31,14 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include -#include -#include - -using namespace arm_compute; namespace arm_compute { -class Coordinates; -} // namespace arm_compute - namespace { Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, @@ -96,42 +91,22 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, - int32_t a_offset, int32_t b_offset) -{ - constexpr unsigned int num_elems_processed_per_iteration = 16; - bool window_changed = false; - - // Configure kernel window - Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal mm_result_access(mm_result, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win, mm_result_access); - - if(a_offset != 0) - { - AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win, vector_sum_col_access); - } - if(b_offset != 0) - { - AccessWindowStatic vector_sum_row_access(vector_sum_row, 0, 0, vector_sum_row->dimension(0), 0); // NOLINT - window_changed = window_changed || update_window_and_padding(win, vector_sum_row_access); - } - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} - void run_offset_contribution(const Window &window, ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, bool is_gemm3d) { Window collapsed_window = window.collapse_if_possible(window, Window::DimZ); + collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1)); const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0; const int depth_input = is_gemm3d ? 
mm_result->info()->dimension(2) : 1; + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16; + + Iterator mm_result_it(mm_result, collapsed_window); + if((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true { // Set window for vector_sum_col @@ -147,7 +122,6 @@ void run_offset_contribution(const Window &window, Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col); Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row); - Iterator mm_result_it(mm_result, window); const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); @@ -156,67 +130,86 @@ void run_offset_contribution(const Window &window, execute_window_loop(collapsed_window, [&](const Coordinates & id) { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - - // Compute the leftover term due to a_offset. - int32x4x4_t a_offset_term_s32 = - { - { - vld1q_s32(vector_sum_col_ptr + 0), - vld1q_s32(vector_sum_col_ptr + 4), - vld1q_s32(vector_sum_col_ptr + 8), - vld1q_s32(vector_sum_col_ptr + 12) - } - }; - - a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); - a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); - a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); - a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); + const int batch_id = id.z() / depth_input; + auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); + auto mm_result_ptr = reinterpret_cast(mm_result_it.ptr()); // Compute the leftover term due to b_offset. - int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() - + (id.z() % depth_input) * height_input); - b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, b_offset); + int32_t b_offset_term_s32 = *(reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input); + b_offset_term_s32 *= b_offset; + + const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); - // Add a_offset_term_s32 and b_offset_term_s32 - int32x4x4_t offset_term_s32 = + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { + // Compute the leftover term due to a_offset. 
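For reference: with a_offset and b_offset the quantization offsets of A and B and k the accumulation depth, the correction applied to each S32 GEMM result element is a_offset * vector_sum_col[x] + b_offset * vector_sum_row[y] + k_offset (in this kernel, k_offset is the precomputed a_offset * b_offset * k term). A minimal scalar sketch of that update, matching the left-overs loop further below; the row-major raw pointers are illustrative only, the kernel itself walks Iterators:

    #include <cstdint>

    // Reference form of the offset contribution over a width x height S32 result
    void offset_contribution_ref(int32_t *mm_result, const int32_t *sum_col, const int32_t *sum_row,
                                 int width, int height, int32_t a_offset, int32_t b_offset, int32_t k_offset)
    {
        for(int y = 0; y < height; ++y)
        {
            for(int x = 0; x < width; ++x)
            {
                // Same expression the NEON loop evaluates 16 lanes at a time
                mm_result[y * width + x] += k_offset + a_offset * sum_col[x] + b_offset * sum_row[y];
            }
        }
    }

The vectorised path hoists the per-row b_offset term out of the x loop, which is why b_offset_term_s32 is computed once per row and broadcast with vdupq_n_s32.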
+ int32x4x4_t a_offset_term_s32 = { - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset) - } - }; - - offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32)); - offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32)); - offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32)); - offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32)); - - int32x4x4_t in_s32 = - { + { + vld1q_s32(vector_sum_col_ptr + x + 0), + vld1q_s32(vector_sum_col_ptr + x + 4), + vld1q_s32(vector_sum_col_ptr + x + 8), + vld1q_s32(vector_sum_col_ptr + x + 12) + } + }; + + a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); + a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); + a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); + a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); + + // Add a_offset_term_s32 and b_offset_term_s32 + int32x4x4_t offset_term_s32 = + { + { + vdupq_n_s32(k_offset), + vdupq_n_s32(k_offset), + vdupq_n_s32(k_offset), + vdupq_n_s32(k_offset) + } + }; + + offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32_vec)); + offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32_vec)); + offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32_vec)); + offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32_vec)); + + int32x4x4_t in_s32 = { - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + 0), - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + 4), - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + 8), - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]); - - // Store the result with the offset contribution - vst1q_s32(reinterpret_cast(mm_result_it.ptr()) + 0, in_s32.val[0]); - vst1q_s32(reinterpret_cast(mm_result_it.ptr()) + 4, in_s32.val[1]); - vst1q_s32(reinterpret_cast(mm_result_it.ptr()) + 8, in_s32.val[2]); - vst1q_s32(reinterpret_cast(mm_result_it.ptr()) + 12, in_s32.val[3]); + { + vld1q_s32(mm_result_ptr + x + 0), + vld1q_s32(mm_result_ptr + x + 4), + vld1q_s32(mm_result_ptr + x + 8), + vld1q_s32(mm_result_ptr + x + 12) + } + }; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]); + + // Store the result with the offset contribution + vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); + vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); + vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); + vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); + } + + // Left-overs loop + for(; x < window_end_x; ++x) + { + // Compute the leftover 
term due to a_offset. + int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x); + + a_offset_term_s32 *= a_offset; + + // Add the offset terms to GEMM's result + // Store the result with the offset contribution + mm_result_ptr[x] += k_offset + a_offset_term_s32 + b_offset_term_s32; + } }, vector_sum_col_it, vector_sum_row_it, mm_result_it); } @@ -231,40 +224,53 @@ void run_offset_contribution(const Window &window, win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0)); Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row); - Iterator mm_result_it(mm_result, window); const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); - execute_window_loop(window, [&](const Coordinates & id) + execute_window_loop(collapsed_window, [&](const Coordinates & id) { - const int batch_id = id.z() / depth_input; + const int batch_id = id.z() / depth_input; + auto mm_result_ptr = reinterpret_cast(mm_result_it.ptr()); // Compute the leftover term due to b_offset. - int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() - + (id.z() % depth_input) * height_input); - b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, b_offset); + int32_t b_offset_term_s32 = *(reinterpret_cast(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input); + b_offset_term_s32 *= b_offset; - int32x4x4_t in_s32 = + const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { + int32x4x4_t in_s32 = { - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + 0), - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + 4), - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + 8), - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32); - in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32); - in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32); - in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32); - - // Store the result with the offset contribution - vst1q_s32(reinterpret_cast(mm_result_it.ptr()) + 0, in_s32.val[0]); - vst1q_s32(reinterpret_cast(mm_result_it.ptr()) + 4, in_s32.val[1]); - vst1q_s32(reinterpret_cast(mm_result_it.ptr()) + 8, in_s32.val[2]); - vst1q_s32(reinterpret_cast(mm_result_it.ptr()) + 12, in_s32.val[3]); + { + vld1q_s32(mm_result_ptr + x + 0), + vld1q_s32(mm_result_ptr + x + 4), + vld1q_s32(mm_result_ptr + x + 8), + vld1q_s32(mm_result_ptr + x + 12) + } + }; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32_vec); + in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32_vec); + in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32_vec); + in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32_vec); + + // Store the result with the offset contribution + vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); + vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); + vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); + vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); + } + + // Left-overs loop + for(; x < window_end_x; ++x) + { + // Add the offset terms to GEMM's result + // Store the result with the offset contribution + mm_result_ptr[x] += b_offset_term_s32; + } }, vector_sum_row_it, mm_result_it); } @@ -276,53 +282,68 @@ void run_offset_contribution(const Window &window, 
win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col); - Iterator mm_result_it(mm_result, window); // Offset in case vector_sum_col is batched const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; - execute_window_loop(window, [&](const Coordinates & id) + execute_window_loop(collapsed_window, [&](const Coordinates & id) { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); + const int batch_id = id.z() / depth_input; + auto vector_sum_col_ptr = reinterpret_cast(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); + auto mm_result_ptr = reinterpret_cast(mm_result_it.ptr()); - // Compute the leftover term due to a_offset. - int32x4x4_t a_offset_term_s32 = + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { + // Compute the leftover term due to a_offset. + int32x4x4_t a_offset_term_s32 = { - vld1q_s32(vector_sum_col_ptr + 0), - vld1q_s32(vector_sum_col_ptr + 4), - vld1q_s32(vector_sum_col_ptr + 8), - vld1q_s32(vector_sum_col_ptr + 12) - } - }; - - a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); - a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); - a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); - a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); - - int32x4x4_t in_s32 = - { + { + vld1q_s32(vector_sum_col_ptr + x + 0), + vld1q_s32(vector_sum_col_ptr + x + 4), + vld1q_s32(vector_sum_col_ptr + x + 8), + vld1q_s32(vector_sum_col_ptr + x + 12) + } + }; + + a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); + a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); + a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); + a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); + + int32x4x4_t in_s32 = { - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + 0), - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + 4), - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + 8), - vld1q_s32(reinterpret_cast(mm_result_it.ptr()) + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]); - - // Store the result with the offset contribution - vst1q_s32(reinterpret_cast(mm_result_it.ptr()) + 0, in_s32.val[0]); - vst1q_s32(reinterpret_cast(mm_result_it.ptr()) + 4, in_s32.val[1]); - vst1q_s32(reinterpret_cast(mm_result_it.ptr()) + 8, in_s32.val[2]); - vst1q_s32(reinterpret_cast(mm_result_it.ptr()) + 12, in_s32.val[3]); + { + vld1q_s32(mm_result_ptr + x + 0), + vld1q_s32(mm_result_ptr + x + 4), + vld1q_s32(mm_result_ptr + x + 8), + vld1q_s32(mm_result_ptr + x + 12) + } + }; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]); + + // Store the result with the offset contribution + 
vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); + vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); + vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); + vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); + } + + // Left-overs loop + for(; x < window_end_x; ++x) + { + // Compute the leftover term due to a_offset. + const int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x); + + // Add the offset terms to GEMM's result + // Store the result with the offset contribution + mm_result_ptr[x] += a_offset_term_s32 * a_offset; + } }, vector_sum_col_it, mm_result_it); } @@ -365,23 +386,17 @@ void NEGEMMLowpOffsetContributionKernel::configure(ITensor *mm_result, const ITe } // Configure kernel window - auto win_config = validate_and_configure_window(mm_result->info(), - vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT - vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, // NOLINT - a_offset, b_offset); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); + Window win = calculate_max_window(*mm_result->info(), Steps()); + Coordinates coord; + coord.set_num_dimensions(mm_result->info()->num_dimensions()); + mm_result->info()->set_valid_region(ValidRegion(coord, mm_result->info()->tensor_shape())); + INEKernel::configure(win); } Status NEGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, int32_t a_offset, int32_t b_offset) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(), - vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, - a_offset, b_offset) - .first); // NOLINT return Status{}; } @@ -399,3 +414,4 @@ void NEGEMMLowpOffsetContributionKernel::run(const Window &window, const ThreadI run_offset_contribution(window, _mm_result, _vector_sum_col, _vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, reinterpret_as_3d); } +} // namespace arm_compute diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h rename to src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h index 5ce8403d3b..0f37e584b9 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h +++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H #define ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -60,6 +60,8 @@ class NEGEMMLowpOffsetContributionKernel : public INEKernel NEGEMMLowpOffsetContributionKernel(NEGEMMLowpOffsetContributionKernel &&) = default; /** Allow instances of this class to be moved */ NEGEMMLowpOffsetContributionKernel &operator=(NEGEMMLowpOffsetContributionKernel &&) = default; + /** Default destructor */ + ~NEGEMMLowpOffsetContributionKernel() = default; /** Initialise the kernel's input and output. 
* * @param[in, out] mm_result Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32 diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp index e9332b2cb6..3c8f5ae022 100644 --- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp @@ -21,19 +21,21 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h rename to src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h index 4db0872166..4c68fb0943 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h +++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H #define ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -76,6 +76,8 @@ class NEGEMMLowpOffsetContributionOutputStageKernel : public INEKernel NEGEMMLowpOffsetContributionOutputStageKernel(NEGEMMLowpOffsetContributionOutputStageKernel &&) = default; /** Allow instances of this class to be moved */ NEGEMMLowpOffsetContributionOutputStageKernel &operator=(NEGEMMLowpOffsetContributionOutputStageKernel &&) = default; + /** Default destructor */ + ~NEGEMMLowpOffsetContributionOutputStageKernel() = default; /** Initialise the kernel's input and output. * * @param[in] mm_result Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32 diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp index 458b94b93c..2e78107a1a 100644 --- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp @@ -21,18 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h rename to src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h index 4e0c8f8fb8..42ef570f77 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H #define ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -62,6 +62,8 @@ class NEGEMMLowpQuantizeDownInt32ScaleKernel : public INEKernel NEGEMMLowpQuantizeDownInt32ScaleKernel(NEGEMMLowpQuantizeDownInt32ScaleKernel &&) = default; /** Allow instances of this class to be moved */ NEGEMMLowpQuantizeDownInt32ScaleKernel &operator=(NEGEMMLowpQuantizeDownInt32ScaleKernel &&) = default; + /** Default destructor */ + ~NEGEMMLowpQuantizeDownInt32ScaleKernel() = default; /** Initialise the kernel's input and output. * * @param[in] input Input tensor. Data type supported: S32 diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp index 44d55652a3..1fafc62302 100644 --- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp @@ -21,19 +21,21 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NESymm.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/NEON/NESymm.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h rename to src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h index d26c778e74..d04e713cb8 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H #define ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -59,6 +59,8 @@ class NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel : public INEKern NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &&) = default; /** Allow instances of this class to be moved */ NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &operator=(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &&) = default; + /** Default destructor */ + ~NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel() = default; /** Initialise the kernel's input and output. * * @param[in] input Input tensor. Data type supported: S32 diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp index a0a5c5d79f..bf9ce9554d 100644 --- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp @@ -21,19 +21,21 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEAsymm.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h rename to src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h index f1661680d0..55c07fbb5a 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H #define ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -60,6 +60,8 @@ class NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel : public INEKerne NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel(NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &&) = default; /** Allow instances of this class to be moved */ NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &operator=(NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &&) = default; + /** Default destructor */ + ~NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel() = default; /** Initialise the kernel's input and output. * * @param[in] input Input tensor. Data type supported: S32 diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp index a926903598..cbb56da8c0 100644 --- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp @@ -21,19 +21,21 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEAsymm.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h rename to src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h index 94ca617466..1a8de1c441 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h +++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H #define ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -60,6 +60,8 @@ class NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public INEKern NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default; /** Allow instances of this class to be moved */ NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default; + /** Default destructor */ + ~NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel() = default; /** Initialise the kernel's input and output. * * @param[in] input Input tensor. Data type supported: S32 diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp index 29453072a1..db038e559e 100644 --- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp @@ -21,13 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" namespace arm_compute { @@ -45,26 +47,6 @@ Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITe } return Status{}; } -std::pair validate_and_configure_window_matrix_a_reduction(ITensorInfo *input, ITensorInfo *output, bool is_reshaped) -{ - const unsigned int num_elems_processed_per_iteration = is_reshaped ? 4 : 1; - - // Output auto initialization if not yet initialized - auto_init_if_empty(*output, TensorShape(input->dimension(1)), 1, DataType::S32); - - Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - - AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1)); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win, input_access, output_access); - - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} - Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); @@ -77,31 +59,10 @@ Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITe } return Status{}; } - -std::pair validate_and_configure_window_matrix_b_reduction(ITensorInfo *input, ITensorInfo *output) -{ - constexpr unsigned int num_elems_processed_per_iteration = 16; - - // Output auto initialization if not yet initialized - auto_init_if_empty(*output, TensorShape(input->dimension(0)), 1, DataType::S32); - - // Configure kernel window - Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - - AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1)); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win, input_access, output_access); - - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} } // namespace INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel() - : _input(), _output(), _k(0), _is_reshaped(false), _scalar(0), _mul_by_scalar(false) + : _input(), _output(), _k(0), _scalar(0), _mul_by_scalar(false) { } @@ -109,26 +70,27 @@ void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor * { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row); + ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported"); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info())); - _input = mtx_a; _output = vector_sum_row; _k = info.k; - _is_reshaped = info.is_reshaped; _scalar = info.scalar; _mul_by_scalar = info.mul_by_scalar; - // Configure kernel window - auto win_config = validate_and_configure_window_matrix_a_reduction(_input->info(), _output->info(), _is_reshaped); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); + // Output auto initialization if not yet initialized + auto_init_if_empty(*_output->info(), TensorShape(_input->info()->dimension(1)), 1, DataType::S32); + + Window win = calculate_max_window(*_output->info(), Steps(1)); + _output->info()->set_valid_region(ValidRegion(Coordinates(), _output->info()->tensor_shape())); + + INEKernel::configure(win); } Status NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) { + ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_a_reduction(mtx_a->clone().get(), vector_sum_row->clone().get(), info.is_reshaped).first); - return Status{}; } @@ -149,121 +111,55 @@ void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &w Iterator in(_input, win_input); Iterator out(_output, collapsed_window); - const auto vec_scalar = wrapper::vdup_n(static_cast(_scalar), wrapper::traits::vector_128_tag{}); - - if(_is_reshaped) + execute_window_loop(collapsed_window, [&](const Coordinates & id) { - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - auto sum_row = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); + auto vsum_row = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); + TAcc sum_row = 0; - const T *matrix_a = reinterpret_cast((in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2])); + const T *matrix_a = reinterpret_cast((in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2])); #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a)); + asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a)); #endif /* __arm__ */ - int i = 0; - // This for loop performs 4 accumulations - for(; i <= (_k - 4); i += 4) - { - const auto a0_d8 = wrapper::vloadq(matrix_a + i * 4); - - // Convert 8-bit to 16-bit - typename wrapper::traits::neon_bitvector::type a0_d16[4] = - { - wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a0_d8))), - wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a0_d8))), - wrapper::vgetlow(wrapper::vmovl((wrapper::vgethigh(a0_d8)))), - wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a0_d8))) - }; - - // Accumulate to 16-bit - a0_d16[0] = wrapper::vadd(a0_d16[0], 
a0_d16[1]); - a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[2]); - a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[3]); - - // Accumulate to 32-bit - sum_row = wrapper::vaddw(sum_row, a0_d16[0]); - } - - // This for loop performs the leftover accumulations - for(; i < _k; ++i) - { - const auto a0_d8 = wrapper::vload(matrix_a + i * 4); - - // Convert U8 to U16 - const auto a0_d16 = wrapper::vgetlow(wrapper::vmovl(a0_d8)); - - // Accumulate to U32 - sum_row = wrapper::vaddw(sum_row, a0_d16); - } - - // Multiply by scalar if necessary - if(_mul_by_scalar) - { - sum_row = wrapper::vmul(sum_row, vec_scalar); - } - - auto vector_sum_row = reinterpret_cast(out.ptr()); - - wrapper::vstore(vector_sum_row, wrapper::vreinterpret(sum_row)); - }, - in, out); - } - else // it is not reshaped - { - execute_window_loop(collapsed_window, [&](const Coordinates & id) + int i = 0; + // This for loop performs 16 accumulations + for(; i <= (_k - 16); i += 16) { - // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - auto vsum_row = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); - TAcc sum_row = 0; - - const T *matrix_a = reinterpret_cast((in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2])); - -#if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a)); -#endif /* __arm__ */ + const auto a0_d8 = wrapper::vloadq(matrix_a + i); - int i = 0; - // This for loop performs 16 accumulations - for(; i <= (_k - 16); i += 16) - { - const auto a0_d8 = wrapper::vloadq(matrix_a + i); + // Partial accumulations in U16 + const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8)); - // Partial accumulations in U16 - const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8)); + // Accumulate to U32 + vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0)); + } - // Accumulate to U32 - vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0)); - } - - // This for loop performs the leftover accumulations - for(; i < _k; ++i) - { - sum_row += static_cast(matrix_a[i]); - } + // This for loop performs the leftover accumulations + for(; i < _k; ++i) + { + sum_row += static_cast(matrix_a[i]); + } #if defined(__aarch64__) - // Reduction operation available on 64 bit architectures only - sum_row += wrapper::vaddv(vsum_row); + // Reduction operation available on 64 bit architectures only + sum_row += wrapper::vaddv(vsum_row); #else // __aarch64__ - auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row)); - tmp = wrapper::vpadd(tmp, tmp); + auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row)); + tmp = wrapper::vpadd(tmp, tmp); - sum_row += wrapper::vgetlane(tmp, 0); + sum_row += wrapper::vgetlane(tmp, 0); #endif // __aarch64__ - // Multiply by scalar if necessary - if(_mul_by_scalar) - { - sum_row *= _scalar; - } + // Multiply by scalar if necessary + if(_mul_by_scalar) + { + sum_row *= _scalar; + } - *(reinterpret_cast(out.ptr())) = static_cast(sum_row); - }, - in, out); - } + *(reinterpret_cast(out.ptr())) = static_cast(sum_row); + }, + in, out); } void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info) @@ -290,26 +186,32 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) { 
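Both reduction kernels collapse the K dimension: the matrix A kernel above fills vector_sum_row with per-row sums of A, and this matrix B kernel fills vector_sum_col with per-column sums of B, each optionally multiplied by a scalar before the offset-contribution kernels consume them. A scalar sketch of the two reductions, assuming row-major uint8_t storage and illustrative raw-pointer signatures in place of the Iterator-based kernels:

    #include <cstdint>

    // Row sums of a (rows x k) matrix A: one S32 per row
    void reduce_rows_ref(const uint8_t *mtx_a, int32_t *vector_sum_row, int rows, int k)
    {
        for(int y = 0; y < rows; ++y)
        {
            int32_t sum_row = 0;
            for(int i = 0; i < k; ++i)
            {
                sum_row += static_cast<int32_t>(mtx_a[y * k + i]);
            }
            vector_sum_row[y] = sum_row;
        }
    }

    // Column sums of a (k x width) matrix B: one S32 per column
    void reduce_cols_ref(const uint8_t *mtx_b, int32_t *vector_sum_col, int k, int width)
    {
        for(int x = 0; x < width; ++x)
        {
            int32_t sum_col = 0;
            for(int i = 0; i < k; ++i)
            {
                sum_col += static_cast<int32_t>(mtx_b[i * width + x]);
            }
            vector_sum_col[x] = sum_col;
        }
    }

The NEON versions compute the same sums but widen through 16-bit partial accumulators (vaddl/vpaddl in the A kernel, vaddw in the B kernel) before folding into 32-bit lanes.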
ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col); + ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported"); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info())); _input = mtx_b; _output = vector_sum_col; _k = info.k; - _is_reshaped = info.is_reshaped; _scalar = info.scalar; _mul_by_scalar = info.mul_by_scalar; // Configure kernel window - auto win_config = validate_and_configure_window_matrix_b_reduction(_input->info(), _output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Output auto initialization if not yet initialized + auto_init_if_empty(*_output->info(), TensorShape(_input->info()->dimension(0)), 1, DataType::S32); + + // Configure kernel window + Window win = calculate_max_window_horizontal(*_output->info(), Steps(num_elems_processed_per_iteration)); + _output->info()->set_valid_region(ValidRegion(Coordinates(), _output->info()->tensor_shape())); + INEKernel::configure(win); } Status NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_b_reduction(mtx_b->clone().get(), vector_sum_col->clone().get()).first); return Status{}; } @@ -321,198 +223,143 @@ void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const using TIAcc = wrapper::traits::promote_t; using TAcc = wrapper::traits::promote_t; - Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY); - - const auto vec_scalar = wrapper::vdup_n(static_cast(_scalar), wrapper::traits::vector_128_tag{}); - - if(_is_reshaped) - { - Window win_input(collapsed_window); - win_input.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_input.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_input.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Iterator in(_input, win_input); - Iterator out(_output, collapsed_window); - - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - typename wrapper::traits::neon_bitvector::type sum_col[4] = - { - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}) - }; - - const auto *matrix_b = reinterpret_cast(in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]); + Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY); + const auto vec_scalar = wrapper::vdup_n(static_cast(_scalar), wrapper::traits::vector_128_tag{}); -#if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b)); -#endif /* __arm__ */ + const auto width_matrix_b = static_cast(_input->info()->dimension(0)); + const auto in_b_stride = static_cast(_input->info()->strides_in_bytes()[1]); - int i = 0; - for(; i < _k; ++i) - { - const auto b0_b8 = wrapper::vloadq(matrix_b + i * 16); + // The implementation computes 16 elements per iteration + const int window_start_x = 16 * info.thread_id; + const int 
window_step_x = 16 * info.num_threads; + // Make sure (window_end_x - window_start_x) is a multiple of window_step_x + const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; - // Convert 8bit to 16bit - const typename wrapper::traits::neon_bitvector::type b0_b16[2] = - { - wrapper::vmovl(wrapper::vgetlow(b0_b8)), - wrapper::vmovl(wrapper::vgethigh(b0_b8)) - }; - - // Accumulate to U32 - sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0])); - sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0])); - sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1])); - sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1])); - } + Window win_out(collapsed_window); + win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); - // Multiply by scalar if necessary - if(_mul_by_scalar) - { - sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar); - sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar); - sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar); - sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar); - } + Window win_in(win_out); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - auto vector_sum_col = reinterpret_cast(out.ptr()); + Iterator inb(_input, win_in); + Iterator out(_output, win_out); - wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0])); - wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1])); - wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2])); - wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3])); - }, - in, out); - } - else // it is not reshaped + execute_window_loop(win_out, [&](const Coordinates & id) { - const auto width_matrix_b = static_cast(_input->info()->dimension(0)); - const auto in_b_stride = static_cast(_input->info()->strides_in_bytes()[1]); - - // The implementation computes 16 elements per iteration - const int window_start_x = 16 * info.thread_id; - const int window_step_x = 16 * info.num_threads; - // Make sure (window_end_x - window_start_x) is a multiple of window_step_x - const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; - - Window win_out(collapsed_window); - win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); - - Window win_in(win_out); - win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Iterator inb(_input, win_in); - Iterator out(_output, win_out); - - execute_window_loop(win_out, [&](const Coordinates & id) + if(id.x() > width_matrix_b) { - if(id.x() > width_matrix_b) - { - return; - } + return; + } - // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - typename wrapper::traits::neon_bitvector::type sum_col[4] = - { - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}) - }; + // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation + typename wrapper::traits::neon_bitvector::type sum_col[4] = + { + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), 
wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}) + }; - const auto *matrix_b = reinterpret_cast(inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]); + const auto *matrix_b = reinterpret_cast(inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]); #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b)); - asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride)); + asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b)); + asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride)); #endif /* __arm__ */ - int i = 0; - // This for loop performs 4 accumulations - for(; i <= (_k - 4); i += 4) - { - const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); - const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride); - const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride); - const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride); + int i = 0; + // This for loop performs 4 accumulations + for(; i <= (_k - 4); i += 4) + { + const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); + const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride); + const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride); + const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride); #if __arm__ - asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride)); - asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride)); - asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride)); - asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride)); + asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride)); + asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride)); + asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride)); + asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride)); #endif /* __arm__ */ - // Partial accumulation in 16bit - typename wrapper::traits::neon_bitvector::type tmp_sum[2] = - { - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}) - }; - - tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8)); - tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8)); - tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8)); - tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8)); - tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8)); - tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8)); - tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8)); - tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8)); - - // Accumulate to 32bit - sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0])); - sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0])); - sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1])); - sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1])); - - matrix_b += 4 * in_b_stride; - } - - // This for loop perfoms the leftover accumulations - for(; i < _k; ++i) + // Partial accumulation in 16bit + typename wrapper::traits::neon_bitvector::type tmp_sum[2] = { - const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); - - // Convert S8 to S16 - const typename wrapper::traits::neon_bitvector::type b0_b16[2] - { - wrapper::vmovl(wrapper::vgetlow(b0_b8)), - wrapper::vmovl(wrapper::vgethigh(b0_b8)) - }; - - // 
Accumulate to 32bit - sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0])); - sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0])); - sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1])); - sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1])); + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}) + }; - matrix_b += in_b_stride; - } + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8)); + + // Accumulate to 32bit + sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0])); + sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0])); + sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1])); + sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1])); + + matrix_b += 4 * in_b_stride; + } + + // This for loop perfoms the leftover accumulations + for(; i < _k; ++i) + { + const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); - // Multiply by scalar if necessary - if(_mul_by_scalar) + // Convert S8 to S16 + const typename wrapper::traits::neon_bitvector::type b0_b16[2] { - sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar); - sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar); - sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar); - sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar); - } + wrapper::vmovl(wrapper::vgetlow(b0_b8)), + wrapper::vmovl(wrapper::vgethigh(b0_b8)) + }; + + // Accumulate to 32bit + sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0])); + sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0])); + sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1])); + sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1])); - auto vector_sum_col = reinterpret_cast(out.ptr()); + matrix_b += in_b_stride; + } + // Multiply by scalar if necessary + if(_mul_by_scalar) + { + sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar); + sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar); + sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar); + sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar); + } + + auto vector_sum_col = reinterpret_cast(out.ptr()); + if(id.x() + 16 < width_matrix_b) + { wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0])); wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1])); wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2])); wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3])); - }, - inb, out); - } + } + else + { + auto left_over = width_matrix_b - id.x(); + for(auto k = 0; k < 4 && left_over; ++k) + { + for(auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(vector_sum_col + k * 4 + j) = sum_col[k][j]; + } + } + } + }, + inb, out); } void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info) diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.h similarity 
index 83% rename from arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h rename to src/core/NEON/kernels/NEGEMMLowpReductionKernel.h index 53a542c2df..655658cb6c 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h +++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEGEMMLOWREDUCTIONKERNEL_H #define ARM_COMPUTE_NEGEMMLOWREDUCTIONKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -46,6 +46,8 @@ class INEGEMMLowpReductionKernel : public INEKernel INEGEMMLowpReductionKernel(INEGEMMLowpReductionKernel &&) = default; /** Allow instances of this class to be moved */ INEGEMMLowpReductionKernel &operator=(INEGEMMLowpReductionKernel &&) = default; + /** Default destructor */ + virtual ~INEGEMMLowpReductionKernel() = default; /** Initialise the kernel's input and output. * @@ -63,7 +65,6 @@ class INEGEMMLowpReductionKernel : public INEKernel const ITensor *_input; ITensor *_output; int32_t _k; - bool _is_reshaped; int32_t _scalar; bool _mul_by_scalar; }; @@ -80,6 +81,18 @@ class NEGEMMLowpMatrixAReductionKernel : public INEGEMMLowpReductionKernel { return "NEGEMMLowpMatrixAReductionKernel"; } + /** Default constructor */ + NEGEMMLowpMatrixAReductionKernel() = default; + /** Prevent instances of this class from being copied */ + NEGEMMLowpMatrixAReductionKernel(const NEGEMMLowpMatrixAReductionKernel &) = delete; + /** Prevent instances of this class from being copied */ + NEGEMMLowpMatrixAReductionKernel &operator=(const NEGEMMLowpMatrixAReductionKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGEMMLowpMatrixAReductionKernel(NEGEMMLowpMatrixAReductionKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGEMMLowpMatrixAReductionKernel &operator=(NEGEMMLowpMatrixAReductionKernel &&) = default; + /** Default destructor */ + ~NEGEMMLowpMatrixAReductionKernel() = default; /** Initialise the kernel's input and output. * * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL @@ -129,6 +142,18 @@ class NEGEMMLowpMatrixBReductionKernel : public INEGEMMLowpReductionKernel { return "NEGEMMLowpMatrixBReductionKernel"; } + /** Default constructor */ + NEGEMMLowpMatrixBReductionKernel() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMLowpMatrixBReductionKernel(const NEGEMMLowpMatrixBReductionKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMLowpMatrixBReductionKernel &operator=(const NEGEMMLowpMatrixBReductionKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGEMMLowpMatrixBReductionKernel(NEGEMMLowpMatrixBReductionKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGEMMLowpMatrixBReductionKernel &operator=(NEGEMMLowpMatrixBReductionKernel &&) = default; + /** Default destructor */ + ~NEGEMMLowpMatrixBReductionKernel() = default; /** Initialise the kernel's input and output. * * @param[in] mtx_b Input tensor. 
Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp index 2cac93ab93..6a2802a991 100644 --- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,14 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" +#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h similarity index 96% rename from arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h rename to src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h index 79f62561da..48377838d2 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h +++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H #define ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H -#include "arm_compute/core/NEON/INESimpleKernel.h" +#include "src/core/NEON/INESimpleKernel.h" namespace arm_compute { @@ -55,6 +55,8 @@ class NEGEMMMatrixAdditionKernel : public INESimpleKernel NEGEMMMatrixAdditionKernel(NEGEMMMatrixAdditionKernel &&) = default; /** Allow instances of this class to be moved */ NEGEMMMatrixAdditionKernel &operator=(NEGEMMMatrixAdditionKernel &&) = default; + /** Default destructor */ + ~NEGEMMMatrixAdditionKernel() = default; /** Initialise the kernel's input and output. * * @note The input and output tensor must have the same dimensions diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp index 5bec9d321b..fc95c08f62 100644 --- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,43 +21,35 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/ -#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" +#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/helpers/float_ops.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/float_ops.h" #include -#include -#include -#include - -using namespace arm_compute; namespace arm_compute { -class Coordinates; -} // namespace arm_compute - namespace { -template +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC void vector_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info, float alpha) { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC const auto width_matrix_b = static_cast(output->info()->dimension(0)); - const auto in_b_stride = static_cast(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type())); + const auto in_b_stride = static_cast(input1->info()->strides_in_bytes()[1] / input1->info()->element_size()); const auto num_elems_vec_a = static_cast(input0->info()->dimension(0)); // The implementation computes 32 elements per iteration @@ -67,7 +59,7 @@ void vector_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, IT ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, " (window_end_x - window_start_x) must be multiple of window_step_x"); Window win_out(window); - win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); + win_out.set(Window::DimX, Window::Dimension(0, 1, 1)); win_out.set(Window::DimY, Window::Dimension(0, 1, 1)); Window win_a(window); @@ -81,125 +73,174 @@ void vector_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, IT { win_b = window; } - win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); + win_b.set(Window::DimX, Window::Dimension(0, 1, 1)); win_b.set(Window::DimY, Window::Dimension(0, 1, 1)); Iterator ina(input0, win_a); Iterator inb(input1, win_b); Iterator out(output, win_out); + const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); + const float16x8_t alpha_f16 = vdupq_n_f16(alpha); - ARM_COMPUTE_UNUSED(alpha_f16); - execute_window_loop(win_out, [&](const Coordinates & id) + execute_window_loop(win_out, [&](const Coordinates &) { - if(id.x() > width_matrix_b) + int x = window_start_x; + // The vector loop below deliberately stops one full step before window_end_x: + // window_end_x is rounded up above, so a complete step there could write out of bounds; the scalar loop that follows handles the leftover columns.
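A note on the loop structure introduced here, since the same shape recurs in the f32 path below: the window end is rounded up to a multiple of the step, so the vectorised loop intentionally stops one full step early and a scalar loop finishes the remaining columns. A minimal stand-alone sketch of the pattern under that assumption (scale_row and the step of 4 are illustrative placeholders, not part of the patch):

    // Scales 'width' floats by alpha using the same main-loop/tail split:
    // the "vector" loop stops one step before the rounded-up end, and the
    // scalar loop mops up without ever writing past 'width'.
    void scale_row(float *dst, int width, float alpha)
    {
        const int step = 4;                                  // stand-in for the SIMD width
        const int end  = ((width + step - 1) / step) * step; // like ceil_to_multiple(width, step)
        int x = 0;
        for(; x < (end - step); x += step)
        {
            for(int j = 0; j < step; ++j) // a real kernel would issue one SIMD op here
            {
                dst[x + j] *= alpha;
            }
        }
        for(; x < width; ++x) // leftover columns, always in bounds
        {
            dst[x] *= alpha;
        }
    }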
+ for(; x < (window_end_x - window_step_x); x += window_step_x) { - return; - } + if(x > width_matrix_b) + { + return; + } - float16x8_t acc0 = vdupq_n_f16(0.f); - float16x8_t acc1 = vdupq_n_f16(0.f); - float16x8_t acc2 = vdupq_n_f16(0.f); - float16x8_t acc3 = vdupq_n_f16(0.f); + auto matrix_b = reinterpret_cast(inb.ptr()) + x; - auto vec_a = reinterpret_cast(ina.ptr()); - auto matrix_b = reinterpret_cast(inb.ptr()); + float16x8_t acc0 = vdupq_n_f16(0.f); + float16x8_t acc1 = vdupq_n_f16(0.f); + float16x8_t acc2 = vdupq_n_f16(0.f); + float16x8_t acc3 = vdupq_n_f16(0.f); - const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a; - for(; vec_a <= (vec_a_end_addr - 4);) - { - const float16x4_t a0l = vld1_f16(vec_a); - - float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); - float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); - float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); - float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); - float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride); - float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride); - float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride); - float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride); - - acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0)); - acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0)); - acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0)); - acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0)); - acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1)); - acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1)); - acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1)); - acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1)); - - matrix_b += 2 * in_b_stride; - - b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); - b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); - b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); - b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); - b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride); - b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride); - b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride); - b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride); - - acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2)); - acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2)); - acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2)); - acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2)); - acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3)); - acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3)); - acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3)); - acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3)); - - vec_a += 4; - matrix_b += 2 * in_b_stride; - } + auto vec_a = reinterpret_cast(ina.ptr()); + const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a; + for(; vec_a <= (vec_a_end_addr - 4);) + { + const float16x4_t a0l = vld1_f16(vec_a); + + float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); + float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); + float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); + float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); + float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride); + float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride); + float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride); + float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride); + + acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0)); + acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0)); + acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0)); + acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0)); + acc0 = 
vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1)); + acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1)); + acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1)); + acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1)); + + matrix_b += 2 * in_b_stride; + + b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); + b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); + b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); + b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); + b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride); + b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride); + b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride); + b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride); + + acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2)); + acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2)); + acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2)); + acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2)); + acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3)); + acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3)); + acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3)); + acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3)); + + vec_a += 4; + matrix_b += 2 * in_b_stride; + } - for(; vec_a < vec_a_end_addr;) - { - const float16_t a0 = *vec_a; - const float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); - const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); - const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); - const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); - - acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0)); - acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0)); - acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0)); - acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0)); - - vec_a += 1; - matrix_b += in_b_stride; + for(; vec_a < vec_a_end_addr; ++vec_a) + { + const float16_t a0 = *vec_a; + const float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); + const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); + const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); + const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); + + acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0)); + acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0)); + acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0)); + acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0)); + + matrix_b += in_b_stride; + } + + // Multiply by the weight of matrix product (alpha) + if(multiply_alpha) + { + acc0 = vmulq_f16(acc0, alpha_f16); + acc1 = vmulq_f16(acc1, alpha_f16); + acc2 = vmulq_f16(acc2, alpha_f16); + acc3 = vmulq_f16(acc3, alpha_f16); + } + + auto vec_out = reinterpret_cast(out.ptr()) + x; + + vst1q_f16(vec_out + 0, acc0); + vst1q_f16(vec_out + 8, acc1); + vst1q_f16(vec_out + 16, acc2); + vst1q_f16(vec_out + 24, acc3); } - // Multiply by the weight of matrix product (alpha) - if(multiply_alpha) + for(; x < window_end_x; ++x) { - acc0 = vmulq_f16(acc0, alpha_f16); - acc1 = vmulq_f16(acc1, alpha_f16); - acc2 = vmulq_f16(acc2, alpha_f16); - acc3 = vmulq_f16(acc3, alpha_f16); - } + if(x > width_matrix_b) + { + return; + } + + auto matrix_b = reinterpret_cast(inb.ptr()) + x; - const auto vec_out = reinterpret_cast(out.ptr()); + float16x4_t vacc = vdup_n_f16(0.f); + + auto vec_a = reinterpret_cast(ina.ptr()); + const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a; + for(; vec_a <= (vec_a_end_addr - 4); vec_a += 4) + { + const float16x4_t a0l = vld1_f16(vec_a); - vst1q_f16(vec_out + 0, acc0); - vst1q_f16(vec_out + 8, acc1); - vst1q_f16(vec_out + 16, acc2); - vst1q_f16(vec_out + 24, acc3); + const float16x4_t b_col = + { + 
*(matrix_b + 0 * in_b_stride), + *(matrix_b + 1 * in_b_stride), + *(matrix_b + 2 * in_b_stride), + *(matrix_b + 3 * in_b_stride), + }; + + vacc = vadd_f16(vacc, vmul_f16(a0l, b_col)); + + matrix_b += 4 * in_b_stride; + } + + float16_t acc = vget_lane_f16(vacc, 0) + vget_lane_f16(vacc, 1) + vget_lane_f16(vacc, 2) + vget_lane_f16(vacc, 3); + + for(; vec_a < vec_a_end_addr; ++vec_a) + { + const float16_t a0 = *vec_a; + const float16_t b00 = *matrix_b; + + acc += b00 * a0; + + matrix_b += in_b_stride; + } + // Multiply by the weight of matrix product (alpha) + if(multiply_alpha) + { + acc *= static_cast(alpha); + } + + auto vec_out = reinterpret_cast(out.ptr()) + x; + + *(vec_out) = acc; + } }, ina, inb, out); -#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - ARM_COMPUTE_UNUSED(input0); - ARM_COMPUTE_UNUSED(input1); - ARM_COMPUTE_UNUSED(output); - ARM_COMPUTE_UNUSED(window); - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_ERROR("Not implemented"); -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ } +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -template void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info, float alpha) { const auto width_matrix_b = static_cast(output->info()->dimension(0)); @@ -213,7 +254,7 @@ void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, IT const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; Window win_out(window); - win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); + win_out.set(Window::DimX, Window::Dimension(0, 1, 1)); win_out.set(Window::DimY, Window::Dimension(0, 1, 1)); Window win_a(window); @@ -227,137 +268,215 @@ void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, IT { win_b = window; } - win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); + win_b.set(Window::DimX, Window::Dimension(0, 1, 1)); win_b.set(Window::DimY, Window::Dimension(0, 1, 1)); Iterator ina(input0, win_a); Iterator inb(input1, win_b); Iterator out(output, win_out); - execute_window_loop(win_out, [&](const Coordinates & id) + const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); + + const float32x4_t alpha_f32 = vdupq_n_f32(alpha); + + execute_window_loop(win_out, [&](const Coordinates &) { - if(id.x() > width_matrix_b) + int x = window_start_x; + // The vector loop below deliberately stops one full step before window_end_x: + // window_end_x is rounded up above, so a complete step there could write out of bounds; the scalar loop that follows handles the leftover columns.
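The f16 and f32 paths now share the same scalar-column fallback just shown: each leftover output element becomes a dot product in which four B values are gathered from consecutive rows of one column, multiply-accumulated against four A values, and the four partial sums are reduced horizontally before a final scalar loop over the remaining rows. A self-contained f32 sketch of that fallback (dot_column is an illustrative name, not a library call):

    #include <arm_neon.h>

    // Dot product of vec_a (length k) with one column of B, where consecutive
    // column elements are b_stride floats apart, processed four rows at a time.
    float dot_column(const float *vec_a, const float *matrix_b, int k, int b_stride)
    {
        float32x4_t vacc = vdupq_n_f32(0.f);
        int i = 0;
        for(; i <= k - 4; i += 4)
        {
            const float32x4_t a     = vld1q_f32(vec_a + i);
            const float32x4_t b_col = { matrix_b[0 * b_stride], matrix_b[1 * b_stride],
                                        matrix_b[2 * b_stride], matrix_b[3 * b_stride] };
            vacc = vmlaq_f32(vacc, a, b_col); // four running partial sums
            matrix_b += 4 * b_stride;
        }
        // Horizontal reduction of the four partial sums
        float acc = vgetq_lane_f32(vacc, 0) + vgetq_lane_f32(vacc, 1)
                  + vgetq_lane_f32(vacc, 2) + vgetq_lane_f32(vacc, 3);
        for(; i < k; ++i, matrix_b += b_stride) // leftover rows
        {
            acc += vec_a[i] * (*matrix_b);
        }
        return acc;
    }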
+ for(; x < (window_end_x - window_step_x); x += window_step_x) { - return; - } + if(x > width_matrix_b) + { + return; + } - float32x4_t acc0 = vdupq_n_f32(0.f); - float32x4_t acc1 = vdupq_n_f32(0.f); - float32x4_t acc2 = vdupq_n_f32(0.f); - float32x4_t acc3 = vdupq_n_f32(0.f); + float32x4_t acc0 = vdupq_n_f32(0.f); + float32x4_t acc1 = vdupq_n_f32(0.f); + float32x4_t acc2 = vdupq_n_f32(0.f); + float32x4_t acc3 = vdupq_n_f32(0.f); - auto vec_a = reinterpret_cast(ina.ptr()); - auto matrix_b = reinterpret_cast(inb.ptr()); + auto vec_a = reinterpret_cast(ina.ptr()); + auto matrix_b = reinterpret_cast(inb.ptr()) + x; #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b + in_b_stride))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b + in_b_stride))); #endif /* __arm__ */ - auto vec_a_end_addr = vec_a + num_elems_vec_a; - for(; vec_a <= (vec_a_end_addr - 4);) - { - float32x2_t a0l = vld1_f32(vec_a); + auto vec_a_end_addr = vec_a + num_elems_vec_a; + for(; vec_a <= (vec_a_end_addr - 4);) + { + float32x2_t a0l = vld1_f32(vec_a); - float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); - float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); - float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); - float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); + float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); + float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); + float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); + float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); - float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); - float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); - float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); - float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); + float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); + float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); + float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); + float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 1 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 2 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 3 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 4 * in_b_stride))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 1 * in_b_stride))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 2 * in_b_stride))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 3 * in_b_stride))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 4 * in_b_stride))); #endif /* __arm__ */ - acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); - acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); - acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); - acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); + acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); + acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); + acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); + acc3 
= vmlaq_lane_f32(acc3, b03, a0l, 0); - acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); - acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); - acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); - acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); + acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); + acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); + acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); + acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); - vec_a += 2; - matrix_b += 2 * in_b_stride; + vec_a += 2; + matrix_b += 2 * in_b_stride; - a0l = vld1_f32(vec_a); + a0l = vld1_f32(vec_a); - b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); - b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); - b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); - b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); + b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); + b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); + b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); + b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); - b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); - b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); - b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); - b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); + b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); + b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); + b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); + b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); - acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); - acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); - acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); - acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); + acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); + acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); + acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); + acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); - acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); - acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); - acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); - acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); + acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); + acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); + acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); + acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); - vec_a += 2; - matrix_b += 2 * in_b_stride; - } + vec_a += 2; + matrix_b += 2 * in_b_stride; + } - for(; vec_a < vec_a_end_addr;) - { - const float a0 = *vec_a; + for(; vec_a < vec_a_end_addr; ++vec_a) + { + const float a0 = *vec_a; + + const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); + const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); + const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); + const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); - const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); - const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); - const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); - const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); + acc0 = vmlaq_n_f32(acc0, b00, a0); + acc1 = vmlaq_n_f32(acc1, b01, a0); + acc2 = vmlaq_n_f32(acc2, b02, a0); + acc3 = vmlaq_n_f32(acc3, b03, a0); - acc0 = vmlaq_n_f32(acc0, b00, a0); - acc1 = vmlaq_n_f32(acc1, b01, a0); - acc2 = vmlaq_n_f32(acc2, b02, a0); - acc3 = vmlaq_n_f32(acc3, b03, a0); + matrix_b += in_b_stride; + } - vec_a += 1; - matrix_b += in_b_stride; + // Multiply by the weight of matrix product (alpha) + if(multiply_alpha) + { + acc0 = vmulq_f32(acc0, alpha_f32); + acc1 = vmulq_f32(acc1, alpha_f32); + acc2 = vmulq_f32(acc2, alpha_f32); + acc3 = vmulq_f32(acc3, alpha_f32); + } + + const auto vec_out = reinterpret_cast(out.ptr()) + x; + + vst1q_f32(vec_out + 0, acc0); + vst1q_f32(vec_out 
+ 4, acc1); + vst1q_f32(vec_out + 8, acc2); + vst1q_f32(vec_out + 12, acc3); } - // Multiply by the weight of matrix product (alpha) - if(multiply_alpha) + // Left-over loop + for(; x < window_end_x; ++x) { - const float32x4_t alpha_f32 = vdupq_n_f32(alpha); - acc0 = vmulq_f32(acc0, alpha_f32); - acc1 = vmulq_f32(acc1, alpha_f32); - acc2 = vmulq_f32(acc2, alpha_f32); - acc3 = vmulq_f32(acc3, alpha_f32); - } + if(x > width_matrix_b) + { + return; + } + + float32x4_t vacc = vdupq_n_f32(0.f); + + auto vec_a = reinterpret_cast(ina.ptr()); + auto matrix_b = reinterpret_cast(inb.ptr()) + x; + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b + in_b_stride))); +#endif /* __arm__ */ + + auto vec_a_end_addr = vec_a + num_elems_vec_a; + for(; vec_a <= (vec_a_end_addr - 4); vec_a += 4) + { + const float32x4_t a0l = vld1q_f32(vec_a); + + const float32x4_t b_col = + { + *(matrix_b + 0 * in_b_stride), + *(matrix_b + 1 * in_b_stride), + *(matrix_b + 2 * in_b_stride), + *(matrix_b + 3 * in_b_stride), + }; + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 1 * in_b_stride))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 2 * in_b_stride))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 3 * in_b_stride))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 4 * in_b_stride))); +#endif /* __arm__ */ + + vacc = vmlaq_f32(vacc, b_col, a0l); + + matrix_b += 4 * in_b_stride; + } + + float acc = vgetq_lane_f32(vacc, 0) + vgetq_lane_f32(vacc, 1) + vgetq_lane_f32(vacc, 2) + vgetq_lane_f32(vacc, 3); + + for(; vec_a < vec_a_end_addr; ++vec_a) + { + const float a0 = *vec_a; - const auto vec_out = reinterpret_cast(out.ptr()); + const float b00 = *matrix_b; - vst1q_f32(vec_out + 0, acc0); - vst1q_f32(vec_out + 4, acc1); - vst1q_f32(vec_out + 8, acc2); - vst1q_f32(vec_out + 12, acc3); + acc += b00 * a0; + + matrix_b += in_b_stride; + } + + // Multiply by the weight of matrix product (alpha) + if(multiply_alpha) + { + acc *= alpha; + } + + const auto vec_out = reinterpret_cast(out.ptr()) + x; + + *vec_out = acc; + } }, ina, inb, out); } -template void matrix_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha) { + const int out_width = static_cast(output->info()->dimension(0)); + const int out_height = static_cast(output->info()->dimension(1)); const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()); const size_t out_stride1 = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type()); const size_t out_stride2 = out_stride1 * 2; @@ -385,10 +504,14 @@ void matrix_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, IT Iterator inb(input1, win_b); Iterator out(output, window); + const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); + + const float32x4_t alpha_f32 = vdupq_n_f32(alpha); + // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with NEGEMMInterleave4x4 and NEGEMMTranspose1xW // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration // All the values needed for 
computing a single 4x4 block will be read from consecutive memory positions - execute_window_loop(window, [&](const Coordinates &) + execute_window_loop(window, [&](const Coordinates & id) { auto mtx_a0 = reinterpret_cast(ina.ptr()); auto mtx_b0 = reinterpret_cast(inb.ptr()); @@ -630,37 +753,103 @@ void matrix_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, IT // Multiply by the weight of matrix product (alpha) if(multiply_alpha) { - const float32x4_t alpha_f32 = vdupq_n_f32(alpha); - acc00 = vmulq_f32(acc00, alpha_f32); - acc10 = vmulq_f32(acc10, alpha_f32); - acc20 = vmulq_f32(acc20, alpha_f32); - acc30 = vmulq_f32(acc30, alpha_f32); - acc01 = vmulq_f32(acc01, alpha_f32); - acc11 = vmulq_f32(acc11, alpha_f32); - acc21 = vmulq_f32(acc21, alpha_f32); - acc31 = vmulq_f32(acc31, alpha_f32); + acc00 = vmulq_f32(acc00, alpha_f32); + acc10 = vmulq_f32(acc10, alpha_f32); + acc20 = vmulq_f32(acc20, alpha_f32); + acc30 = vmulq_f32(acc30, alpha_f32); + acc01 = vmulq_f32(acc01, alpha_f32); + acc11 = vmulq_f32(acc11, alpha_f32); + acc21 = vmulq_f32(acc21, alpha_f32); + acc31 = vmulq_f32(acc31, alpha_f32); } const auto mtx_out0 = reinterpret_cast(out.ptr()); const auto mtx_out1 = mtx_out0 + 4; - // Store the 4 blocks - vst1q_f32(mtx_out0, acc00); - vst1q_f32(mtx_out1, acc01); - vst1q_f32(mtx_out0 + out_stride1, acc10); - vst1q_f32(mtx_out1 + out_stride1, acc11); - vst1q_f32(mtx_out0 + out_stride2, acc20); - vst1q_f32(mtx_out1 + out_stride2, acc21); - vst1q_f32(mtx_out0 + out_stride3, acc30); - vst1q_f32(mtx_out1 + out_stride3, acc31); + if(id.x() < (out_width - 8)) + { + vst1q_f32(mtx_out0, acc00); + vst1q_f32(mtx_out1, acc01); + if(id.y() + 1 < out_height) + { + vst1q_f32(mtx_out0 + out_stride1, acc10); + vst1q_f32(mtx_out1 + out_stride1, acc11); + if(id.y() + 2 < out_height) + { + vst1q_f32(mtx_out0 + out_stride2, acc20); + vst1q_f32(mtx_out1 + out_stride2, acc21); + if(id.y() + 3 < out_height) + { + vst1q_f32(mtx_out0 + out_stride3, acc30); + vst1q_f32(mtx_out1 + out_stride3, acc31); + } + } + } + } + else if(id.x() < (out_width - 4)) + { + vst1q_f32(mtx_out0, acc00); + if(id.y() + 1 < out_height) + { + vst1q_f32(mtx_out0 + out_stride1, acc10); + if(id.y() + 2 < out_height) + { + vst1q_f32(mtx_out0 + out_stride2, acc20); + if(id.y() + 3 < out_height) + { + vst1q_f32(mtx_out0 + out_stride3, acc30); + } + } + } + // Left-over columns + const int columns_left = out_width - id.x() - 4; + for(auto x = 0; x < columns_left; ++x) + { + *(mtx_out1 + x) = acc01[x]; + if(id.y() + 1 < out_height) + { + *(mtx_out1 + x + out_stride1) = acc11[x]; + if(id.y() + 2 < out_height) + { + *(mtx_out1 + x + out_stride2) = acc21[x]; + if(id.y() + 3 < out_height) + { + *(mtx_out1 + x + out_stride3) = acc31[x]; + } + } + } + } + } + else + { + // Left-over columns + const int columns_left = out_width - id.x(); + for(int x = 0; x < columns_left; ++x) + { + *(mtx_out0 + x) = acc00[x]; + if(id.y() + 1 < out_height) + { + *(mtx_out0 + x + out_stride1) = acc10[x]; + if(id.y() + 2 < out_height) + { + *(mtx_out0 + x + out_stride2) = acc20[x]; + if(id.y() + 3 < out_height) + { + *(mtx_out0 + x + out_stride3) = acc30[x]; + } + } + } + } + } }, ina, inb, out); } -template +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC void matrix_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha) { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + const int out_width = static_cast(output->info()->dimension(0)); + const int out_height = 
static_cast(output->info()->dimension(1)); const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()); const size_t out_stride = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type()); const int num_elems_matrix_b_x = input1->info()->dimension(0); @@ -685,9 +874,11 @@ void matrix_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, IT Iterator inb(input1, win_b); Iterator out(output, window); + const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); + const float16x8_t alpha_f16 = vdupq_n_f16(alpha); - execute_window_loop(window, [&](const Coordinates &) + execute_window_loop(window, [&](const Coordinates & id) { const auto *mtx_a0 = reinterpret_cast(ina.ptr()); const auto *mtx_b0 = reinterpret_cast(inb.ptr()); @@ -790,21 +981,47 @@ void matrix_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, IT c.val[3] = vmulq_f16(c.val[3], alpha_f16); } - vst1q_f16(mtx_out + 0 * out_stride, c.val[0]); - vst1q_f16(mtx_out + 1 * out_stride, c.val[1]); - vst1q_f16(mtx_out + 2 * out_stride, c.val[2]); - vst1q_f16(mtx_out + 3 * out_stride, c.val[3]); + if(id.x() < (out_width - 8)) + { + vst1q_f16(mtx_out, c.val[0]); + if(id.y() + 1 < out_height) + { + vst1q_f16(mtx_out + 1 * out_stride, c.val[1]); + if(id.y() + 2 < out_height) + { + vst1q_f16(mtx_out + 2 * out_stride, c.val[2]); + if(id.y() + 3 < out_height) + { + vst1q_f16(mtx_out + 3 * out_stride, c.val[3]); + } + } + } + } + else + { + // Left-over columns + const int columns_left = out_width - id.x(); + for(int x = 0; x < columns_left; ++x) + { + *(mtx_out + x) = c.val[0][x]; + if(id.y() + 1 < out_height) + { + *(mtx_out + x + 1 * out_stride) = c.val[1][x]; + if(id.y() + 2 < out_height) + { + *(mtx_out + x + 2 * out_stride) = c.val[2][x]; + if(id.y() + 3 < out_height) + { + *(mtx_out + x + 3 * out_stride) = c.val[3][x]; + } + } + } + } + } }, ina, inb, out); -#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - ARM_COMPUTE_UNUSED(input0); - ARM_COMPUTE_UNUSED(input1); - ARM_COMPUTE_UNUSED(output); - ARM_COMPUTE_UNUSED(window); - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_ERROR("Not implemented"); -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ } +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info) { @@ -866,92 +1083,6 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i return Status{}; } - -inline std::pair validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output) -{ - bool window_changed{}; - Window win{}; - - unsigned int num_elems_processed_per_iteration_x = 0; - const unsigned int num_elems_processed_per_iteration_y = 4; - - // Check if the output tensor is a vector. 
If so,the kernel runs the vector-matrix multiplication - if((output->dimension(1) == 1)) - { - switch(input0->data_type()) - { - case DataType::F32: - { - num_elems_processed_per_iteration_x = 16; - break; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - num_elems_processed_per_iteration_x = 32; - break; - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - { - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - } - - // Configure kernel window - win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x)); - - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_x); - - window_changed = update_window_and_padding(win, - AccessWindowStatic(input0, 0, 0, input0->tensor_shape().x(), 1), - AccessWindowHorizontal(input1, 0, num_elems_processed_per_iteration_x), - output_access); - - Coordinates coord; - coord.set_num_dimensions(output->num_dimensions()); - output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape())); - } - else - { - switch(input0->data_type()) - { - case DataType::F32: - { - num_elems_processed_per_iteration_x = 8; - break; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - num_elems_processed_per_iteration_x = 8; - break; - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - { - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } - } - - // Configure kernel window - win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - - window_changed = update_window_and_padding(win, - AccessWindowRectangle(input0, 0, 0, 4, 1, 1.f, 0.25f), - AccessWindowStatic(input1, 0, 0, input1->tensor_shape().x(), ceil_to_multiple(input1->tensor_shape().y(), 4)), - output_access); - - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape())); - } - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} } // namespace NEGEMMMatrixMultiplyKernel::NEGEMMMatrixMultiplyKernel() @@ -979,16 +1110,33 @@ void NEGEMMMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor _alpha = alpha; // Configure kernel window - auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); + Window win{}; + + // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication + if((output->info()->dimension(1) == 1)) + { + const unsigned int num_elems_processed_per_iteration_x = (input0->info()->data_type() == DataType::F32) ?
16 : 32; + + win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x)); + } + else + { + constexpr unsigned int num_elems_processed_per_iteration_x = 8; + constexpr unsigned int num_elems_processed_per_iteration_y = 4; + + win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + } + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + INEKernel::configure(win); } Status NEGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, alpha, is_interleaved, reshape_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first); return Status{}; } @@ -998,57 +1146,29 @@ void NEGEMMMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &inf ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - const bool multiply_alpha = !(helpers::float_ops::is_one(_alpha)); - // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication - if((_output->info()->dimension(1) == 1)) + const bool is_output_vector = (_output->info()->dimension(1) == 1); + switch(_input0->info()->data_type()) { - switch(_input0->info()->data_type()) + case DataType::F32: { - case DataType::F32: - { - multiply_alpha ? vector_matrix_multiply_f32(_input0, _input1, _output, window, info, _alpha) : - vector_matrix_multiply_f32(_input0, _input1, _output, window, info, _alpha); - break; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - multiply_alpha ? vector_matrix_multiply_f16(_input0, _input1, _output, window, info, _alpha) : - vector_matrix_multiply_f16(_input0, _input1, _output, window, info, _alpha); - break; - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - { - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } + is_output_vector ? vector_matrix_multiply_f32(_input0, _input1, _output, window, info, _alpha) : + matrix_matrix_multiply_f32(_input0, _input1, _output, window, _alpha); + break; } - } - else - { - switch(_input0->info()->data_type()) - { - case DataType::F32: - { - multiply_alpha ? matrix_matrix_multiply_f32(_input0, _input1, _output, window, _alpha) : - matrix_matrix_multiply_f32(_input0, _input1, _output, window, _alpha); - break; - } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - multiply_alpha ? matrix_matrix_multiply_f16(_input0, _input1, _output, window, _alpha) : - matrix_matrix_multiply_f16(_input0, _input1, _output, window, _alpha); - break; - } + case DataType::F16: + { + is_output_vector ? 
vector_matrix_multiply_f16(_input0, _input1, _output, window, info, _alpha) : + matrix_matrix_multiply_f16(_input0, _input1, _output, window, _alpha); + break; + } #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - { - ARM_COMPUTE_ERROR("Data type not supported"); - break; - } + default: + { + ARM_COMPUTE_ERROR("Data type not supported"); + break; } } } +} // namespace arm_compute diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h rename to src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h index f79e07ebb4..1ea948de63 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h +++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H #define ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp index 951cb19679..6d9f921b02 100644 --- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp @@ -21,14 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" +#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include @@ -60,28 +62,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) return Status{}; } - -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) -{ - const unsigned int num_elems_processed_per_iteration = 16 / input->element_size(); - - // Configure kernel window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - - // Configure window in case of configured output - if(output->total_size() != 0) - { - AccessWindowStatic output_access(output, 0, 0, output->dimension(0), output->dimension(1)); - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - } - - const bool window_changed = update_window_and_padding(win, input_access); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} } // namespace void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output) @@ -97,16 +77,21 @@ void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output) _input = input; _output = output; + const size_t vector_size = 16 / input->info()->element_size(); + // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); + Window win = calculate_max_window(*input->info(), Steps(vector_size)); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + INEKernel::configure(win); } Status NEGEMMTranspose1xWKernel::validate(const ITensorInfo *input, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first); return Status{}; } @@ -136,52 +121,29 @@ void NEGEMMTranspose1xWKernel::run(const Window &window, const ThreadInfo &info) Iterator in(_input, window); Iterator out(_output, win_out); - switch(_input->info()->element_size()) + const size_t in_width = _input->info()->dimension(0); + const size_t element_size = _input->info()->element_size(); + const size_t out_stride = _output->info()->strides_in_bytes()[1]; + const size_t vector_size = 16 / element_size; + + execute_window_loop(window, [&](const Coordinates & id) { - case 1: - { - const size_t out_stride = _output->info()->strides_in_bytes()[1]; - execute_window_loop(window, [&](const Coordinates & id) - { - // Output address = base addr + (y * 16) + (x / 16 ) * stride - const uint8_t *in_ptr = in.ptr(); - uint8_t *const out_ptr = out.ptr() + (id.y() << 4) + (id.x() >> 4) * out_stride; - vst1q_u8(out_ptr, vld1q_u8(in_ptr)); - }, - in, out); - break; - } - case 2: + const uint8_t *in_ptr = in.ptr(); + uint8_t *const out_ptr = out.ptr() + (id.y() * vector_size) * element_size + (id.x() / vector_size) * out_stride; + + for(size_t k = 0; k < vector_size; ++k) { - const size_t out_stride = _output->info()->strides_in_bytes()[1] / sizeof(int16_t); - execute_window_loop(window, [&](const Coordinates & id) + // If the input width is not a multiple of W, we fill the reference with 0s + if((id.x() + k) >= in_width) { - // Output address = base addr + (y * 8) + (x / 8 ) * stride - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()) + (id.y() << 3) + (id.x() >> 3) * out_stride; - vst1q_u16(out_ptr, vld1q_u16(in_ptr)); - }, - in, out); - break; - } - case 4: - { - const size_t out_stride = _output->info()->strides_in_bytes()[1] / sizeof(float); - execute_window_loop(window, [&](const Coordinates & id) + std::memset(out_ptr + k * element_size, 0, element_size); + } + else { - // Output address = base addr + (y * 4) + (x / 4 ) * stride - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()) + (id.y() << 2) + (id.x() >> 2) * out_stride; - vst1q_u32(out_ptr, vld1q_u32(in_ptr)); - }, - in, out); - break; + std::memcpy(out_ptr + k * element_size, in_ptr + k * element_size, element_size); + } } - default: - { - ARM_COMPUTE_ERROR("Element size not supported"); - break; - } - } + }, + in, out); } } // namespace
arm_compute diff --git a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h similarity index 81% rename from arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h rename to src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h index 756ac6a852..7120943a90 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h +++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H #define ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H -#include "arm_compute/core/NEON/INESimpleKernel.h" +#include "src/core/NEON/INESimpleKernel.h" namespace arm_compute { @@ -73,6 +73,18 @@ class NEGEMMTranspose1xWKernel : public INESimpleKernel { return "NEGEMMTranspose1xWKernel"; } + /** Constructor */ + NEGEMMTranspose1xWKernel() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMTranspose1xWKernel(const NEGEMMTranspose1xWKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMTranspose1xWKernel &operator=(const NEGEMMTranspose1xWKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGEMMTranspose1xWKernel(NEGEMMTranspose1xWKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGEMMTranspose1xWKernel &operator=(NEGEMMTranspose1xWKernel &&) = default; + /** Default destructor */ + ~NEGEMMTranspose1xWKernel() = default; /** Initialise the kernel's input and output. * * @param[in] input Input tensor. Data types supported: All diff --git a/src/core/NEON/kernels/NEGatherKernel.cpp b/src/core/NEON/kernels/NEGatherKernel.cpp index 906e8a053e..55ecb8840f 100644 --- a/src/core/NEON/kernels/NEGatherKernel.cpp +++ b/src/core/NEON/kernels/NEGatherKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,9 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEGatherKernel.h" +#include "src/core/NEON/kernels/NEGatherKernel.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -32,6 +31,9 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" namespace arm_compute { diff --git a/arm_compute/core/NEON/kernels/NEGatherKernel.h b/src/core/NEON/kernels/NEGatherKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEGatherKernel.h rename to src/core/NEON/kernels/NEGatherKernel.h index 31d4f19ed0..d81e34c39c 100644 --- a/arm_compute/core/NEON/kernels/NEGatherKernel.h +++ b/src/core/NEON/kernels/NEGatherKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -25,8 +25,8 @@ #ifndef ARM_COMPUTE_NEGATHERKERNEL_H #define ARM_COMPUTE_NEGATHERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp b/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp index 18dd80e283..63b26ab7c0 100644 --- a/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp +++ b/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,13 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h" +#include "src/core/NEON/kernels/NEGaussian3x3Kernel.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Validate.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include diff --git a/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h b/src/core/NEON/kernels/NEGaussian3x3Kernel.h similarity index 74% rename from arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h rename to src/core/NEON/kernels/NEGaussian3x3Kernel.h index c8141817db..8973b48e7a 100644 --- a/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h +++ b/src/core/NEON/kernels/NEGaussian3x3Kernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H #define ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H -#include "arm_compute/core/NEON/INESimpleKernel.h" +#include "src/core/NEON/INESimpleKernel.h" namespace arm_compute { @@ -38,6 +38,18 @@ class NEGaussian3x3Kernel : public INESimpleKernel { return "NEGaussian3x3Kernel"; } + /** Constructor */ + NEGaussian3x3Kernel() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGaussian3x3Kernel(const NEGaussian3x3Kernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGaussian3x3Kernel &operator=(const NEGaussian3x3Kernel &) = delete; + /** Allow instances of this class to be moved */ + NEGaussian3x3Kernel(NEGaussian3x3Kernel &&) = default; + /** Allow instances of this class to be moved */ + NEGaussian3x3Kernel &operator=(NEGaussian3x3Kernel &&) = default; + /** Default destructor */ + ~NEGaussian3x3Kernel() = default; /** Set the source, destination and border mode of the kernel * * @param[in] input Source tensor. Data type supported: U8 diff --git a/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp b/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp index 99b5d4b093..ab2feb0dc2 100644 --- a/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp +++ b/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,15 +21,17 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h" +#include "src/core/NEON/kernels/NEGaussian5x5Kernel.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include @@ -110,6 +112,10 @@ void NEGaussian5x5HorKernel::run(const Window &window, const ThreadInfo &info) input, output); } +NEGaussian5x5VertKernel::NEGaussian5x5VertKernel() +{ +} + BorderSize NEGaussian5x5VertKernel::border_size() const { return BorderSize{ 2, 0 }; diff --git a/arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h b/src/core/NEON/kernels/NEGaussian5x5Kernel.h similarity index 67% rename from arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h rename to src/core/NEON/kernels/NEGaussian5x5Kernel.h index b489f4b458..f4bca55637 100644 --- a/arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h +++ b/src/core/NEON/kernels/NEGaussian5x5Kernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H #define ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H -#include "arm_compute/core/NEON/INESimpleKernel.h" +#include "src/core/NEON/INESimpleKernel.h" namespace arm_compute { @@ -40,6 +40,16 @@ class NEGaussian5x5HorKernel : public INESimpleKernel } /** Default constructor */ NEGaussian5x5HorKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGaussian5x5HorKernel(NEGaussian5x5HorKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGaussian5x5HorKernel &operator=(NEGaussian5x5HorKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGaussian5x5HorKernel(NEGaussian5x5HorKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGaussian5x5HorKernel &operator=(NEGaussian5x5HorKernel &&) = default; + /** Default destructor */ + ~NEGaussian5x5HorKernel() = default; /** Initialise the kernel's source, destination and border mode. * @@ -65,6 +75,18 @@ class NEGaussian5x5VertKernel : public INESimpleKernel { return "NEGaussian5x5VertKernel"; } + /** Default constructor */ + NEGaussian5x5VertKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGaussian5x5VertKernel(NEGaussian5x5VertKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGaussian5x5VertKernel &operator=(NEGaussian5x5VertKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGaussian5x5VertKernel(NEGaussian5x5VertKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGaussian5x5VertKernel &operator=(NEGaussian5x5VertKernel &&) = default; + /** Default destructor */ + ~NEGaussian5x5VertKernel() = default; /** Initialise the kernel's source, destination and border mode. * * @param[in] input Source tensor. Data type supported: S16. 
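The header half of these renames keeps adding the same special-member block, and it is easier to read once than five times: the kernels hold raw tensor pointers, so the patch deletes copying, keeps moving defaulted, and spells out the destructor now that the headers live under src/. A minimal sketch of that pattern (ExampleKernel and its members are placeholders, not library types):

    // The copy/move pattern the patch adds to each kernel class.
    class ExampleKernel
    {
    public:
        ExampleKernel() = default;
        // Copying is deleted: the kernel stores raw tensor pointers, and two
        // live copies driving the same tensors would be error-prone.
        ExampleKernel(const ExampleKernel &) = delete;
        ExampleKernel &operator=(const ExampleKernel &) = delete;
        // Moving just transfers the pointers.
        ExampleKernel(ExampleKernel &&) = default;
        ExampleKernel &operator=(ExampleKernel &&) = default;
        ~ExampleKernel() = default;

    private:
        const void *_input{ nullptr };
        void       *_output{ nullptr };
    };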
diff --git a/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp index 83d2877836..49c8e9ec3e 100644 --- a/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp +++ b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,17 +21,19 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h" +#include "src/core/NEON/kernels/NEGaussianPyramidKernel.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h b/src/core/NEON/kernels/NEGaussianPyramidKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h rename to src/core/NEON/kernels/NEGaussianPyramidKernel.h index 33a4452382..e852db2699 100644 --- a/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h +++ b/src/core/NEON/kernels/NEGaussianPyramidKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H #define ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H -#include "arm_compute/core/NEON/INESimpleKernel.h" +#include "src/core/NEON/INESimpleKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp index c3b105919b..516a9b68c2 100644 --- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp +++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,14 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h" +#include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include diff --git a/arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h rename to src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h index 7b82488c44..f6d39e50a7 100644 --- a/arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h +++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. 
+ * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEGENERATEPROPOSALSLAYERKERNEL_H #define ARM_COMPUTE_NEGENERATEPROPOSALSLAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { class ITensor; diff --git a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp index 84bb59ef0e..089cd34e0c 100644 --- a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp +++ b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,13 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h" +#include "src/core/NEON/kernels/NEHOGDescriptorKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/HOGInfo.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h b/src/core/NEON/kernels/NEHOGDescriptorKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h rename to src/core/NEON/kernels/NEHOGDescriptorKernel.h index b0206ec091..7845bc2cdf 100644 --- a/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h +++ b/src/core/NEON/kernels/NEHOGDescriptorKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,8 +25,8 @@ #define ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H #include "arm_compute/core/IHOG.h" -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Size2D.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp index eb0d45000a..cba1d5538a 100644 --- a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp +++ b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,13 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
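The pair of includes added here — src/core/helpers/AutoConfiguration.h and src/core/helpers/WindowHelpers.h — recurs in almost every kernel below: window computation and output auto-initialisation helpers now live in internal headers rather than the public arm_compute/core/Helpers.h. A hedged sketch of the central helper, assuming the in-tree src/ headers are on the include path:

#include "arm_compute/core/TensorInfo.h"
#include "src/core/helpers/WindowHelpers.h"

int main()
{
    using namespace arm_compute;

    // calculate_max_window derives a kernel's iteration space from a tensor's
    // shape; Steps() with no arguments means one element per step.
    TensorInfo info(TensorShape(32U, 16U), 1, DataType::U8);
    Window     win = calculate_max_window(info, Steps());
    (void)win; // a kernel would hand this to ICPPKernel::configure()
    return 0;
}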
*/ -#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h" +#include "src/core/NEON/kernels/NEHOGDetectorKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/HOGInfo.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include diff --git a/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h b/src/core/NEON/kernels/NEHOGDetectorKernel.h similarity index 99% rename from arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h rename to src/core/NEON/kernels/NEHOGDetectorKernel.h index 2c23a2b11d..45c28099c8 100644 --- a/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h +++ b/src/core/NEON/kernels/NEHOGDetectorKernel.h @@ -26,7 +26,7 @@ #include "arm_compute/core/IArray.h" #include "arm_compute/core/IHOG.h" -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include "support/Mutex.h" namespace arm_compute diff --git a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp index 340c694a7c..4159e434b2 100644 --- a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp +++ b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h" +#include "src/core/NEON/kernels/NEHarrisCornersKernel.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Error.h" @@ -31,6 +31,8 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h b/src/core/NEON/kernels/NEHarrisCornersKernel.h similarity index 99% rename from arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h rename to src/core/NEON/kernels/NEHarrisCornersKernel.h index 084dd7deba..4b794107a2 100644 --- a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h +++ b/src/core/NEON/kernels/NEHarrisCornersKernel.h @@ -27,7 +27,7 @@ #include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h" #include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h" #include "arm_compute/core/IArray.h" -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include diff --git a/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp index 8a671bfa23..227013a014 100644 --- a/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp +++ b/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp @@ -21,18 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h" +#include "src/core/NEON/kernels/NEHeightConcatenateLayerKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include diff --git a/arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h b/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h rename to src/core/NEON/kernels/NEHeightConcatenateLayerKernel.h index 8a5e86acc4..9d100ebff1 100644 --- a/arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h +++ b/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.h @@ -25,8 +25,8 @@ #ifndef ARM_COMPUTE_NEHEIGHTCONCATENATELAYERKERNEL_H #define ARM_COMPUTE_NEHEIGHTCONCATENATELAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEHistogramKernel.cpp b/src/core/NEON/kernels/NEHistogramKernel.cpp index 0f8397f117..eddc3b29ab 100644 --- a/src/core/NEON/kernels/NEHistogramKernel.cpp +++ b/src/core/NEON/kernels/NEHistogramKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEHistogramKernel.h" +#include "src/core/NEON/kernels/NEHistogramKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -30,6 +30,8 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Window.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEHistogramKernel.h b/src/core/NEON/kernels/NEHistogramKernel.h similarity index 99% rename from arm_compute/core/NEON/kernels/NEHistogramKernel.h rename to src/core/NEON/kernels/NEHistogramKernel.h index 6e5b92273b..e14519ce25 100644 --- a/arm_compute/core/NEON/kernels/NEHistogramKernel.h +++ b/src/core/NEON/kernels/NEHistogramKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEHISTOGRAMKERNEL_H #define ARM_COMPUTE_NEHISTOGRAMKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include "support/Mutex.h" #include @@ -46,8 +46,6 @@ class NEHistogramKernel : public INEKernel } /** Default constructor */ NEHistogramKernel(); - /** Default destructor */ - ~NEHistogramKernel() = default; /** Prevent instances of this class from being copied (As this class contains pointers) */ NEHistogramKernel(const NEHistogramKernel &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -56,6 +54,8 @@ class NEHistogramKernel : public INEKernel NEHistogramKernel(NEHistogramKernel &&) = delete; /** Prevent instances of this class from being moved (As this class contains non movable objects) */ NEHistogramKernel &operator=(NEHistogramKernel &&) = delete; + /** Default destructor */ + ~NEHistogramKernel() = default; /** Set the input image and the distribution output. * diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp index 1a2b95e8d6..93bfcc501a 100644 --- a/src/core/NEON/kernels/NEIm2ColKernel.cpp +++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp @@ -21,9 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
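NEHistogramKernel above keeps its move operations deleted ("contains non movable objects"): its header pulls in support/Mutex.h, and a mutex member cannot be moved. An illustrative sketch of why such members force that choice — the class here is invented for the example:

#include <mutex>

class HistogramLike
{
public:
    HistogramLike() = default;
    // std::mutex is neither copyable nor movable, so the moves cannot be
    // defaulted meaningfully; deleting them documents the constraint.
    HistogramLike(HistogramLike &&) = delete;
    HistogramLike &operator=(HistogramLike &&) = delete;

private:
    std::mutex _hist_mtx; // guards concurrent updates to shared bins
};

int main()
{
    HistogramLike h; // constructed and used in place
    // HistogramLike h2(std::move(h)); // would not compile
    return 0;
}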
*/ -#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h" +#include "src/core/NEON/kernels/NEIm2ColKernel.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" @@ -31,6 +30,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" @@ -161,7 +163,7 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr, if((y < 0 || y >= input_h) && has_pads) { // All the values will be the offset (will be zeros when not quantized) - memset(out_ptr, pad_value, kernel_width * sizeof(T)); + memset(static_cast<void *>(out_ptr), pad_value, kernel_width * sizeof(T)); out_ptr += kernel_width; } else @@ -224,7 +226,7 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr, { if(y < 0 || y >= input_h) { - memset(out_ptr, pad_value, pad_quant * element_size); + memset(static_cast<void *>(out_ptr), pad_value, pad_quant * element_size); out_ptr += pad_quant; } else if(dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != input_c * element_size) @@ -233,7 +235,7 @@ { if(x < 0 || x >= input_w) { - memset(out_ptr, pad_value, input_c * element_size); + memset(static_cast<void *>(out_ptr), pad_value, input_c * element_size); out_ptr += input_c; } else diff --git a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h b/src/core/NEON/kernels/NEIm2ColKernel.h similarity index 99% rename from arm_compute/core/NEON/kernels/NEIm2ColKernel.h rename to src/core/NEON/kernels/NEIm2ColKernel.h index 95825ade18..6c1c631d82 100644 --- a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h +++ b/src/core/NEON/kernels/NEIm2ColKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEIM2COLKERNEL_H #define ARM_COMPUTE_NEIM2COLKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp index f650d97c45..08bf6f0e76 100644 --- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp @@ -21,19 +21,21 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
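On the memset changes above: casting through void * does not alter behaviour; it makes the raw byte fill explicit at a call site where the destination pointer is typed, which also keeps compilers such as GCC from warning when the pointee is a non-trivial type (-Wclass-memaccess). A self-contained illustration:

#include <cstring>

struct Pixel
{
    float x{ 0.f }; // default member initialisers make the type non-trivial
    float y{ 0.f };
};

int main()
{
    Pixel row[8];
    // Passing 'row' directly would draw -Wclass-memaccess on GCC; routing the
    // pointer through void * states the intent and silences the warning.
    std::memset(static_cast<void *>(row), 0, sizeof(row));
    return 0;
}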
*/ -#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h" +#include "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/NEON/NEMath.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include diff --git a/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h similarity index 99% rename from arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h rename to src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h index a5bd453ac7..96c0119719 100644 --- a/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h +++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H #define ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEIntegralImageKernel.cpp b/src/core/NEON/kernels/NEIntegralImageKernel.cpp index 58ee3b4bea..6ee97eea30 100644 --- a/src/core/NEON/kernels/NEIntegralImageKernel.cpp +++ b/src/core/NEON/kernels/NEIntegralImageKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,13 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h" +#include "src/core/NEON/kernels/NEIntegralImageKernel.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEIntegralImageKernel.h b/src/core/NEON/kernels/NEIntegralImageKernel.h similarity index 69% rename from arm_compute/core/NEON/kernels/NEIntegralImageKernel.h rename to src/core/NEON/kernels/NEIntegralImageKernel.h index 57f24befdb..8d92504317 100644 --- a/arm_compute/core/NEON/kernels/NEIntegralImageKernel.h +++ b/src/core/NEON/kernels/NEIntegralImageKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H #define ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H -#include "arm_compute/core/NEON/INESimpleKernel.h" +#include "src/core/NEON/INESimpleKernel.h" namespace arm_compute { @@ -38,6 +38,18 @@ class NEIntegralImageKernel : public INESimpleKernel { return "NEIntegralImageKernel"; } + /** Default constructor */ + NEIntegralImageKernel() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEIntegralImageKernel(const NEIntegralImageKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEIntegralImageKernel &operator=(const NEIntegralImageKernel &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEIntegralImageKernel(NEIntegralImageKernel &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEIntegralImageKernel &operator=(NEIntegralImageKernel &&) = delete; + /** Default destructor */ + ~NEIntegralImageKernel() = default; /** Set the source, destination and border mode of the kernel * * @param[in] input Source tensor. Data type supported: U8 diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp index dbcfda2184..dae5b57fec 100644 --- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp +++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp @@ -21,18 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h" +#include "src/core/NEON/kernels/NEL2NormalizeLayerKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEMath.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "src/core/NEON/wrapper/wrapper.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h rename to src/core/NEON/kernels/NEL2NormalizeLayerKernel.h index 302d04e9f3..af3ad3403e 100644 --- a/arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h +++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEL2NORMALIZELAYERKERNEL_H #define ARM_COMPUTE_NEL2NORMALIZELAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NELKTrackerKernel.cpp b/src/core/NEON/kernels/NELKTrackerKernel.cpp index 533c241b9b..442f001102 100644 --- a/src/core/NEON/kernels/NELKTrackerKernel.cpp +++ b/src/core/NEON/kernels/NELKTrackerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -21,9 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h" +#include "src/core/NEON/kernels/NELKTrackerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -31,6 +30,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NELKTrackerKernel.h b/src/core/NEON/kernels/NELKTrackerKernel.h similarity index 94% rename from arm_compute/core/NEON/kernels/NELKTrackerKernel.h rename to src/core/NEON/kernels/NELKTrackerKernel.h index 90e5f41f8a..c24166c042 100644 --- a/arm_compute/core/NEON/kernels/NELKTrackerKernel.h +++ b/src/core/NEON/kernels/NELKTrackerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,8 +25,8 @@ #define ARM_COMPUTE_LKTRACKERKERNEL_H #include "arm_compute/core/IArray.h" -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" #include #include @@ -37,14 +37,6 @@ namespace arm_compute { class ITensor; -/** Internal keypoint class for Lucas-Kanade Optical Flow */ -struct NELKInternalKeypoint -{ - float x{ 0.f }; /**< x coordinate of the keypoint */ - float y{ 0.f }; /**< y coordinate of the keypoint */ - bool tracking_status{ false }; /**< the tracking status of the keypoint */ -}; - /** Interface for NEON Array of Internal Key Points. */ using INELKInternalKeypointArray = IArray<NELKInternalKeypoint>; diff --git a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp index dd2824b1b6..f11694dee4 100644 --- a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp +++ b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,19 +21,21 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/ -#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h" +#include "src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h similarity index 95% rename from arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h rename to src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h index ba14598135..72093b4bb7 100644 --- a/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h +++ b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H #define ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -48,6 +48,8 @@ class NELocallyConnectedMatrixMultiplyKernel : public INEKernel NELocallyConnectedMatrixMultiplyKernel(NELocallyConnectedMatrixMultiplyKernel &&) = default; /** Allow instances of this class to be moved */ NELocallyConnectedMatrixMultiplyKernel &operator=(NELocallyConnectedMatrixMultiplyKernel &&) = default; + /** Default destructor */ + ~NELocallyConnectedMatrixMultiplyKernel() = default; /** Initialise the kernel's input and output * * @param[in] input0 First input tensor. Data types supported: F16, F32 diff --git a/src/core/NEON/kernels/NELogicalKernel.cpp b/src/core/NEON/kernels/NELogicalKernel.cpp new file mode 100644 index 0000000000..27605e15c6 --- /dev/null +++ b/src/core/NEON/kernels/NELogicalKernel.cpp @@ -0,0 +1,345 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/NEON/kernels/NELogicalKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" +#include "src/core/common/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace kernels +{ +namespace +{ +static const uint8x8_t c0_x8 = vdup_n_u8(0); +static const uint8x16_t c0_x16 = vdupq_n_u8(0); +static const uint8x8_t c1_x8 = vdup_n_u8(1); +static const uint8x16_t c1_x16 = vdupq_n_u8(1); +static const int step = 16; +static const int half_step = step / 2; + +void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, int len) +{ + ARM_COMPUTE_ASSERT_NOT_NULLPTR(src0); + ARM_COMPUTE_ASSERT_NOT_NULLPTR(src1); + ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); + ARM_COMPUTE_ASSERT(len >= 0); + + for(; len >= step; len -= step) + { + vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16))); + src0 += step; + src1 += step; + dst += step; + } + + for(; len >= half_step; len -= half_step) + { + vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8))); + src0 += half_step; + src1 += half_step; + dst += half_step; + } + + for(; len > 0; --len) + { + *dst = (*src0) && (*src1); + ++src0; + ++src1; + ++dst; + } +} + +void neon_logical_and_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8_t *dst, int len) +{ + ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); + ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); + ARM_COMPUTE_ASSERT(len >= 0); + + const auto broadcast_val_clamped_s = std::min<uint8_t>(broadcast_val, 1); + const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s); + const auto broadcast_val_clamped_x8 = vdup_n_u8(broadcast_val_clamped_s); + + for(; len >= step; len -= step) + { + vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16)); + src += step; + dst += step; + } + + for(; len >= half_step; len -= half_step) + { + vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8)); + src += half_step; + dst += half_step; + } + + for(; len > 0; --len) + { + *dst = (*src) && broadcast_val_clamped_s; + ++src; + ++dst; + } +} + +void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, int len) +{ + ARM_COMPUTE_ASSERT_NOT_NULLPTR(src0); + ARM_COMPUTE_ASSERT_NOT_NULLPTR(src1); + ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); + ARM_COMPUTE_ASSERT(len >= 0); + + for(; len >= step; len -= step) + { + vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16))); + src0 += step; + src1 += step; + dst += step; + } + + for(; len >= half_step; len -= half_step) + { + vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8))); + src0 += half_step; + src1 += half_step; + dst += half_step; + } + + for(; len > 0; --len) + { + *dst = (*src0) || (*src1); + ++src0; + ++src1; + ++dst; + } +} + +void neon_logical_or_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8_t *dst, int len) +{ + ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); + ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); + ARM_COMPUTE_ASSERT(len >= 0); + + const auto broadcast_val_clamped_s = std::min<uint8_t>(broadcast_val, 1); + const auto broadcast_val_clamped_x16 =
vdupq_n_u8(broadcast_val_clamped_s); + const auto broadcast_val_clamped_x8 = vdup_n_u8(broadcast_val_clamped_s); + + for(; len >= step; len -= step) + { + vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16)); + src += step; + dst += step; + } + + for(; len >= half_step; len -= half_step) + { + vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8)); + src += half_step; + dst += half_step; + } + + for(; len > 0; --len) + { + *dst = (*src) || broadcast_val_clamped_s; + ++src; + ++dst; + } +} + +void neon_logical_not(const uint8_t *src, uint8_t *dst, int len) +{ + ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); + ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); + ARM_COMPUTE_ASSERT(len >= 0); + + for(; len >= step; len -= step) + { + vst1q_u8(dst, vbslq_u8(vceqq_u8(vld1q_u8(src), c0_x16), c1_x16, c0_x16)); + src += step; + dst += step; + } + + for(; len >= half_step; len -= half_step) + { + vst1_u8(dst, vbsl_u8(vceq_u8(vld1_u8(src), c0_x8), c1_x8, c0_x8)); + src += half_step; + dst += half_step; + } + + for(; len > 0; --len) + { + *dst = !(*src); + ++src; + ++dst; + } +} + +void run_unary(const Window &window, const ITensor *src, ITensor *dst) +{ + Window win{ window }; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + const auto len = static_cast<int>(window.x().end()) - static_cast<int>(window.x().start()); + + Iterator in(src, win); + Iterator out(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + neon_logical_not(in.ptr(), out.ptr(), len); + }, + in, out); +} + +void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, ITensor *dst, LogicalOperation op) +{ + Window src0_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window src1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + Window win{ window }; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + const auto len = static_cast<int>(window.x().end()) - static_cast<int>(window.x().start()); + + if(is_broadcast_across_x) + { + using LogicalBroadcastUKernelPtr = std::add_pointer<void(const uint8_t *, uint8_t, uint8_t *, int)>::type; + LogicalBroadcastUKernelPtr logical_func = op == LogicalOperation::Or ? &neon_logical_or_broadcast : &neon_logical_and_broadcast; + + const bool is_broadcast_input_1 = src1_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_1 ? src1_win : src0_win; + Window non_broadcast_win = !is_broadcast_input_1 ? src1_win : src0_win; + const ITensor *broadcast_tensor = is_broadcast_input_1 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_1 ? src1 : src0; + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_in(broadcast_tensor, broadcast_win); + Iterator non_broadcast_in(non_broadcast_tensor, non_broadcast_win); + Iterator out(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const uint8_t broadcast_value = *broadcast_in.ptr(); + logical_func(non_broadcast_in.ptr(), broadcast_value, out.ptr(), len); + + }, + broadcast_in, non_broadcast_in, out); + } + else + { + using LogicalUKernelPtr = std::add_pointer<void(const uint8_t *, const uint8_t *, uint8_t *, int)>::type; + LogicalUKernelPtr logical_func = op == LogicalOperation::Or ?
&neon_logical_or : &neon_logical_and; + + src0_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + src1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in0(src0, src0_win); + Iterator in1(src1, src1_win); + Iterator out(dst, win); + execute_window_loop(win, [&](const Coordinates &) + { + logical_func(in0.ptr(), in1.ptr(), out.ptr(), len); + }, + in0, in1, out); + } +} +} // namespace +const char *NELogicalKernel::name() const +{ + return "NELogicalKernel"; +} + +void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, LogicalOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input1, output); + ARM_COMPUTE_ERROR_THROW_ON(validate(input1, input2, output, op)); + + _op = op; + + Window win = calculate_max_window(*input1, Steps()); + TensorShape out_shape = input1->tensor_shape(); + if(op != LogicalOperation::Not) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(input2); + const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2); + out_shape = broadcast_pair.first; + win = calculate_max_window(broadcast_pair.second, Steps()); + } + ICPPKernel::configure(win); + + // Auto initialize if empty + set_shape_if_empty(*output, out_shape); + set_data_type_if_unknown(*output, input1->data_type()); +} + +Status NELogicalKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON(op == LogicalOperation::Unknown); + + TensorShape out_shape = input1->tensor_shape(); + if(op != LogicalOperation::Not) + { + out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); + } + + // Checks performed when output is configured + if((output != nullptr) && (output->total_size() != 0)) + { + ARM_COMPUTE_RETURN_ERROR_ON(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); + } + + return Status{}; +} + +void NELogicalKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(tensors.empty()); + + const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + + if(_op == LogicalOperation::Not) + { + run_unary(window, src0, dst); + } + else + { + run_binary(window, src0, src1, dst, _op); + } +} +} // namespace kernels +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NELogicalKernel.h b/src/core/NEON/kernels/NELogicalKernel.h new file mode 100644 index 0000000000..caf69cf45d --- /dev/null +++ b/src/core/NEON/kernels/NELogicalKernel.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2020 Arm Limited.
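Two implementation details of NELogicalKernel.cpp above are worth spelling out. First, the vmin-against-1 in neon_logical_and/or normalises the inputs: any non-zero byte becomes exactly 1, after which the bitwise vand/vorr compute the logical result per lane. A scalar restatement:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Scalar equivalent of the NEON path: clamp each byte to {0, 1} first, then
// bitwise & and | coincide with logical && and ||.
static uint8_t logical_and_byte(uint8_t a, uint8_t b)
{
    return std::min<uint8_t>(a, 1) & std::min<uint8_t>(b, 1);
}

int main()
{
    assert(logical_and_byte(0x7F, 0x02) == 1); // both "truthy"
    assert(logical_and_byte(0x00, 0xFF) == 0); // one operand false
    return 0;
}

Second, run_binary resolves the micro-kernel once, before the window loop, through a function pointer spelled with std::add_pointer; the loop then makes one indirect call per row instead of re-branching on the operation. The same idiom in isolation, with stand-in function names:

#include <cstdint>
#include <type_traits>

static void op_and(const uint8_t *a, const uint8_t *b, uint8_t *out, int len)
{
    for(int i = 0; i < len; ++i) { out[i] = a[i] && b[i]; }
}
static void op_or(const uint8_t *a, const uint8_t *b, uint8_t *out, int len)
{
    for(int i = 0; i < len; ++i) { out[i] = a[i] || b[i]; }
}

int main()
{
    using UKernelPtr = std::add_pointer<void(const uint8_t *, const uint8_t *, uint8_t *, int)>::type;

    const bool want_or = true;
    UKernelPtr ukernel = want_or ? &op_or : &op_and; // branch resolved once

    uint8_t a[4]   = { 1, 0, 2, 0 };
    uint8_t b[4]   = { 1, 1, 0, 0 };
    uint8_t out[4] = {};
    ukernel(a, b, out, 4);
    return 0;
}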
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_NELOGICALKERNEL_H +#define ARM_COMPUTE_NELOGICALKERNEL_H + +#include "src/core/KernelTypes.h" +#include "src/core/NEON/INEKernel.h" + +namespace arm_compute +{ +namespace kernels +{ +/** Interface for the kernel to perform logical operations between two tensors + * + * Supported logical operations: + * - AND + * - OR + * - NOT + */ +class NELogicalKernel : public INEKernel +{ +public: + /** Initialise the kernel's inputs and output + * + * @param[in] input1 An input tensor. Data type supported: U8. + * @param[in] input2 An input tensor. Data type supported: U8 + * @param[out] output Output tensor. Data type supported: U8. + * @param[in] op Logical operation to perform + */ + void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, LogicalOperation op); + /** Static function to check if given info will lead to a valid configuration of @ref NELogicalKernel + * + * @param[in] input1 An input tensor. Data type supported: U8. + * @param[in] input2 An input tensor. Data type supported: U8 + * @param[in] output Output tensor. Data type supported: U8. + * @param[in] op Logical operation to perform + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + LogicalOperation _op{}; +}; +} // namespace kernels +} // namespace arm_compute +#endif /* ARM_COMPUTE_NELOGICALKERNEL_H */ diff --git a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp index a0c1dbc668..205f67823d 100644 --- a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp +++ b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,13 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
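For orientation, a hedged sketch of how this kernel can be driven end to end. The runtime NELogical* functions are the supported entry point; this bypasses the scheduler and runs the whole window on one thread purely for illustration, and assumes the in-tree src/ headers are reachable:

#include "arm_compute/core/CPP/CPPTypes.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/core/NEON/kernels/NELogicalKernel.h"

int main()
{
    using namespace arm_compute;

    Tensor in0, in1, out;
    in0.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::U8));
    in1.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::U8));
    out.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::U8));

    kernels::NELogicalKernel kernel;
    kernel.configure(in0.info(), in1.info(), out.info(), LogicalOperation::And);

    in0.allocator()->allocate();
    in1.allocator()->allocate();
    out.allocator()->allocate();

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, &in0);
    pack.add_const_tensor(TensorType::ACL_SRC_1, &in1);
    pack.add_tensor(TensorType::ACL_DST, &out);

    ThreadInfo info;                            // single-threaded run
    kernel.run_op(pack, kernel.window(), info); // a scheduler would normally split this
    return 0;
}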
*/ -#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h" +#include "src/core/NEON/kernels/NEMagnitudePhaseKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h b/src/core/NEON/kernels/NEMagnitudePhaseKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h rename to src/core/NEON/kernels/NEMagnitudePhaseKernel.h index ea42a38994..3803d05ce9 100644 --- a/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h +++ b/src/core/NEON/kernels/NEMagnitudePhaseKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H #define ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -42,8 +42,6 @@ class NEMagnitudePhaseKernel : public INEKernel } /** Default constructor */ NEMagnitudePhaseKernel(); - /** Destructor */ - ~NEMagnitudePhaseKernel() = default; /** Prevent instances of this class from being copied (As this class contains pointers) */ NEMagnitudePhaseKernel(const NEMagnitudePhaseKernel &) = delete; /** Default move constructor */ @@ -52,6 +50,8 @@ class NEMagnitudePhaseKernel : public INEKernel NEMagnitudePhaseKernel &operator=(const NEMagnitudePhaseKernel &) = delete; /** Default move assignment operator */ NEMagnitudePhaseKernel &operator=(NEMagnitudePhaseKernel &&) = default; + /** Destructor */ + ~NEMagnitudePhaseKernel() = default; /** Initialise the kernel's input, output. * diff --git a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp index 821bf53817..761fa15238 100644 --- a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp +++ b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp @@ -21,13 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h" +#include "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/ToolchainSupport.h" diff --git a/arm_compute/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h similarity index 99% rename from arm_compute/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h rename to src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h index f3ea049a87..8cdfe2b953 100644 --- a/arm_compute/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h +++ b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEMAXUNPOOLINGLAYERKERNEL_H #define ARM_COMPUTE_NEMAXUNPOOLINGLAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEMeanStdDevKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevKernel.cpp index 914a21c0a0..a6bb9f2ef7 100644 --- a/src/core/NEON/kernels/NEMeanStdDevKernel.cpp +++ b/src/core/NEON/kernels/NEMeanStdDevKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h" +#include "src/core/NEON/kernels/NEMeanStdDevKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -29,6 +29,8 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h b/src/core/NEON/kernels/NEMeanStdDevKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h rename to src/core/NEON/kernels/NEMeanStdDevKernel.h index eef0e2b586..e694f3824d 100644 --- a/arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h +++ b/src/core/NEON/kernels/NEMeanStdDevKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEMEANSTDDEVKERNEL_H #define ARM_COMPUTE_NEMEANSTDDEVKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include "support/Mutex.h" #include diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp index 3fa44804f5..6a41e3a161 100644 --- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp +++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,16 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h" +#include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEMath.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Window.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" namespace arm_compute { diff --git a/arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h rename to src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h index 66b907541e..59d073ada5 100644 --- a/arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h +++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEMEANSTDDEVNORMALIZATIONKERNEL_H #define ARM_COMPUTE_NEMEANSTDDEVNORMALIZATIONKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #include diff --git a/src/core/NEON/kernels/NEMedian3x3Kernel.cpp b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp index 72225a4f43..0160edc650 100644 --- a/src/core/NEON/kernels/NEMedian3x3Kernel.cpp +++ b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,14 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h" +#include "src/core/NEON/kernels/NEMedian3x3Kernel.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Validate.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h b/src/core/NEON/kernels/NEMedian3x3Kernel.h similarity index 73% rename from arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h rename to src/core/NEON/kernels/NEMedian3x3Kernel.h index f2871e2ab5..b9e28b3053 100644 --- a/arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h +++ b/src/core/NEON/kernels/NEMedian3x3Kernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEMEDIAN3x3KERNEL_H #define ARM_COMPUTE_NEMEDIAN3x3KERNEL_H -#include "arm_compute/core/NEON/INESimpleKernel.h" +#include "src/core/NEON/INESimpleKernel.h" namespace arm_compute { @@ -38,6 +38,18 @@ class NEMedian3x3Kernel : public INESimpleKernel { return "NEMedian3x3Kernel"; } + /** Default constructor */ + NEMedian3x3Kernel() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEMedian3x3Kernel(const NEMedian3x3Kernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEMedian3x3Kernel &operator=(const NEMedian3x3Kernel &) = delete; + /** Allow instances of this class to be moved */ + NEMedian3x3Kernel(NEMedian3x3Kernel &&) = default; + /** Allow instances of this class to be moved */ + NEMedian3x3Kernel &operator=(NEMedian3x3Kernel &&) = default; + /** Default destructor */ + ~NEMedian3x3Kernel() = default; /** Set the source, destination and border mode of the kernel * * @param[in] input Source tensor. Data type supported: U8 diff --git a/src/core/NEON/kernels/NEMemsetKernel.cpp b/src/core/NEON/kernels/NEMemsetKernel.cpp index 3870fa57f0..a8dfda3775 100644 --- a/src/core/NEON/kernels/NEMemsetKernel.cpp +++ b/src/core/NEON/kernels/NEMemsetKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,14 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h" +#include "src/core/NEON/kernels/NEMemsetKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" namespace arm_compute { diff --git a/arm_compute/core/NEON/kernels/NEMemsetKernel.h b/src/core/NEON/kernels/NEMemsetKernel.h similarity index 96% rename from arm_compute/core/NEON/kernels/NEMemsetKernel.h rename to src/core/NEON/kernels/NEMemsetKernel.h index f9a1914360..a720e60251 100644 --- a/arm_compute/core/NEON/kernels/NEMemsetKernel.h +++ b/src/core/NEON/kernels/NEMemsetKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,9 +24,9 @@ #ifndef ARM_COMPUTE_NEMEMSETKERNEL_H #define ARM_COMPUTE_NEMEMSETKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -43,8 +43,6 @@ class NEMemsetKernel : public INEKernel } /** Default constructor */ NEMemsetKernel(); - /** Default destructor */ - ~NEMemsetKernel() = default; /** Prevent instances of this class from being copied (As this class contains pointers) */ NEMemsetKernel(const NEMemsetKernel &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -53,6 +51,8 @@ class NEMemsetKernel : public INEKernel NEMemsetKernel(NEMemsetKernel &&) = default; /** Allow instances of this class to be moved */ NEMemsetKernel &operator=(NEMemsetKernel &&) = default; + /** Default destructor */ + ~NEMemsetKernel() = default; /** Initialise the kernel's tensor and filling value * * @param[in,out] tensor Input tensor to fill. Supported data types: All diff --git a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp index b1c2b1c376..92f6b4a42e 100644 --- a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp +++ b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEMinMaxLayerKernel.h" +#include "src/core/NEON/kernels/NEMinMaxLayerKernel.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Error.h" @@ -33,6 +33,8 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEMinMaxLayerKernel.h b/src/core/NEON/kernels/NEMinMaxLayerKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEMinMaxLayerKernel.h rename to src/core/NEON/kernels/NEMinMaxLayerKernel.h index e7e87e9339..b4852ad9f2 100644 --- a/arm_compute/core/NEON/kernels/NEMinMaxLayerKernel.h +++ b/src/core/NEON/kernels/NEMinMaxLayerKernel.h @@ -25,7 +25,7 @@ #ifndef ARM_COMPUTE_NEMINMAXLAYERKERNEL_H #define ARM_COMPUTE_NEMINMAXLAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include "support/Mutex.h" #include diff --git a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp index e956f9a8d0..402e6f1811 100644 --- a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp +++ b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h" +#include "src/core/NEON/kernels/NEMinMaxLocationKernel.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Error.h" @@ -33,6 +33,8 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/Utility.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h b/src/core/NEON/kernels/NEMinMaxLocationKernel.h similarity index 99% rename from arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h rename to src/core/NEON/kernels/NEMinMaxLocationKernel.h index 83f5afce72..a24666096f 100644 --- a/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h +++ b/src/core/NEON/kernels/NEMinMaxLocationKernel.h @@ -25,7 +25,7 @@ #define ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H #include "arm_compute/core/IArray.h" -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include "support/Mutex.h" #include diff --git a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp index f20e869272..58c0acd404 100644 --- a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp +++ b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h" +#include "src/core/NEON/kernels/NENonLinearFilterKernel.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Error.h" @@ -29,6 +29,8 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h b/src/core/NEON/kernels/NENonLinearFilterKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h rename to src/core/NEON/kernels/NENonLinearFilterKernel.h index 5fc225c910..3cef12e8ec 100644 --- a/arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h +++ b/src/core/NEON/kernels/NENonLinearFilterKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_NENONLINEARFILTERKERNEL_H #define ARM_COMPUTE_NENONLINEARFILTERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" #include @@ -51,6 +51,8 @@ class NENonLinearFilterKernel : public INEKernel NENonLinearFilterKernel(NENonLinearFilterKernel &&) = default; /** Allow instances of this class to be moved */ NENonLinearFilterKernel &operator=(NENonLinearFilterKernel &&) = default; + /** Default destructor */ + ~NENonLinearFilterKernel() = default; /** Set the source, destination and border mode of the kernel * * @param[in] input Source tensor. 
Data type supported: U8 diff --git a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp index 3e4c6e29d3..9f5dfcdcdb 100644 --- a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp +++ b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h" +#include "src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -30,6 +30,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h rename to src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h index bf5c520978..d32dfecfeb 100644 --- a/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h +++ b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H #define ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp index 6cd0780777..27464d5b42 100644 --- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp @@ -21,19 +21,22 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h" +#include "src/core/NEON/kernels/NENormalizationLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/NEMath.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/NormalizationHelpers.h" +#include "src/core/helpers/WindowHelpers.h" namespace arm_compute { diff --git a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h b/src/core/NEON/kernels/NENormalizationLayerKernel.h similarity index 99% rename from arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h rename to src/core/NEON/kernels/NENormalizationLayerKernel.h index 665b10244d..53a06b9ed9 100644 --- a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h +++ b/src/core/NEON/kernels/NENormalizationLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H #define ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEPadLayerKernel.cpp b/src/core/NEON/kernels/NEPadLayerKernel.cpp index d840bb74ff..200fe2ce54 100644 --- a/src/core/NEON/kernels/NEPadLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPadLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,16 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEPadLayerKernel.h" +#include "src/core/NEON/kernels/NEPadLayerKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" namespace arm_compute { diff --git a/arm_compute/core/NEON/kernels/NEPadLayerKernel.h b/src/core/NEON/kernels/NEPadLayerKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEPadLayerKernel.h rename to src/core/NEON/kernels/NEPadLayerKernel.h index 80daabb349..ec4bdffdcd 100644 --- a/arm_compute/core/NEON/kernels/NEPadLayerKernel.h +++ b/src/core/NEON/kernels/NEPadLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT *
@@ -24,7 +24,7 @@
 #ifndef ARM_COMPUTE_NEPADLAYERKERNEL_H
 #define ARM_COMPUTE_NEPADLAYERKERNEL_H

-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"

 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NEPermuteKernel.cpp b/src/core/NEON/kernels/NEPermuteKernel.cpp
index 737b10b03c..6a9f5d36ef 100644
--- a/src/core/NEON/kernels/NEPermuteKernel.cpp
+++ b/src/core/NEON/kernels/NEPermuteKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -21,7 +21,7 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#include "arm_compute/core/NEON/kernels/NEPermuteKernel.h"
+#include "src/core/NEON/kernels/NEPermuteKernel.h"

 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -30,21 +30,27 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"

 namespace
 {
-#include "arm_compute/core/NEON/kernels/convolution/common/shims.hpp"
+#include "src/core/NEON/kernels/convolution/common/shims.hpp"
 } // namespace

-#include
-#include
-
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 inline bool is_permutation_supported(const PermutationVector &v)
 {
+    static const std::array<PermutationVector, 2> permutations2 =
+    {
+        {
+            PermutationVector(0U, 1U),
+            PermutationVector(1U, 0U),
+        }
+    };
     static const std::array permutations3 =
     {
         {
@@ -86,7 +92,8 @@ inline bool is_permutation_supported(const PermutationVector &v)
         }
     };

-    return (permutations3.end() != std::find(permutations3.begin(), permutations3.end(), v)) || (permutations4.end() != std::find(permutations4.begin(), permutations4.end(), v));
+    return (permutations2.end() != std::find(permutations2.begin(), permutations2.end(), v)) || (permutations3.end() != std::find(permutations3.begin(), permutations3.end(), v))
+           || (permutations4.end() != std::find(permutations4.begin(), permutations4.end(), v));
 }

 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
@@ -116,7 +123,7 @@ void NEPermuteKernel::run_permute(const Window &window)
     // Input window
     Window window_in = window;

-    // we only support these two configs in arm_compute/core/NEON/kernels/convolution/common/shims.hpp, for all others
+    // we only support these two configs in src/core/NEON/kernels/convolution/common/shims.hpp, for all others
     // we have to fall back to C++
     if((input_layout == DataLayout::NCHW && _perm == PermutationVector{ 2U, 0U, 1U }) || (input_layout == DataLayout::NHWC && _perm == PermutationVector{ 1U, 2U, 0U }))
     {
@@ -129,7 +136,7 @@ void NEPermuteKernel::run_permute(const Window &window)
     // Output window
     Window window_out(window);
     const Window::Dimension zero_window = Window::Dimension(0, 0, 0);
-    for(size_t d = 0; d <= _perm.num_dimensions(); ++d)
+    for(size_t d = 0; d <= _output->info()->num_dimensions(); ++d)
     {
         window_out.set(d, zero_window);
     }
@@ -292,3 +299,4 @@ void NEPermuteKernel::run(const Window &window, const ThreadInfo &info)
     (this->*_func)(window);
 }
 }
+} // namespace arm_compute
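The hunk above adds a permutations2 table to is_permutation_supported(), so plain 2-D identity and transpose permutations now pass the check and reach the kernel's optimized path. A self-contained sketch of the same membership test follows; the helper name supports_2d_permutation is hypothetical, and the comparison works because PermutationVector derives from Dimensions<uint32_t>, which provides operator== (this is what makes the std::find calls above well-formed).

#include <algorithm>
#include <array>

#include "arm_compute/core/Types.h"

using arm_compute::PermutationVector;

// Mirrors the table-lookup style of is_permutation_supported():
// a permutation is supported iff it appears in a fixed table.
bool supports_2d_permutation(const PermutationVector &v)
{
    static const std::array<PermutationVector, 2> table =
    {
        {
            PermutationVector(0U, 1U), // identity
            PermutationVector(1U, 0U), // 2-D transpose
        }
    };
    return std::find(table.begin(), table.end(), v) != table.end();
}

diff --git a/arm_compute/core/NEON/kernels/NEPermuteKernel.h b/src/core/NEON/kernels/NEPermuteKernel.h
similarity index 97%
rename from arm_compute/core/NEON/kernels/NEPermuteKernel.h
rename to src/core/NEON/kernels/NEPermuteKernel.h
index 2f8af9373d..80187de9eb 100644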
--- a/arm_compute/core/NEON/kernels/NEPermuteKernel.h
+++ b/src/core/NEON/kernels/NEPermuteKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,7 +24,7 @@
 #ifndef ARM_COMPUTE_NEPERMUTEKERNEL_H
 #define ARM_COMPUTE_NEPERMUTEKERNEL_H

-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"

 namespace arm_compute
 {
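Next, NEPixelWiseMultiplicationKernel gains an S32 path, and its validate_arguments() collapses the old per-output-type error messages into a single whitelist of (input1, input2, output) combinations. Boiled down to a plain predicate, the whitelist reads as below; is_valid_combination is a hypothetical helper written for illustration, not part of the patch.

#include "arm_compute/core/Types.h"

using arm_compute::DataType;

// Equivalent of the "Invalid data type combination" check introduced below:
// all three data types equal, or one of the explicitly allowed mixed cases.
bool is_valid_combination(DataType in1, DataType in2, DataType out)
{
    const bool all_same    = (in1 == in2) && (in2 == out);
    const bool mixed_s16   = (in1 == DataType::U8 && in2 == DataType::U8 && out == DataType::S16)
                             || (in1 == DataType::U8 && in2 == DataType::S16 && out == DataType::S16)
                             || (in1 == DataType::S16 && in2 == DataType::U8 && out == DataType::S16);
    const bool qsymm16_s32 = (in1 == DataType::QSYMM16 && in2 == DataType::QSYMM16 && out == DataType::S32);
    return all_same || mixed_s16 || qsymm16_s32;
}

diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
index 907a7f197b..39517f6ff6 100644
--- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
@@ -21,13 +21,15 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
+#include "src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"

-#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NESymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NESymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"

 #include

@@ -49,8 +51,10 @@ inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *i
     ARM_COMPUTE_UNUSED(rounding_policy);

     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input1);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
+                                                         DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
+                                                         DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, DataType::S32, DataType::F16,
                                                          DataType::F32);

@@ -65,23 +69,24 @@ inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *i
         const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
-
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
-                                        "Output can only be U8 if both inputs are U8");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QASYMM8 && (input1->data_type() != DataType::QASYMM8 || input2->data_type() != DataType::QASYMM8),
-                                        "Output can only be QASYMM8 if both inputs are QASYMM8");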
QASYMM8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QASYMM8_SIGNED && (input1->data_type() != DataType::QASYMM8_SIGNED || input2->data_type() != DataType::QASYMM8_SIGNED), - "Output can only be QASYMM8_SIGNED if both inputs are QASYMM8_SIGNED"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QSYMM16 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16), - "Output can only be QSYMM16 if both inputs are QSYMM16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::S32 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16), - "Output can only be S32 if both inputs are QSYMM16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 output"); + // clang-format off + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(input1->data_type() == input2->data_type() && input2->data_type() == output->data_type()) && + !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16) && + !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::S16 && output->data_type() == DataType::S16) && + !(input1->data_type() == DataType::S16 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16) && + !(input1->data_type() == DataType::S16 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16) && + !(input1->data_type() == DataType::QSYMM16 && input2->data_type() == DataType::QSYMM16 && output->data_type() == DataType::S32) + , "Invalid data type combination"); + // clang-format on + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->data_type() == DataType::S16 && output->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 output"); } if(std::abs(scale - scale255_constant) < 0.00001f) { ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->data_type() == DataType::S32 && input2->data_type() == DataType::S32 && output->data_type() == DataType::S32, + "Scale == 1/255 is not supported if input and output are of data type S32"); } else { @@ -151,7 +156,7 @@ void mul_saturate_quantized_8(const ITensor *in1, const ITensor *in2, ITensor *o const int window_step_x = 16 / sizeof(T); const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform(); const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset }; @@ -710,6 +715,213 @@ void mul_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const input1, input2, output); } +template +inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &input1, const int32x4_t &input2, int n) +{ + const int32x2_t input1_1 = vget_low_s32(input1); + const int32x2_t input2_1 = vget_low_s32(input2); + const int32x2_t input1_2 = vget_high_s32(input1); + const int32x2_t input2_2 = vget_high_s32(input2); + + int64x2_t tmp_1 = vmull_s32(input1_1, input2_1); + int64x2_t tmp_2 = vmull_s32(input1_2, input2_2); + + // Apply scaling, conversion 
+    // Right shift amount
+    const int64x2_t vn = vdupq_n_s64(-n);
+    // Left shift amount
+    const int64x2_t vnl = vdupq_n_s64(n);
+    // Calculate conversion bit
+    const uint64x2_t tmp_1_u   = vreinterpretq_u64_s64(tmp_1);
+    const uint64x2_t sign_1    = vshrq_n_u64(tmp_1_u, 63);
+    const int64x2_t  sign_1_s  = vreinterpretq_s64_u64(sign_1);
+    const int64x2_t  convert_1 = vsubq_s64(vshlq_s64(sign_1_s, vnl), sign_1_s);
+
+    const uint64x2_t tmp_2_u   = vreinterpretq_u64_s64(tmp_2);
+    const uint64x2_t sign_2    = vshrq_n_u64(tmp_2_u, 63);
+    const int64x2_t  sign_2_s  = vreinterpretq_s64_u64(sign_2);
+    const int64x2_t  convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s);
+    if(is_sat)
+    {
+        tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
+        tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
+        return vcombine_s32(vqmovn_s64(tmp_1), vqmovn_s64(tmp_2));
+    }
+    else
+    {
+        tmp_1 = vshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
+        tmp_2 = vshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
+        return vcombine_s32(vmovn_s64(tmp_1), vmovn_s64(tmp_2));
+    }
+}
+
+template <bool is_sat>
+inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &input1, const int32x4x2_t &input2, int n)
+{
+    const int32x4x2_t result =
+    {
+        {
+            // First 4 elements
+            mul_S32_S32_S32_n_loop<is_sat>(input1.val[0], input2.val[0], n),
+            // Second 4 elements
+            mul_S32_S32_S32_n_loop<is_sat>(input1.val[1], input2.val[1], n)
+        }
+    };
+
+    return result;
+}
+
+template <bool is_sat>
+void mul_S32_S32_S32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
+{
+    // Create input windows
+    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+    // Clear X Dimension on execution window as we handle manually
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    const int  window_step_x         = 8;
+    const auto window_start_x        = static_cast<int>(window.x().start());
+    const auto window_end_x          = static_cast<int>(window.x().end());
+    const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
+
+    if(is_broadcast_across_x)
+    {
+        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
+        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
+        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
+        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
+        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+
+        // Clear X Dimension on execution window as we handle manually
+        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        Iterator broadcast_input(broadcast_tensor, broadcast_win);
+        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+        Iterator output(out, win);
+
+        execute_window_loop(win, [&](const Coordinates &)
+        {
+            const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr());
+            const auto output_ptr              = reinterpret_cast<int32_t *>(output.ptr());
+
+            const int32_t broadcast_value     = *reinterpret_cast<const int32_t *>(broadcast_input.ptr());
+            const auto    broadcast_value_vec = vdupq_n_s32(broadcast_value);
+
+            // Compute window_step_x elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const int32x4x2_t broadcast_v =
+                {
+                    {
+                        broadcast_value_vec,
+                        broadcast_value_vec,
+                    }
+                };
+                const int32x4x2_t non_broadcast_v =
+                {
+                    {
+                        vld1q_s32(non_broadcast_input_ptr + x),
+                        vld1q_s32(non_broadcast_input_ptr + x + 4),
+                    }
+                };
+                const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n);
+
+                vst1q_s32(output_ptr + x, result.val[0]);
+                vst1q_s32(output_ptr + x + 4, result.val[1]);
+            }
+
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                int64_t tmp = static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x));
+
+                if(tmp >= 0)
+                {
+                    tmp >>= n;
+                }
+                else
+                {
+                    uint64_t mask = (1u << n) - 1;
+                    tmp           = (tmp + static_cast<int64_t>(mask)) >> n;
+                }
+                if(is_sat)
+                {
+                    tmp = utility::clamp<int64_t, int32_t>(tmp);
+                }
+                *(output_ptr + x) = static_cast<int32_t>(tmp);
+            }
+        },
+        broadcast_input, non_broadcast_input, output);
+    }
+    else
+    {
+        // Clear X Dimension on execution window as we handle manually
+        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        Iterator input1(in1, input1_win);
+        Iterator input2(in2, input2_win);
+        Iterator output(out, win);
+
+        execute_window_loop(win, [&](const Coordinates &)
+        {
+            const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
+            const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
+            const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
+
+            // Compute window_step_x elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const int32x4x2_t ta1 =
+                {
+                    {
+                        vld1q_s32(input1_ptr + x),
+                        vld1q_s32(input1_ptr + x + 4),
+                    }
+                };
+                const int32x4x2_t ta2 =
+                {
+                    {
+                        vld1q_s32(input2_ptr + x),
+                        vld1q_s32(input2_ptr + x + 4),
+                    }
+                };
+                const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n);
+
+                vst1q_s32(output_ptr + x, result.val[0]);
+                vst1q_s32(output_ptr + x + 4, result.val[1]);
+            }
+
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x));
+
+                if(tmp >= 0)
+                {
+                    tmp >>= n;
+                }
+                else
+                {
+                    uint64_t mask = (1u << n) - 1;
+                    tmp           = (tmp + static_cast<int64_t>(mask)) >> n;
+                }
+                if(is_sat)
+                {
+                    tmp = utility::clamp<int64_t, int32_t>(tmp);
+                }
+                *(output_ptr + x) = static_cast<int32_t>(tmp);
+            }
+        },
+        input1, input2, output);
+    }
+}
+
 void mul_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
 {
     // Create input windows
@@ -723,7 +935,7 @@ void mul_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const
     constexpr int window_step_x  = 16 / sizeof(float);
     const auto    window_start_x = static_cast<int>(window.x().start());
     const auto    window_end_x   = static_cast<int>(window.x().end());
-    const bool
is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); using ExactTagType = typename wrapper::traits::neon_vector::tag_type; @@ -808,33 +1020,137 @@ void mul_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const } } -void c_mul_F32_F32_F32_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr) +void c_mul_F32_F32_F32_n(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { - const auto input1 = static_cast(input1_ptr); - const auto input2 = static_cast(input2_ptr); - const auto output = static_cast(output_ptr); + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - const float32x4_t a = wrapper::vloadq(input1); - float32x4_t b = wrapper::vloadq(input2); + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 8 / sizeof(float); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f }; - const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); - const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); - const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); - const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
in2 : in1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + const float broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x); + float32x4_t b = vdupq_n_f32(broadcast_value); + + const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f }; + const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); + const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); + const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); + const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); + + const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); + const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); + + float32x4_t res = wrapper::vmul(tmp0, b); + b = wrapper::vmul(b, mask); + + res = wrapper::vmla(res, tmp1, b); + wrapper::vstore(output_ptr + 2 * x, res); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x); + const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1); + auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1); + auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0); + *(output_ptr + 2 * x) = res1; + *(output_ptr + 2 * x + 1) = res2; + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); - const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); - const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - float32x4_t res = wrapper::vmul(tmp0, b); + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x); + float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x); + + const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f }; + const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); + const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); + const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); + const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); + + const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); + const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); + + float32x4_t res = 
wrapper::vmul(tmp0, b); - b = wrapper::vrev64(b); - b = wrapper::vmul(b, mask); + b = wrapper::vrev64(b); + b = wrapper::vmul(b, mask); - res = wrapper::vmla(res, tmp1, b); - wrapper::vstore(output, res); + res = wrapper::vmla(res, tmp1, b); + wrapper::vstore(output_ptr + 2 * x, res); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto a0 = *(input1_ptr + 2 * x); + const auto a1 = *(input1_ptr + 2 * x + 1); + const auto b0 = *(input2_ptr + 2 * x); + const auto b1 = *(input2_ptr + 2 * x + 1); + auto res1 = a0 * b0 - a1 * b1; + auto res2 = a0 * b1 + a1 * b0; + *(output_ptr + 2 * x) = res1; + *(output_ptr + 2 * x + 1) = res2; + } + }, + input1, input2, output); + } } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC @@ -1200,6 +1516,12 @@ void NEPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo } } break; + case DataType::S32: + if(DataType::S32 == dt_input2 && DataType::S32 == dt_output) + { + _func_int = is_sat ? &mul_S32_S32_S32 : &mul_S32_S32_S32; + } + break; case DataType::U8: if(DataType::U8 == dt_input2 && DataType::U8 == dt_output) { @@ -1291,8 +1613,6 @@ void NEPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window } namespace { -constexpr unsigned int num_elems_processed_per_iteration_complex = 2; - Status validate_arguments_complex(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 2, DataType::F32); @@ -1311,9 +1631,13 @@ Status validate_arguments_complex(const ITensorInfo *input1, const ITensorInfo * return Status{}; } +} // namespace -std::pair validate_and_configure_window_complex(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output) +void NEComplexPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output) { + ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1, input2, output)); + const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2); const TensorShape &out_shape = broadcast_pair.first; const ValidRegion &valid_region = broadcast_pair.second; @@ -1322,43 +1646,19 @@ std::pair validate_and_configure_window_complex(ITensorInfo *inp const TensorInfo out_info(out_shape, input1->num_channels(), input1->data_type()); auto_init_if_empty(*output, out_info); - Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration_complex)); - Window win_input1 = win.broadcast_if_dimension_le_one(*input1); - Window win_input2 = win.broadcast_if_dimension_le_one(*input2); - - AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration_complex); - AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration_complex); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_complex); - - bool window_changed = update_window_and_padding(win_input1, input1_access) - || update_window_and_padding(win_input2, input2_access) - || update_window_and_padding(win, output_access); - - output_access.set_valid_region(win, valid_region); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-} // namespace
-
-void NEComplexPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1, input2, output));
-    // Configure kernel window
-    auto win_config = validate_and_configure_window_complex(input1, input2, output);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    Coordinates coord;
+    coord.set_num_dimensions(output->num_dimensions());
+    output->set_valid_region(valid_region);
+    Window win = calculate_max_window(valid_region, Steps());

-    // Create kernel
-    INEKernel::configure(win_config.second);
+    INEKernel::configure(win);
 }

 Status NEComplexPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(input1, input2, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_complex(input1->clone().get(), input2->clone().get(), output->clone().get()).first);

     return Status{};
 }
@@ -1373,14 +1673,6 @@
     auto input2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
     auto output = tensors.get_tensor(TensorType::ACL_DST);

-    Iterator input1_it(input1, window.broadcast_if_dimension_le_one(input1->info()->tensor_shape()));
-    Iterator input2_it(input2, window.broadcast_if_dimension_le_one(input2->info()->tensor_shape()));
-    Iterator output_it(output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        c_mul_F32_F32_F32_n(input1_it.ptr(), input2_it.ptr(), output_it.ptr());
-    },
-    input1_it, input2_it, output_it);
+    c_mul_F32_F32_F32_n(input1, input2, output, window);
 }
 } // namespace arm_compute
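With this change c_mul_F32_F32_F32_n owns its whole window loop (including X-axis broadcast handling), and each pair of floats is treated as one complex number. The vector code reaches the result through lane duplication, a { -1, 1, -1, 1 } sign mask and vrev64, but arithmetically it is plain complex multiplication; a scalar equivalent for reference (sketch only, helper name c_mul_ref is hypothetical):

#include <complex>
#include <cstddef>

// Scalar reference for the complex pixel-wise multiplication:
// out = a * b, each element stored as interleaved (real, imag) floats.
void c_mul_ref(const float *a, const float *b, float *out, std::size_t num_complex)
{
    for(std::size_t i = 0; i < num_complex; ++i)
    {
        const std::complex<float> x(a[2 * i], a[2 * i + 1]);
        const std::complex<float> y(b[2 * i], b[2 * i + 1]);
        const std::complex<float> r = x * y; // (x.re*y.re - x.im*y.im, x.re*y.im + x.im*y.re)
        out[2 * i]     = r.real();
        out[2 * i + 1] = r.imag();
    }
}

diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
similarity index 75%
rename from arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
rename to src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
index c530d78c42..d414168b2d 100644
--- a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
@@ -24,8 +24,8 @@
 #ifndef ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H
 #define ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H

-#include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"

 namespace arm_compute
 {
@@ -55,55 +55,56 @@ class NEPixelWiseMultiplicationKernel : public INEKernel
     *
     * Valid configurations (Input1,Input2) -> Output :
     *
-    * - (U8,U8)                         -> U8
-    * - (U8,U8)                         -> S16
-    * - (U8,S16)                        -> S16
-    * - (S16,U8)                        -> S16
-    * - (S16,S16)                       -> S16
-    * - (F16,F16)                       -> F16
-    * - (F32,F32)                       -> F32
-    * - (QASYMM8,QASYMM8)               -> QASYMM8
-    * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
-    * - (QSYMM16,QSYMM16)               -> QSYMM16
-    * - (QSYMM16,QSYMM16)               -> S32
+    *                                             Support: Broadcast? Scale=1/255?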
+ * - (U8,U8) -> U8, S16 N Y + * - (U8,S16) -> S16 N Y + * - (S16,U8) -> S16 N Y + * - (S16,S16) -> S16 N Y + * - (S32,S32) -> S32 Y N + * - (F16,F16) -> F16 N Y + * - (F32,F32) -> F32 Y Y + * - (QASYMM8,QASYMM8) -> QASYMM8 Y Y + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED Y Y + * - (QSYMM16,QSYMM16) -> QSYMM16, S32 N Y * * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. * - * @param[in] input1 First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32 - * @param[in] input2 Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32 - * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32 + * @param[in] input1 First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 + * @param[in] input2 Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 * @param[in] scale Scale to apply after multiplication. * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. - * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16. + * If both @p input1, @p input2 and @p output are of datatype S32, scale cannot be 1/255 + * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype * @param[in] rounding_policy Rounding policy. */ void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplicationKernel * * Valid configurations (Input1,Input2) -> Output : - * - * - (U8,U8) -> U8 - * - (U8,U8) -> S16 - * - (U8,S16) -> S16 - * - (S16,U8) -> S16 - * - (S16,S16) -> S16 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - (QASYMM8,QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (QSYMM16,QSYMM16) -> QSYMM16 - * - (QSYMM16,QSYMM16) -> S32 + * Support: Broadcast? Scale=1/255? + * - (U8,U8) -> U8, S16 N Y + * - (U8,S16) -> S16 N Y + * - (S16,U8) -> S16 N Y + * - (S16,S16) -> S16 N Y + * - (S32,S32) -> S32 Y N + * - (F16,F16) -> F16 N Y + * - (F32,F32) -> F32 Y Y + * - (QASYMM8,QASYMM8) -> QASYMM8 Y Y + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED Y Y + * - (QSYMM16,QSYMM16) -> QSYMM16, S32 N Y * * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. * - * @param[in] input1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32 - * @param[in] input2 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32 - * @param[in] output Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32 + * @param[in] input1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 + * @param[in] input2 Second input tensor info. 
Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+    * @param[in]  output          Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
     * @param[in]  scale           Scale to apply after multiplication.
     *                             Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
-    * @param[in]  overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
+    *                             If both @p input1, @p input2 and @p output are of datatype S32, scale cannot be 1/255
+    * @param[in]  overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
+    * @param[in]  rounding_policy Rounding policy.
     *
     * @return a status
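Everything below rewrites the NHWC pooling paths to drop per-kernel padding: instead of a fixed num_elems_processed_per_iteration window, each NHWC routine now walks the channel axis with a vectorized main loop and finishes the remainder in a scalar left-overs loop. A minimal, self-contained sketch of that pattern for float data (the function name scale_array is hypothetical); the same main-loop/left-overs split recurs in every NHWC routine in this file, which is what removes the old "Insufficient Padding!" configuration step:

#include <arm_neon.h>

// Vectorized main loop plus scalar tail, the pattern used by the NHWC
// pooling rewrite below.
void scale_array(const float *in, float *out, int len, float scale)
{
    const int         step    = 4; // float32x4_t holds 4 lanes
    const float32x4_t scale_v = vdupq_n_f32(scale);

    int x = 0;
    for(; x <= len - step; x += step)
    {
        vst1q_f32(out + x, vmulq_f32(vld1q_f32(in + x), scale_v));
    }
    // Left-overs loop: the last (len % 4) elements, handled scalar
    for(; x < len; ++x)
    {
        out[x] = in[x] * scale;
    }
}

diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index efd0affee9..b46843badd 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -21,24 +21,26 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
+#include "src/core/NEON/kernels/NEPoolingLayerKernel.h"

-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/NEON/NEMath.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "support/ToolchainSupport.h"

-#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 #include
 #include
 #include
@@ -53,6 +55,20 @@ using namespace misc::shape_calculator;

 namespace
 {
+template <typename T>
+inline typename std::enable_if<std::is_same<T, int8_t>::value, int8_t>::type
+quantize(float val, const UniformQuantizationInfo &info)
+{
+    return quantize_qasymm8_signed(val, info);
+}
+
+template <typename T>
+inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8_t>::type
+quantize(float val, const UniformQuantizationInfo &info)
+{
+    return quantize_qasymm8(val, info);
+}
+
 inline float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
                                  const int pad_x, const int pad_y, const int stride_x, const int stride_y)
 {
@@ -215,19 +231,12 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
     num_elems_processed_per_iteration = 1;
     num_elems_horizontal_window       = 1;

-    const bool is_nhwc = data_layout == DataLayout::NHWC;
-
     if(is_square)
     {
         switch(input->data_type())
         {
             case DataType::QASYMM8:
             case DataType::QASYMM8_SIGNED:
-                if(is_nhwc)
-                {
-                    num_elems_processed_per_iteration = 16;
-                    break;
-                }
                 switch(pool_size_x)
                 {
                     case 2:
@@ -246,11 +255,6 @@
                 break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
-                if(is_nhwc)
-                {
-                    num_elems_processed_per_iteration =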
8; - break; - } switch(pool_size_x) { case 2: @@ -265,11 +269,6 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::F32: - if(is_nhwc) - { - num_elems_processed_per_iteration = 4; - break; - } switch(pool_size_x) { case 2: @@ -292,13 +291,6 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen break; } } - else - { - if(is_nhwc) - { - num_elems_processed_per_iteration = 16 / input->element_size(); - } - } bool window_changed = false; Window win{}; @@ -330,26 +322,6 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); } - else - { - TensorShape output_shape{ input->tensor_shape() }; - output_shape.set(1, pooled_w); - output_shape.set(2, pooled_h); - TensorInfo output_info(input->clone()->set_tensor_shape(output_shape)); - win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - if(indices) - { - AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration); - window_changed = update_window_and_padding(win, input_access, output_access, indices_access); - } - else - { - window_changed = update_window_and_padding(win, input_access, output_access); - } - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - } Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); @@ -522,207 +494,141 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons if(data_type == DataType::QASYMM8) { - if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square) + if(!is_nchw) { - if(is_nchw) + _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc; + } + else + { + if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square) { _func = &NEPoolingLayerKernel::pooling2_q8_nchw; } - else - { - _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc; - } - } - else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square) - { - if(is_nchw) + else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square) { _func = &NEPoolingLayerKernel::pooling3_q8_nchw; } else - { - _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc; - } - } - else - { - if(is_nchw) { _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw; } - else - { - _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc; - } } } else if(data_type == DataType::QASYMM8_SIGNED) { - if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square) + if(!is_nchw) { - if(is_nchw) + _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc; + } + else + { + if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square) { _func = &NEPoolingLayerKernel::pooling2_q8_nchw; } - else - { - _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc; - } - } - else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square) - { - if(is_nchw) + else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square) { _func = &NEPoolingLayerKernel::pooling3_q8_nchw; } else - { - _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc; - } - } - else - { - if(is_nchw) { _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw; } - else - { - _func = &NEPoolingLayerKernel::poolingMxN_q8_nhwc; - } } } else if(data_type == DataType::F16) { - if(_is_square) + if(!is_nchw) { - 
switch(pool_size.x()) + _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc; + } + else + { + if(_is_square) { - case 2: + switch(pool_size.x()) { - if(is_nchw) + case 2: { _func = &NEPoolingLayerKernel::pooling2_f16_nchw; } - else - { - _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc; - } - } - break; - case 3: - { - if(is_nchw) + break; + case 3: { _func = &NEPoolingLayerKernel::pooling3_f16_nchw; } - else - { - _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc; - } - } - break; - default: - { - if(is_nchw) + break; + default: { _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw; + break; } - else - { - _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc; - } - break; } - break; - } - } - else - { - if(is_nchw) - { - _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw; } else { - _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc; + _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw; } } } else if(data_type == DataType::F32) { - if(_is_square) + if(!is_nchw) { - switch(pool_size.x()) + _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc; + } + else + { + if(_is_square) { - case 2: + switch(pool_size.x()) { - if(is_nchw) + case 2: { _func = &NEPoolingLayerKernel::pooling2_f32_nchw; + break; } - else - { - _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc; - } - break; - } - case 3: - { - if(is_nchw) + case 3: { _func = &NEPoolingLayerKernel::pooling3_f32_nchw; + break; } - else - { - _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc; - } - break; - } - case 7: - { - if(is_nchw) + case 7: { _func = &NEPoolingLayerKernel::pooling7_f32_nchw; + break; } - else - { - _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc; - } - break; - } - default: - { - if(is_nchw) + default: { _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw; + break; } - else - { - _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc; - } - break; } } - } - else - { - if(is_nchw) - { - _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw; - } else { - _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc; + _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw; } } } - // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info(), (indices) ? indices->info() : nullptr, - pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size.x(), pool_size.y()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); + if(!is_nchw) + { + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps()); + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + INEKernel::configure(win); + } + else + { + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info(), (indices) ? 
indices->info() : nullptr, + pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size.x(), pool_size.y()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); + } } template @@ -1371,9 +1277,16 @@ void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC void NEPoolingLayerKernel::pooling2_f16_nhwc_maxpool_indices(const Window &window_input, const Window &window) { + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 8; + + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator input(_input, window_input); - Iterator output(_output, window); - Iterator indices(_indices, window); + Iterator output(_output, window_out); + Iterator indices(_indices, window_out); const int pool_pad_top = _pool_info.pad_stride_info.pad_top(); const int pool_pad_left = _pool_info.pad_stride_info.pad_left(); @@ -1386,7 +1299,7 @@ void NEPoolingLayerKernel::pooling2_f16_nhwc_maxpool_indices(const Window &windo const int in_stride_y = static_cast(_input->info()->strides_in_bytes().y()); const int in_stride_z = static_cast(_input->info()->strides_in_bytes().z()); - execute_window_loop(window, [&](const Coordinates & id) + execute_window_loop(window_out, [&](const Coordinates & id) { const int idx_width = id.y() * pool_stride_x; const int idx_height = id.z() * pool_stride_y; @@ -1399,50 +1312,77 @@ void NEPoolingLayerKernel::pooling2_f16_nhwc_maxpool_indices(const Window &windo (_input->info()->strides_in_bytes().z()); const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast (_input->info()->strides_in_bytes().z()); - const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast (_input->info()->strides_in_bytes().z()); - const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast (_input->info()->strides_in_bytes().z()); - const auto in_x0_ptr = reinterpret_cast(input.ptr() + in_x0_offset); - const auto in_x1_ptr = reinterpret_cast(input.ptr() + in_x1_offset); - const auto in_x2_ptr = reinterpret_cast(input.ptr() + in_x2_offset); - const auto in_x3_ptr = reinterpret_cast(input.ptr() + in_x3_offset); - const auto v_x0 = vld1q_f16(in_x0_ptr); - const auto v_x1 = vld1q_f16(in_x1_ptr); - const auto v_x2 = vld1q_f16(in_x2_ptr); - const auto v_x3 = vld1q_f16(in_x3_ptr); - float16x8_t vres = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1)); - // Store result - vst1q_f16(reinterpret_cast(output.ptr()), vres); - - const uint32_t offset_base = offset_no_padding(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y); - const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t); - const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_right; - const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_right * _input->info()->tensor_shape()[1]; - const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_right; - const uint32x4_t voffset_x0_0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 }; - const uint32x4_t voffset_x0_1 = { offset_x0 + 4, offset_x0 + 
5, offset_x0 + 6, offset_x0 + 7 }; - const uint16x8_t voffset_x0 = vcombine_u16(vmovn_u32(voffset_x0_0), vmovn_u32(voffset_x0_1)); - const uint32x4_t voffset_x1_0 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 }; - const uint32x4_t voffset_x1_1 = { offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 }; - const uint16x8_t voffset_x1 = vcombine_u16(vmovn_u32(voffset_x1_0), vmovn_u32(voffset_x1_1)); - const uint32x4_t voffset_x2_0 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 }; - const uint32x4_t voffset_x2_1 = { offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 }; - const uint16x8_t voffset_x2 = vcombine_u16(vmovn_u32(voffset_x2_0), vmovn_u32(voffset_x2_1)); - const uint32x4_t voffset_x3_0 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 }; - const uint32x4_t voffset_x3_1 = { offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 }; - const uint16x8_t voffset_x3 = vcombine_u16(vmovn_u32(voffset_x3_0), vmovn_u32(voffset_x3_1)); - const uint16x8_t tmp_indices0 = vbslq_u16(vcgeq_f16(v_x0, v_x1), voffset_x0, voffset_x1); - const uint16x8_t tmp_indices1 = vbslq_u16(vcgeq_f16(v_x2, v_x3), voffset_x2, voffset_x3); - const uint16x8_t tmp_indices2 = vbslq_u16(vcgeq_f16(vmaxq_f16(v_x0, v_x1), vmaxq_f16(v_x2, v_x3)), tmp_indices0, tmp_indices1); - const uint32x4_t tmp_indeces3_0 = vmovl_u16(vget_low_u16(tmp_indices2)); - const uint32x4_t tmp_indeces3_1 = vmovl_u16(vget_high_u16(tmp_indices2)); - // Store indicies - vst1q_u32(reinterpret_cast(indices.ptr()), tmp_indeces3_0); - vst1q_u32(reinterpret_cast(indices.ptr() + 16), tmp_indeces3_1); + int x_off = window_start_x; + for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + { + const auto in_x0_ptr = reinterpret_cast(input.ptr() + in_x0_offset) + x_off; + const auto in_x1_ptr = reinterpret_cast(input.ptr() + in_x1_offset) + x_off; + const auto in_x2_ptr = reinterpret_cast(input.ptr() + in_x2_offset) + x_off; + const auto in_x3_ptr = reinterpret_cast(input.ptr() + in_x3_offset) + x_off; + const auto v_x0 = vld1q_f16(in_x0_ptr); + const auto v_x1 = vld1q_f16(in_x1_ptr); + const auto v_x2 = vld1q_f16(in_x2_ptr); + const auto v_x3 = vld1q_f16(in_x3_ptr); + float16x8_t vres = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1)); + // Store result + vst1q_f16(reinterpret_cast(output.ptr()) + x_off, vres); + + const uint32_t offset_base = offset_no_padding(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y); + const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off; + const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_right; + const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_right * _input->info()->tensor_shape()[1]; + const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_right; + const uint32x4_t voffset_x0_0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 }; + const uint32x4_t voffset_x0_1 = { offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 }; + const uint16x8_t voffset_x0 = vcombine_u16(vmovn_u32(voffset_x0_0), vmovn_u32(voffset_x0_1)); + const uint32x4_t voffset_x1_0 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 }; + const uint32x4_t voffset_x1_1 = { offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 }; + const uint16x8_t voffset_x1 = vcombine_u16(vmovn_u32(voffset_x1_0), vmovn_u32(voffset_x1_1)); + const uint32x4_t voffset_x2_0 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 }; + 
const uint32x4_t voffset_x2_1 = { offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 }; + const uint16x8_t voffset_x2 = vcombine_u16(vmovn_u32(voffset_x2_0), vmovn_u32(voffset_x2_1)); + const uint32x4_t voffset_x3_0 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 }; + const uint32x4_t voffset_x3_1 = { offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 }; + const uint16x8_t voffset_x3 = vcombine_u16(vmovn_u32(voffset_x3_0), vmovn_u32(voffset_x3_1)); + const uint16x8_t tmp_indices0 = vbslq_u16(vcgeq_f16(v_x0, v_x1), voffset_x0, voffset_x1); + const uint16x8_t tmp_indices1 = vbslq_u16(vcgeq_f16(v_x2, v_x3), voffset_x2, voffset_x3); + const uint16x8_t tmp_indices2 = vbslq_u16(vcgeq_f16(vmaxq_f16(v_x0, v_x1), vmaxq_f16(v_x2, v_x3)), tmp_indices0, tmp_indices1); + const uint32x4_t tmp_indeces3_0 = vmovl_u16(vget_low_u16(tmp_indices2)); + const uint32x4_t tmp_indeces3_1 = vmovl_u16(vget_high_u16(tmp_indices2)); + // Store indicies + vst1q_u32(reinterpret_cast(indices.ptr()) + x_off, tmp_indeces3_0); + vst1q_u32(reinterpret_cast(indices.ptr() + 16) + x_off, tmp_indeces3_1); + } + + // Left-overs loop + for(; x_off < window_end_x; ++x_off) + { + const auto x0 = *(reinterpret_cast(input.ptr() + in_x0_offset) + x_off); + const auto x1 = *(reinterpret_cast(input.ptr() + in_x1_offset) + x_off); + const auto x2 = *(reinterpret_cast(input.ptr() + in_x2_offset) + x_off); + const auto x3 = *(reinterpret_cast(input.ptr() + in_x3_offset) + x_off); + float16_t res = std::max(std::max(x2, x3), std::max(x0, x1)); + + // Store result + *(reinterpret_cast(output.ptr()) + x_off) = res; + + const uint32_t offset_base = offset_no_padding(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y); + const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off; + const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_right; + const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_right * _input->info()->tensor_shape()[1]; + const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_right; + const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1; + const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3; + const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1; + + // Store indices + *(reinterpret_cast(indices.ptr()) + x_off) = tmp_idx2; + } }, input, output, indices); } @@ -1457,8 +1397,15 @@ void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const { pooling2_f16_nhwc_maxpool_indices(window_input, window); } + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 8; + + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator input(_input, window_input); - Iterator output(_output, window); + Iterator output(_output, window_out); const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width; const int pool_size_y = _pool_info.is_global_pooling ? 
_input->info()->tensor_shape().z() : _pool_info.pool_size.height; @@ -1474,7 +1421,7 @@ void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const float16x8_t vres; - execute_window_loop(window, [&](const Coordinates & id) + execute_window_loop(window_out, [&](const Coordinates & id) { const int idx_width = id.y() * pool_stride_x; const int idx_height = id.z() * pool_stride_y; @@ -1486,60 +1433,121 @@ void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x); const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x); - if(pooling_type != PoolingType::MAX) + int x_off = window_start_x; + for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) { - // Calculate scale - const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float16x8_t scale_v = vdupq_n_f16(scale); - - // Perform pooling - vres = vdupq_n_f16(0.0f); - for(int y = pool_start_y; y < pool_end_y; ++y) + if(pooling_type != PoolingType::MAX) { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float16x8_t data = vld1q_f16(reinterpret_cast(input.ptr() + (x - pool_pad_left) * static_cast(_input->info()->strides_in_bytes().y()) + - (y - pool_pad_top) * static_cast(_input->info()->strides_in_bytes().z()))); + // Calculate scale + const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); + const float16x8_t scale_v = vdupq_n_f16(scale); - // Get power of 2 in case of l2 pooling and accumulate - if(pooling_type == PoolingType::L2) + // Perform pooling + vres = vdupq_n_f16(0.0f); + for(int y = pool_start_y; y < pool_end_y; ++y) + { + for(int x = pool_start_x; x < pool_end_x; ++x) { - vres = vaddq_f16(vres, vmulq_f16(data, data)); + const float16x8_t data = vld1q_f16(reinterpret_cast(input.ptr() + (x - pool_pad_left) * static_cast(_input->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(_input->info()->strides_in_bytes().z())) + x_off); + + // Get power of 2 in case of l2 pooling and accumulate + if(pooling_type == PoolingType::L2) + { + vres = vaddq_f16(vres, vmulq_f16(data, data)); + } + else + { + vres = vaddq_f16(vres, data); + } } - else + } + // Divide by scale + vres = vmulq_f16(vres, scale_v); + } + else + { + vres = vdupq_n_f16(std::numeric_limits::lowest()); + + for(int y = pool_start_y; y < pool_end_y; ++y) + { + for(int x = pool_start_x; x < pool_end_x; ++x) { - vres = vaddq_f16(vres, data); + const float16x8_t data = vld1q_f16(reinterpret_cast(input.ptr() + (x - pool_pad_left) * static_cast(_input->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast(_input->info()->strides_in_bytes().z())) + x_off); + vres = vmaxq_f16(vres, data); } } } - // Divide by scale - vres = vmulq_f16(vres, scale_v); + + // Calculate square-root in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres); + vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal)); + } + + // Store result + vst1q_f16(reinterpret_cast(output.ptr()) + x_off, vres); } - else + + // Left-overs loop + for(; x_off < window_end_x; ++x_off) { - vres = 
- else + + // Left-overs loop + for(; x_off < window_end_x; ++x_off) { - vres = vdupq_n_f16(std::numeric_limits<float16_t>::lowest()); + float16_t res = 0.0f; - for(int y = pool_start_y; y < pool_end_y; ++y) + if(pooling_type != PoolingType::MAX) { - for(int x = pool_start_x; x < pool_end_x; ++x) + // Calculate scale + const float16_t scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); + + for(int y = pool_start_y; y < pool_end_y; ++y) { - const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + - (y - pool_pad_top) * static_cast<int>(_input->info()->strides_in_bytes().z()))); - vres = vmaxq_f16(vres, data); + for(int x = pool_start_x; x < pool_end_x; ++x) + { + const float data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> + (_input->info()->strides_in_bytes().z())) + x_off); + + // Get power of 2 in case of l2 pooling and accumulate + if(pooling_type == PoolingType::L2) + { + res += data * data; + } + else + { + res += data; + } + } + } + + // Divide by scale + res *= scale; + } + else + { + res = std::numeric_limits<float16_t>::lowest(); + for(int y = pool_start_y; y < pool_end_y; ++y) + { + for(int x = pool_start_x; x < pool_end_x; ++x) + { + const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> + (_input->info()->strides_in_bytes().z())) + x_off); + res = std::max(res, data); + } } } - } - // Calculate square-root in case of l2 pooling - if(pooling_type == PoolingType::L2) - { - float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres); - vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal)); - } + // Calculate square-root in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + res = std::sqrt(res); + } - // Store result - vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vres); + // Store result + *(reinterpret_cast<float16_t *>(output.ptr()) + x_off) = res; + } }, input, output); @@ -1900,8 +1908,15 @@ void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const } else { + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 4; + + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator input(_input, window_input); - Iterator output(_output, window); + Iterator output(_output, window_out); const int pool_size_x = _pool_info.is_global_pooling ? _input->info()->tensor_shape().y() : _pool_info.pool_size.width; const int pool_size_y = _pool_info.is_global_pooling ? _input->info()->tensor_shape().z() : _pool_info.pool_size.height;
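// --- Editor's aside (hypothetical helper; kStep and the callback names are
// illustrative, not library API) --- The loop shape this patch introduces in
// every NHWC kernel: the window's X dimension is collapsed to one iteration
// and the channel axis is stepped manually, full vectors first (8 fp16 or
// 4 fp32 lanes), then a scalar tail, so the tensors need no right padding.
template <int kStep, typename VecBody, typename ScalarBody>
void x_loop(int start, int end, VecBody vec_body, ScalarBody scalar_body)
{
    int x = start;
    for(; x <= end - kStep; x += kStep) // whole vectors
    {
        vec_body(x);
    }
    for(; x < end; ++x) // left-over elements, one at a time
    {
        scalar_body(x);
    }
}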
@@ -1917,7 +1932,7 @@ void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const float32x4_t vres; - execute_window_loop(window, [&](const Coordinates & id) + execute_window_loop(window_out, [&](const Coordinates & id) { const int idx_width = id.y() * pool_stride_x; const int idx_height = id.z() * pool_stride_y; @@ -1929,64 +1944,125 @@ void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x); const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x); - if(pooling_type != PoolingType::MAX) + int x_off = window_start_x; + for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) { - // Calculate scale - const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float32x4_t scale_v = vdupq_n_f32(scale); + if(pooling_type != PoolingType::MAX) + { + // Calculate scale + const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); + const float32x4_t scale_v = vdupq_n_f32(scale); - // Perform pooling - vres = vdupq_n_f32(0.0f); + // Perform pooling + vres = vdupq_n_f32(0.0f); - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) + for(int y = pool_start_y; y < pool_end_y; ++y) { - const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> - (_input->info()->strides_in_bytes().z()))); - - // Get power of 2 in case of l2 pooling and accumulate - if(pooling_type == PoolingType::L2) + for(int x = pool_start_x; x < pool_end_x; ++x) { - vres = vmlaq_f32(vres, data, data); + const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> + (_input->info()->strides_in_bytes().z())) + x_off); + + // Get power of 2 in case of l2 pooling and accumulate + if(pooling_type == PoolingType::L2) + { + vres = vmlaq_f32(vres, data, data); + } + else + { + vres = vaddq_f32(vres, data); + } } - else + } + // Divide by scale + vres = vmulq_f32(vres, scale_v); + } + else + { + vres = vdupq_n_f32(std::numeric_limits<float>::lowest()); + for(int y = pool_start_y; y < pool_end_y; ++y) + { + for(int x = pool_start_x; x < pool_end_x; ++x) { - vres = vaddq_f32(vres, data); + const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> + (_input->info()->strides_in_bytes().z())) + x_off); + vres = vmaxq_f32(vres, data); } } } - // Divide by scale - vres = vmulq_f32(vres, scale_v); + + // Calculate square-root in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))), + static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))), + static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))), + static_cast<float>(sqrt(vgetq_lane_f32(vres, 3))) + }; + vres = l2_res; + } + + // Store result + vst1q_f32(reinterpret_cast<float *>(output.ptr()) + x_off, vres); }
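// --- Editor's aside (a sketch of the *intent* of calculate_avg_scale as it
// is used above, not its verbatim implementation; the real helper lives
// elsewhere in this file and also clamps against upper_bound_w/h) ---
// With exclude_padding only in-bounds elements divide the sum; otherwise the
// full pool area does.
static inline float avg_scale_model(bool exclude_padding,
                                    int pool_start_x, int pool_end_x,
                                    int pool_start_y, int pool_end_y,
                                    int pool_size_x, int pool_size_y)
{
    const int count = exclude_padding
                          ? (pool_end_x - pool_start_x) * (pool_end_y - pool_start_y)
                          : pool_size_x * pool_size_y;
    return count > 0 ? 1.0f / static_cast<float>(count) : 0.0f;
}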
- else + + // Left-overs loop + for(; x_off < window_end_x; ++x_off) { - vres = vdupq_n_f32(std::numeric_limits<float>::lowest()); - for(int y = pool_start_y; y < pool_end_y; ++y) + float res = 0.0f; + + if(pooling_type != PoolingType::MAX) { - for(int x = pool_start_x; x < pool_end_x; ++x) + // Calculate scale + const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); + + for(int y = pool_start_y; y < pool_end_y; ++y) { - const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> - (_input->info()->strides_in_bytes().z()))); - vres = vmaxq_f32(vres, data); + for(int x = pool_start_x; x < pool_end_x; ++x) + { + const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> + (_input->info()->strides_in_bytes().z())) + x_off); + + // Get power of 2 in case of l2 pooling and accumulate + if(pooling_type == PoolingType::L2) + { + res += data * data; + } + else + { + res += data; + } + } + } + + // Divide by scale + res *= scale; + } + else + { + res = std::numeric_limits<float>::lowest(); + for(int y = pool_start_y; y < pool_end_y; ++y) + { + for(int x = pool_start_x; x < pool_end_x; ++x) + { + const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> + (_input->info()->strides_in_bytes().z())) + x_off); + res = std::max(res, data); + } } } - } - // Calculate square-root in case of l2 pooling - if(pooling_type == PoolingType::L2) - { - float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))), - static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))), - static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))), - static_cast<float>(sqrt(vgetq_lane_f32(vres, 3))) - }; - vres = l2_res; - } + // Calculate square-root in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + res = std::sqrt(res); + } - // Store result - vst1q_f32(reinterpret_cast<float *>(output.ptr()), vres); + // Store result + *(reinterpret_cast<float *>(output.ptr()) + x_off) = res; + } }, input, output); } @@ -1994,9 +2070,16 @@ void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const void NEPoolingLayerKernel::pooling2_f32_nhwc_maxpool_indices(const Window &window_input, const Window &window) { + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 4; + + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator input(_input, window_input); - Iterator output(_output, window); - Iterator indices(_indices, window); + Iterator output(_output, window_out); + Iterator indices(_indices, window_out); const int pool_pad_top = _pool_info.pad_stride_info.pad_top(); const int pool_pad_left = _pool_info.pad_stride_info.pad_left(); @@ -2006,12 +2089,13 @@ void NEPoolingLayerKernel::pooling2_f32_nhwc_maxpool_indices(const Window &windo std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride(); float32x4_t vres; + float res; const int pad_right = _input->info()->padding().right; const int in_stride_y = static_cast<int>(_input->info()->strides_in_bytes().y()); const int in_stride_z = static_cast<int>(_input->info()->strides_in_bytes().z());
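// --- Editor's aside (hypothetical consumer code, not library API) ---
// Why this kernel emits indices at all: max-unpooling (cf.
// NEMaxUnpoolingLayerKernel) scatters each pooled value back to the flat,
// padding-free element position recorded here.
#include <cstddef>
#include <cstdint>
#include <vector>

static std::vector<float> max_unpool(const std::vector<float>    &pooled,
                                     const std::vector<uint32_t> &indices,
                                     std::size_t                  unpooled_size)
{
    std::vector<float> out(unpooled_size, 0.0f); // non-maxima stay zero
    for(std::size_t i = 0; i < pooled.size(); ++i)
    {
        out[indices[i]] = pooled[i];
    }
    return out;
}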
- execute_window_loop(window, [&](const Coordinates & id) + execute_window_loop(window_out, [&](const Coordinates & id) { const int idx_width = id.y() * pool_stride_x; const int idx_height = id.z() * pool_stride_y; @@ -2020,43 +2104,72 @@ void NEPoolingLayerKernel::pooling2_f32_nhwc_maxpool_indices(const Window &windo const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y); const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x); + const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int> (_input->info()->strides_in_bytes().z()); const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int> (_input->info()->strides_in_bytes().z()); - const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int> (_input->info()->strides_in_bytes().z()); - const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int> (_input->info()->strides_in_bytes().z()); - const auto in_x0_ptr = reinterpret_cast<const float *>(input.ptr() + in_x0_offset); - const auto in_x1_ptr = reinterpret_cast<const float *>(input.ptr() + in_x1_offset); - const auto in_x2_ptr = reinterpret_cast<const float *>(input.ptr() + in_x2_offset); - const auto in_x3_ptr = reinterpret_cast<const float *>(input.ptr() + in_x3_offset); - const auto v_x0 = vld1q_f32(in_x0_ptr); - const auto v_x1 = vld1q_f32(in_x1_ptr); - const auto v_x2 = vld1q_f32(in_x2_ptr); - const auto v_x3 = vld1q_f32(in_x3_ptr); - vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1)); - // Store result - vst1q_f32(reinterpret_cast<float *>(output.ptr()), vres); - - const uint32_t offset_base = offset_no_padding(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y); - const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float); - const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_right; - const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * _input->info()->tensor_shape()[1]; - const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_right; - const uint32x4_t voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 }; - const uint32x4_t voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 }; - const uint32x4_t voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 }; - const uint32x4_t voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 }; - const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1); - const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3); - const uint32x4_t tmp_indices2 = vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1); - // Store indices - vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()), tmp_indices2);
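// --- Editor's worked example (all numbers hypothetical) --- How the four
// flat indices of a 2x2 window derive from the unpadded offset of its
// top-left element, mirroring the offset_x0..offset_x3 arithmetic here:
// strides are converted from bytes to elements, and pad_right elements are
// subtracted because the recorded index space contains no padding.
#include <cstdint>
#include <cstdio>

int main()
{
    const uint32_t offset_x0 = 100;     // top-left element, in element units
    const uint32_t stride_y  = 64;      // in_stride_y / sizeof(float), elements per x step
    const uint32_t stride_z  = 64 * 64; // in_stride_z / sizeof(float), elements per y step
    const uint32_t pad_right = 4;       // padding elements per row
    const uint32_t shape_1   = 64;      // stand-in for _input->info()->tensor_shape()[1]

    const uint32_t offset_x1 = offset_x0 + stride_y - pad_right;
    const uint32_t offset_x2 = offset_x0 + stride_z - pad_right * shape_1;
    const uint32_t offset_x3 = offset_x2 + stride_y - pad_right;
    std::printf("%u %u %u %u\n", offset_x0, offset_x1, offset_x2, offset_x3);
    return 0;
}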
+ int x_off = window_start_x; + for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + { + const auto in_x0_ptr = reinterpret_cast<const float *>(input.ptr() + in_x0_offset); + const auto in_x1_ptr = reinterpret_cast<const float *>(input.ptr() + in_x1_offset); + const auto in_x2_ptr = reinterpret_cast<const float *>(input.ptr() + in_x2_offset); + const auto in_x3_ptr = reinterpret_cast<const float *>(input.ptr() + in_x3_offset); + const auto v_x0 = vld1q_f32(in_x0_ptr + x_off); + const auto v_x1 = vld1q_f32(in_x1_ptr + x_off); + const auto v_x2 = vld1q_f32(in_x2_ptr + x_off); + const auto v_x3 = vld1q_f32(in_x3_ptr + x_off); + vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1)); + // Store result + vst1q_f32(reinterpret_cast<float *>(output.ptr()) + x_off, vres); + + const uint32_t offset_base = offset_no_padding(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y); + const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float) + x_off; + const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_right; + const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * _input->info()->tensor_shape()[1]; + const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_right; + const uint32x4_t voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 }; + const uint32x4_t voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 }; + const uint32x4_t voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 }; + const uint32x4_t voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 }; + const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1); + const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3); + const uint32x4_t tmp_indices2 = vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1); + + // Store indices + vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indices2); + } + + // Left-overs loop + for(; x_off < window_end_x; ++x_off) + { + const auto x0 = *(reinterpret_cast<const float *>(input.ptr() + in_x0_offset) + x_off); + const auto x1 = *(reinterpret_cast<const float *>(input.ptr() + in_x1_offset) + x_off); + const auto x2 = *(reinterpret_cast<const float *>(input.ptr() + in_x2_offset) + x_off); + const auto x3 = *(reinterpret_cast<const float *>(input.ptr() + in_x3_offset) + x_off); + res = std::max(std::max(x2, x3), std::max(x0, x1)); + + // Store result + *(reinterpret_cast<float *>(output.ptr()) + x_off) = res; + + const uint32_t offset_base = offset_no_padding(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y); + const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float) + x_off; + const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_right; + const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * _input->info()->tensor_shape()[1]; + const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_right; + const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1; + const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3; + const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
+ + // Store indices + *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2; + } }, input, output, indices); } @@ -2170,8 +2283,16 @@ template void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const template <typename T> void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding) { + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16; + const int window_half_step_x = window_step_x / 2; + + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator input(_input, window_input); - Iterator output(_output, window); + Iterator output(_output, window_out); using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type; using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type; @@ -2206,7 +2327,7 @@ void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const const int32_t requant_offset = output_qinfo.offset - static_cast<int32_t>(static_cast<float>(input_qinfo.offset) / requant_scale); const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - execute_window_loop(window, [&](const Coordinates & id) + execute_window_loop(window_out, [&](const Coordinates & id) { const int idx_width = id.y() * pool_stride_x; const int idx_height = id.z() * pool_stride_y; @@ -2218,83 +2339,178 @@ void NEPoolingLayerKernel::poolingMxN_q8_nhwc(const Window &window_input, const const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x); const int pool_end_x = std::min(pool_size_x, window_input.y().end() + pool_limit_x); - if(pooling_type != PoolingType::MAX) + int x_off = window_start_x; + for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) { - q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + if(pooling_type != PoolingType::MAX) + { + q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); - // Calculate scale - const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); + // Calculate scale + const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y);
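// --- Editor's aside (scalar model of the accumulation in the loop that
// follows) --- Sixteen 8-bit lanes are widened twice (vmovl: 8 -> 16 -> 32
// bit) into four 32-bit accumulators vres1..vres4, so summing a large pooling
// region cannot overflow the narrow lanes; QASYMM8_SIGNED follows the same
// shape with int8/int32.
#include <cstdint>

static void accumulate_widened(const uint8_t (&data)[16], uint32_t (&acc)[4][4])
{
    for(int lane = 0; lane < 16; ++lane)
    {
        acc[lane / 4][lane % 4] += static_cast<uint32_t>(data[lane]); // acc[0..3] ~ vres1..vres4
    }
}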
- // Perform pooling - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) + // Perform pooling + for(int y = pool_start_y; y < pool_end_y; ++y) { - const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> - (_input->info()->strides_in_bytes().z()))); - - const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data)); - const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data)); - vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16))); - vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16))); - vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16))); - vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); + for(int x = pool_start_x; x < pool_end_x; ++x) + { + const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> + (_input->info()->strides_in_bytes().z())) + x_off); + + const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data)); + const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data)); + vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16))); + vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16))); + vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16))); + vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); + } } - } - if(input_qinfo != output_qinfo) + if(input_qinfo != output_qinfo) + { + const float32x4x4_t vres = + { + { + vcvtq_f32_q32(vres1), + vcvtq_f32_q32(vres2), + vcvtq_f32_q32(vres3), + vcvtq_f32_q32(vres4), + } + }; + const auto requantized_output = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset); + // Store result + wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off, wrapper::vgetlow(requantized_output)); + wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off + 8, wrapper::vgethigh(requantized_output)); + } + else + { + const float32x4_t scale_v = vdupq_n_f32(scale); + // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero + vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); + vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); + vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); + vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); + + const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); + const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); + // Store result + wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off, res1); + wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off + 8, res2); + } + } + else { - const float32x4x4_t vres = + q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{}); + + for(int y = pool_start_y; y < pool_end_y; ++y) { + for(int x = pool_start_x; x < pool_end_x; ++x) { - vcvtq_f32_q32(vres1), - vcvtq_f32_q32(vres2), - vcvtq_f32_q32(vres3), - vcvtq_f32_q32(vres4), + const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> + (_input->info()->strides_in_bytes().z())) + x_off); + vres = wrapper::vmax(vres, data); } - }; - const auto requantized_output = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset); + } + // Store result - wrapper::vstore(reinterpret_cast<T *>(output.ptr()), wrapper::vgetlow(requantized_output)); - wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + 8, wrapper::vgethigh(requantized_output)); + wrapper::vstore(reinterpret_cast<T *>(output.ptr()) + x_off, (input_qinfo !=
vrequantize_pooling(wrapper::vgetlow(vres), wrapper::vgethigh(vres), + requant_qinfo) : + vres); } - else + } + + if(pooling_type == PoolingType::MAX) + { + for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x) { - const float32x4_t scale_v = vdupq_n_f32(scale); - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - vres1 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); - vres2 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); - vres3 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); - vres4 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); - - const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); - const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); + q8x8_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_64_tag{}); + for(int y = pool_start_y; y < pool_end_y; ++y) + { + for(int x = pool_start_x; x < pool_end_x; ++x) + { + const q8x8_t data = wrapper::vload(reinterpret_cast(input.ptr() + (x - pool_pad_left) * static_cast(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast + (_input->info()->strides_in_bytes().z())) + x_off); + vres = wrapper::vmax(vres, data); + } + } + // Store result - wrapper::vstore(reinterpret_cast(output.ptr()), res1); - wrapper::vstore(reinterpret_cast(output.ptr()) + 8, res2); + wrapper::vstore(reinterpret_cast(output.ptr()) + x_off, + (input_qinfo != output_qinfo) ? vrequantize_pooling(vres, requant_qinfo) : vres); } } - else - { - q8x16_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_128_tag{}); - for(int y = pool_start_y; y < pool_end_y; ++y) + // Left-overs loop + for(; x_off < window_end_x; ++x_off) + { + if(pooling_type != PoolingType::MAX) { - for(int x = pool_start_x; x < pool_end_x; ++x) + q32_t res = static_cast(0.f); + + // Calculate scale + const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); + + // Perform pooling + for(int y = pool_start_y; y < pool_end_y; ++y) + { + for(int x = pool_start_x; x < pool_end_x; ++x) + { + const T data = *(reinterpret_cast(input.ptr() + (x - pool_pad_left) * static_cast(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast + (_input->info()->strides_in_bytes().z())) + x_off); + res += data; + } + } + + if(input_qinfo != output_qinfo) { - const q8x16_t data = wrapper::vloadq(reinterpret_cast(input.ptr() + (x - pool_pad_left) * static_cast(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast - (_input->info()->strides_in_bytes().z()))); - vres = wrapper::vmax(vres, data); + const float res_f = static_cast(res); + const float new_scale = quant_rescale / scale; + const auto requantized_output = quantize(res_f, UniformQuantizationInfo(new_scale, new_offset)); + + // Store result + *(reinterpret_cast(output.ptr()) + x_off) = requantized_output; + } + else + { + // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero + res = static_cast(0.5f + static_cast(res) * scale); + + // Store result + *(reinterpret_cast(output.ptr()) + x_off) = res; } } + else + { + T res = std::numeric_limits::min(); - // Store result - wrapper::vstore(reinterpret_cast(output.ptr()), (input_qinfo != 
- else - { - q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{}); - for(int y = pool_start_y; y < pool_end_y; ++y) + // Left-overs loop + for(; x_off < window_end_x; ++x_off) + { + if(pooling_type != PoolingType::MAX) { - for(int x = pool_start_x; x < pool_end_x; ++x) + q32_t res = static_cast<q32_t>(0.f); + + // Calculate scale + const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); + + // Perform pooling + for(int y = pool_start_y; y < pool_end_y; ++y) + { + for(int x = pool_start_x; x < pool_end_x; ++x) + { + const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> + (_input->info()->strides_in_bytes().z())) + x_off); + res += data; + } + } + + if(input_qinfo != output_qinfo) { - const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> - (_input->info()->strides_in_bytes().z()))); - vres = wrapper::vmax(vres, data); + const float res_f = static_cast<float>(res); + const float new_scale = quant_rescale / scale; + const auto requantized_output = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset)); + + // Store result + *(reinterpret_cast<T *>(output.ptr()) + x_off) = requantized_output; + } + else + { + // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero + res = static_cast<T>(0.5f + static_cast<float>(res) * scale); + + // Store result + *(reinterpret_cast<T *>(output.ptr()) + x_off) = res; + } } + else + { + T res = std::numeric_limits<T>::min(); - // Store result - wrapper::vstore(reinterpret_cast<T *>(output.ptr()), (input_qinfo != output_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres), requant_qinfo) : vres); + for(int y = pool_start_y; y < pool_end_y; ++y) + { + for(int x = pool_start_x; x < pool_end_x; ++x) + { + const T data = *(reinterpret_cast<const T *>(input.ptr() + (x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> + (_input->info()->strides_in_bytes().z())) + x_off); + res = std::max(res, data); + } + } + + // Store result + if(input_qinfo != output_qinfo) + { + const float res_f = static_cast<float>(res); + *(reinterpret_cast<T *>(output.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo); + } + else + { + *(reinterpret_cast<T *>(output.ptr()) + x_off) = res; + } + } } + }, input, output); } @@ -2385,7 +2601,7 @@ void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info) } else { - window_input.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), _num_elems_processed_per_iteration)); + window_input.set(Window::DimX, Window::Dimension(0, 1, 1)); window_input.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x)); window_input.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y)); } diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/src/core/NEON/kernels/NEPoolingLayerKernel.h similarity index 99% rename from arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h rename to src/core/NEON/kernels/NEPoolingLayerKernel.h index 2be25080cd..aa3d2f3f01 100644 --- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h +++ b/src/core/NEON/kernels/NEPoolingLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H #define ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp index 808b68a0d7..6757affae8 100644 --- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp @@ -21,12 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEPriorBoxLayerKernel.h" +#include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include diff --git a/arm_compute/core/NEON/kernels/NEPriorBoxLayerKernel.h b/src/core/NEON/kernels/NEPriorBoxLayerKernel.h similarity index 96% rename from arm_compute/core/NEON/kernels/NEPriorBoxLayerKernel.h rename to src/core/NEON/kernels/NEPriorBoxLayerKernel.h index 84db99100b..430a47f9f8 100644 --- a/arm_compute/core/NEON/kernels/NEPriorBoxLayerKernel.h +++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEPRIORBOXLAYERKERNEL_H #define ARM_COMPUTE_NEPRIORBOXLAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -48,6 +48,8 @@ class NEPriorBoxLayerKernel : public INEKernel NEPriorBoxLayerKernel(NEPriorBoxLayerKernel &&) = default; /** Allow instances of this class to be moved */ NEPriorBoxLayerKernel &operator=(NEPriorBoxLayerKernel &&) = default; + /** Default destructor */ + ~NEPriorBoxLayerKernel() = default; /** Set the input and output tensors. * * @param[in] input1 First source tensor. Data types supported: F32. Data layouts supported: NCHW/NHWC. diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp index cbfbda71e2..8c1c8cf56b 100644 --- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp +++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp @@ -21,19 +21,22 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h" +#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/NEMath.h" -#include "arm_compute/core/NEON/NESymm.h" -#include "arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/NESymm.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" #include diff --git a/arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h similarity index 94% rename from arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h rename to src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h index 86c9e1d3af..ba68171a59 100644 --- a/arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h +++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEQLSTMLAYERNORMALIZATIONKERNEL_H #define ARM_COMPUTE_NEQLSTMLAYERNORMALIZATIONKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include namespace arm_compute @@ -84,10 +84,19 @@ class NEQLSTMLayerNormalizationKernel : public INEKernel ComputeFuncType _fn{}; /**< Function pointer to computation function */ - const ITensor *_input{ nullptr }; /**< Input tensor */ - const ITensor *_weight{ nullptr }; /**< Weight tensor */ - const ITensor *_bias{ nullptr }; /**< Bias tensor */ - ITensor *_output{ nullptr }; /**< Output tensor */ + const ITensor *_input + { + nullptr + }; /**< Input tensor */ + const ITensor *_weight + { + nullptr + }; /**< Weight tensor */ + const ITensor *_bias + { + nullptr + }; /**< Bias tensor */ + ITensor *_output{ nullptr }; /**< Output tensor */ int32_t _output_multiplier{}; /**< Multiplier for output values */ int32_t _output_shift{}; /**< Shift value for output values */ diff --git 
a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp index 26ba4016e1..ff3d9fff96 100644 --- a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp @@ -21,18 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h" +#include "src/core/NEON/kernels/NEQuantizationLayerKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/NEMath.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" -#include "arm_compute/core/CPP/Validate.h" +#include "src/core/CPP/Validate.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h b/src/core/NEON/kernels/NEQuantizationLayerKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h rename to src/core/NEON/kernels/NEQuantizationLayerKernel.h index d35e027ff5..5ee0ed4412 100644 --- a/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h +++ b/src/core/NEON/kernels/NEQuantizationLayerKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEQUANTIZATIONLAYERKERNEL_H #define ARM_COMPUTE_NEQUANTIZATIONLAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp index 955cdc2074..e937dadba7 100644 --- a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp +++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp @@ -21,16 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h" +#include "src/core/NEON/kernels/NEROIAlignLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/misc/Utility.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include @@ -45,7 +47,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITe ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output); ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5); ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC, DataLayout::NCHW); ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0)); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); @@ -57,7 +59,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITe ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), output->tensor_shape()); } - if(input->data_type() == DataType::QASYMM8) + if(input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::QASYMM16); @@ -114,7 +116,7 @@ Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorIn } /** Average pooling over an aligned window */ -template +template inline input_data_type roi_align_1x1(const ITensor *input, unsigned int roi_batch, float region_start_x, @@ -133,7 +135,8 @@ inline input_data_type roi_align_1x1(const ITensor *input, } else { - float avg = 0; + const DataLayout data_layout = input->info()->data_layout(); + float avg = 0; // Iterate through the aligned pooling region for(int iy = 0; iy < grid_size_y; ++iy) { @@ -183,7 +186,7 @@ inline input_data_type roi_align_1x1(const ITensor *input, } /** Average pooling over an aligned window */ -template +template inline input_data_type roi_align_1x1_qasymm8(const ITensor *input, unsigned int roi_batch, float region_start_x, @@ -203,8 +206,11 @@ inline input_data_type roi_align_1x1_qasymm8(const ITensor *input, } else { - float avg = 0; - const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform(); + float avg = 0; + const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform(); + const bool is_qasymm_signed = is_data_type_quantized_asymmetric_signed(input->info()->data_type()); + const DataLayout data_layout = input->info()->data_layout(); + // Iterate through the aligned pooling region for(int iy = 0; iy < grid_size_y; ++iy) { @@ -232,26 +238,57 @@ inline input_data_type roi_align_1x1_qasymm8(const ITensor *input, if(data_layout == DataLayout::NCHW) { - float data1 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo); - float data2 = 
dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo); - float data3 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo); - float data4 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo); - avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; + if(is_qasymm_signed) + { + float data1 = dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo); + float data2 = dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo); + float data3 = dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo); + float data4 = dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo); + avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; + } + else + { + float data1 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo); + float data2 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo); + float data3 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo); + float data4 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo); + avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; + } } else { - const auto data1 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo); - const auto data2 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo); - const auto data3 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo); - const auto data4 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo); - avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; + if(is_qasymm_signed) + { + const auto data1 = dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo); + const auto data2 = dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo); + const auto data3 = dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo); + const auto data4 = dequantize_qasymm8_signed(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo); + avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; + } + else + { + const auto data1 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo); + const auto data2 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo); + const auto data3 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo); + const auto data4 = dequantize_qasymm8(*reinterpret_cast(input->ptr_to_element(Coordinates(pz, x_high, y_high, 
roi_batch))), input_qinfo); + avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; + } } } } avg /= grid_size_x * grid_size_y; - return quantize_qasymm8(avg, out_qinfo); + input_data_type res = 0; + if(is_qasymm_signed) + { + res = quantize_qasymm8_signed(avg, out_qinfo); + } + else + { + res = quantize_qasymm8(avg, out_qinfo); + } + return res; } } @@ -263,52 +300,30 @@ inline float compute_region_coordinate(int p, float bin_size, float roi_anchor, void NEROIAlignLayerKernel::run(const Window &window, const ThreadInfo &info) { - if(_input->info()->data_layout() == DataLayout::NCHW) + const DataLayout data_layout = _input->info()->data_layout(); + if(data_layout == DataLayout::NCHW || data_layout == DataLayout::NHWC) { switch(_input->info()->data_type()) { case DataType::QASYMM8: { - NEROIAlignLayerKernel::internal_run<DataLayout::NCHW, uint8_t>(window, info); - break; - } - case DataType::F32: - { - NEROIAlignLayerKernel::internal_run<DataLayout::NCHW, float>(window, info); - break; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - NEROIAlignLayerKernel::internal_run<DataLayout::NCHW, float16_t>(window, info); + NEROIAlignLayerKernel::internal_run<uint8_t>(window, info); break; } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - default: - { - ARM_COMPUTE_ERROR("DataType not supported"); - break; - } - } - } - else if(_input->info()->data_layout() == DataLayout::NHWC) - { - switch(_input->info()->data_type()) - { - case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: { - NEROIAlignLayerKernel::internal_run<DataLayout::NHWC, uint8_t>(window, info); + NEROIAlignLayerKernel::internal_run<int8_t>(window, info); break; } case DataType::F32: { - NEROIAlignLayerKernel::internal_run<DataLayout::NHWC, float>(window, info); + NEROIAlignLayerKernel::internal_run<float>(window, info); break; } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: { - NEROIAlignLayerKernel::internal_run<DataLayout::NHWC, float16_t>(window, info); + NEROIAlignLayerKernel::internal_run<float16_t>(window, info); break; } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC @@ -325,21 +340,22 @@ void NEROIAlignLayerKernel::run(const Window &window, const ThreadInfo &info) } } -template <DataLayout data_layout, typename input_data_type> +template <typename input_data_type> void NEROIAlignLayerKernel::internal_run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - const size_t values_per_roi = _rois->info()->dimension(0); + const DataLayout data_layout = _input->info()->data_layout(); + const size_t values_per_roi = _rois->info()->dimension(0); const int roi_list_start = window.x().start(); const int roi_list_end = window.x().end(); - const unsigned int idx_width = get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::WIDTH); - const unsigned int idx_height = get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::HEIGHT); - const unsigned int idx_depth = get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::CHANNEL); + const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const unsigned int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); const int input_width = _input->info()->dimension(idx_width); const int input_height = _input->info()->dimension(idx_height); @@ -395,14 +411,14 @@ void NEROIAlignLayerKernel::internal_run(const Window &window, const ThreadInfo input_data_type out_val(0); if(is_qasymm) { - out_val =
roi_align_1x1_qasymm8( + out_val = roi_align_1x1_qasymm8( _input, roi_batch, region_start_x, bin_size_x, roi_bin_grid_x, region_end_x, region_start_y, bin_size_y, roi_bin_grid_y, region_end_y, ch, _output->info()->quantization_info()); } else { - out_val = roi_align_1x1( + out_val = roi_align_1x1( _input, roi_batch, region_start_x, bin_size_x, roi_bin_grid_x, region_end_x, region_start_y, bin_size_y, roi_bin_grid_y, region_end_y, ch); diff --git a/arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h b/src/core/NEON/kernels/NEROIAlignLayerKernel.h similarity index 93% rename from arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h rename to src/core/NEON/kernels/NEROIAlignLayerKernel.h index 66ebb5e261..fa31a879b7 100644 --- a/arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h +++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEROIALIGNLAYERKERNEL_H #define ARM_COMPUTE_NEROIALIGNLAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -55,10 +55,10 @@ class NEROIAlignLayerKernel : public INEKernel /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. + * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] rois ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner * as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ]. - * Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8, otherwise same as @p input + * Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8/QASYMM8_SIGNED, otherwise same as @p input * @param[out] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo. * @@ -70,8 +70,8 @@ class NEROIAlignLayerKernel : public INEKernel void configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref NEROIAlignLayerKernel * - * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32. - * @param[in] rois ROIs tensor info. Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8, + * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] rois ROIs tensor info. Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8/QASYMM8_SIGNED, * otherwise same as @p input * @param[in] output Destination tensor info. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo. 
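// --- Editor's aside (hypothetical free function, not the library API;
// assumes in-bounds coordinates and a row-major plane) --- The bilinear
// sampling that roi_align_1x1 and roi_align_1x1_qasymm8 perform per grid
// point; the w1..w4 blended in the kernels above are exactly these area
// weights.
static float bilinear_sample(const float *plane, int width, float x, float y)
{
    const int   x_low  = static_cast<int>(x);
    const int   y_low  = static_cast<int>(y);
    const int   x_high = x_low + 1;
    const int   y_high = y_low + 1;
    const float lx     = x - x_low;
    const float ly     = y - y_low;
    const float w1     = (1.0f - ly) * (1.0f - lx); // top-left
    const float w2     = (1.0f - ly) * lx;          // top-right
    const float w3     = ly * (1.0f - lx);          // bottom-left
    const float w4     = ly * lx;                   // bottom-right
    return w1 * plane[y_low * width + x_low] + w2 * plane[y_low * width + x_high]
           + w3 * plane[y_high * width + x_low] + w4 * plane[y_high * width + x_high];
}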
@@ -89,7 +89,7 @@ class NEROIAlignLayerKernel : public INEKernel void run(const Window &window, const ThreadInfo &info) override; private: - template + template void internal_run(const Window &window, const ThreadInfo &info); const ITensor *_input; diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp index 6a960c74dc..40dae828a3 100644 --- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp +++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp @@ -21,12 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h" +#include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/ToolchainSupport.h" #include diff --git a/arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h b/src/core/NEON/kernels/NEROIPoolingLayerKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h rename to src/core/NEON/kernels/NEROIPoolingLayerKernel.h index fa9685bc6b..36424172a6 100644 --- a/arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h +++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEROIPOOLINGLAYERKERNEL_H #define ARM_COMPUTE_NEROIPOOLINGLAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include "arm_compute/core/IArray.h" diff --git a/src/core/NEON/kernels/NERangeKernel.cpp b/src/core/NEON/kernels/NERangeKernel.cpp index c8a456a427..8d11122ab2 100644 --- a/src/core/NEON/kernels/NERangeKernel.cpp +++ b/src/core/NEON/kernels/NERangeKernel.cpp @@ -21,16 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NERangeKernel.h" +#include "src/core/NEON/kernels/NERangeKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "arm_compute/core/Utils.h" diff --git a/arm_compute/core/NEON/kernels/NERangeKernel.h b/src/core/NEON/kernels/NERangeKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NERangeKernel.h rename to src/core/NEON/kernels/NERangeKernel.h index 84ebd53b1b..7c42ef11dc 100644 --- a/arm_compute/core/NEON/kernels/NERangeKernel.h +++ b/src/core/NEON/kernels/NERangeKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_NERANGEKERNEL_H #define ARM_COMPUTE_NERANGEKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp index 5a52216eac..3d105cc60d 100644 --- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp +++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp @@ -21,22 +21,24 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEReductionOperationKernel.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/NEON/NEMath.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/SaturateCast.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" - -#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/SaturateCast.h" + +#include "src/core/NEON/wrapper/wrapper.h" #include namespace arm_compute @@ -45,17 +47,17 @@ namespace { // Helper function that calls vqmovun/vqmvn, vcombine and vstore, allows templating of RedOpYZW_quantized template -void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output) +void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output, int offset = 0) { if(std::is_same::value) { auto res = wrapper::vcombine(wrapper::vqmovun(t1), wrapper::vqmovun(t2)); - wrapper::vstore(output.ptr(), res); + wrapper::vstore(output.ptr() + offset, res); } else { auto res = wrapper::vcombine(wrapper::vqmovn(t1), wrapper::vqmovn(t2)); - wrapper::vstore(reinterpret_cast(output.ptr()), res); + wrapper::vstore(reinterpret_cast(output.ptr() + offset), res); } } @@ -342,20 +344,9 @@ class Reducer { // Set out window Window out_window(window); - out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); - - // Get first input and output slices - Window in_slice = window.first_slice_window_1D(); - Window out_slice = out_window.first_slice_window_1D(); - - do - { - Iterator in(input, in_slice); - Iterator out(output, out_slice); + out_window.set(Window::DimX, Window::Dimension(0, 1, 1)); - f(in, out, in_slice, out_slice, *input->info(), op); - } - while(window.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice)); + f(window, out_window, input, output, op); } static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op) { @@ -366,18 +357,7 @@ class Reducer in_window.set(Window::DimY, Window::Dimension(0, 1, 1)); out_window.set(Window::DimY, Window::Dimension(0, output->info()->dimension(1), output->info()->dimension(1))); - // Get first input and output slices - Window in_slice = in_window.first_slice_window_2D(); - Window out_slice = out_window.first_slice_window_2D(); - - do - { - Iterator in(input, in_slice); - Iterator out(output, 
out_slice); - - f(in, out, in_slice, out_slice, *input->info(), 1, op); - } - while(in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice)); + f(in_window, out_window, input, output, 1, op); } static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op) { @@ -388,18 +368,7 @@ class Reducer in_window.set(Window::DimZ, Window::Dimension(0, 1, 1)); out_window.set(Window::DimZ, Window::Dimension(0, output->info()->dimension(2), output->info()->dimension(2))); - // Get first input and output slices - Window in_slice = in_window.first_slice_window_3D(); - Window out_slice = out_window.first_slice_window_3D(); - - do - { - Iterator in(input, in_slice); - Iterator out(output, out_slice); - - f(in, out, in_slice, out_slice, *input->info(), 2, op); - } - while(in_window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_3D(out_slice)); + f(in_window, out_window, input, output, 2, op); } static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op) { @@ -410,18 +379,7 @@ class Reducer in_window.set(3, Window::Dimension(0, 1, 1)); out_window.set(3, Window::Dimension(0, 1, 1)); - // Get first input and output slices - Window in_slice = in_window.first_slice_window_4D(); - Window out_slice = out_window.first_slice_window_4D(); - - do - { - Iterator in(input, in_slice); - Iterator out(output, out_slice); - - f(in, out, in_slice, out_slice, *input->info(), 3, op); - } - while(in_window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_4D(out_slice)); + f(in_window, out_window, input, output, 3, op); } }; @@ -431,385 +389,329 @@ struct RedOpX /** NEON vector tag type. */ using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, const ReductionOperation op) + inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op) { - ARM_COMPUTE_UNUSED(out_slice); - auto init_res_value = static_cast(0.f); - switch(op) - { - case ReductionOperation::ARG_IDX_MAX: - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::MIN: - case ReductionOperation::MAX: - { - init_res_value = *reinterpret_cast(input.ptr()); - break; - } - case ReductionOperation::PROD: - { - init_res_value = static_cast(1.f); - break; - } - default: - break; - } - auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{}); - uint32x4x4_t vec_res_idx{ { 0 } }; + const TensorInfo in_info = *(in->info()); + const int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast(in_window.x().start()); + const auto window_end_x = static_cast(in_window.x().end()); - execute_window_loop(in_slice, [&](const Coordinates & id) + Window in_win_no_pad = in_window; + in_win_no_pad.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, in_win_no_pad); + Iterator output(out, out_window); + + execute_window_loop(in_win_no_pad, [&](const Coordinates &) { - const auto in_ptr = reinterpret_cast(input.ptr()); - const auto vec_elements = wrapper::vloadq(in_ptr); + const auto input_ptr = reinterpret_cast(input.ptr()); + auto init_res_value = static_cast(0.f); switch(op) { - case ReductionOperation::SUM_SQUARE: - vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); - break; - case ReductionOperation::MEAN_SUM: - case 
ReductionOperation::SUM: - vec_res_value = wrapper::vadd(vec_elements, vec_res_value); - break; - case ReductionOperation::PROD: - vec_res_value = wrapper::vmul(vec_elements, vec_res_value); - break; - case ReductionOperation::ARG_IDX_MIN: - { - auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - vec_res_idx = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); - vec_res_value = temp_vec_res_value; - break; - } case ReductionOperation::ARG_IDX_MAX: - { - auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - vec_res_idx = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); - vec_res_value = temp_vec_res_value; - break; - } + case ReductionOperation::ARG_IDX_MIN: case ReductionOperation::MIN: + case ReductionOperation::MAX: { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + init_res_value = static_cast(*input_ptr); break; } - case ReductionOperation::MAX: + case ReductionOperation::PROD: { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + init_res_value = static_cast(1.f); break; } default: - ARM_COMPUTE_ERROR("Not supported"); + break; } - }, - input); - - switch(op) - { - case ReductionOperation::SUM: - case ReductionOperation::SUM_SQUARE: - case ReductionOperation::MEAN_SUM: - { - auto carry_res = wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); - for(int i = 0; i < S / 4; ++i) - { - carry_res = wrapper::vpadd(carry_res, carry_res); - } - auto res = wrapper::vgetlane(carry_res, 0); - - if(op == ReductionOperation::MEAN_SUM) - { - res /= in_info.dimension(0); - } + auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{}); + uint32x4x4_t vec_res_idx{ { 0 } }; - *(reinterpret_cast(output.ptr())) = res; - break; - } - case ReductionOperation::PROD: + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { - auto carry_res = wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); - T res = 1; - for(int i = 0; i < S / 2; ++i) + const auto vec_elements = wrapper::vloadq(input_ptr + x); + switch(op) { - res *= wrapper::vgetlane(carry_res, i); + case ReductionOperation::SUM_SQUARE: + vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); + break; + case ReductionOperation::MEAN_SUM: + case ReductionOperation::SUM: + vec_res_value = wrapper::vadd(vec_elements, vec_res_value); + break; + case ReductionOperation::PROD: + vec_res_value = wrapper::vmul(vec_elements, vec_res_value); + break; + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } - *(reinterpret_cast(output.ptr())) = res; - break; - } - case ReductionOperation::ARG_IDX_MIN: - case 
ReductionOperation::ARG_IDX_MAX: - { - auto res = calculate_vector_index(vec_res_idx, vec_res_value, op); - *(reinterpret_cast(output.ptr())) = res; - break; - } - case ReductionOperation::MIN: - { - *(reinterpret_cast(output.ptr())) = wrapper::vgetlane(calculate_min(vec_res_value), 0); - break; - } - case ReductionOperation::MAX: - { - *(reinterpret_cast(output.ptr())) = wrapper::vgetlane(calculate_max(vec_res_value), 0); - break; } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - } -}; - -template -struct RedOpX_quantized -{ - inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, const ReductionOperation op) - { - ARM_COMPUTE_UNUSED(out_slice); - - using PromotedType = typename wrapper::traits::promote::type>::type; - - const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform(); - - auto vec_res_value1 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - auto vec_res_value2 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - auto vec_res_value3 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - auto vec_res_value4 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - - auto vec_res_value1_f = vdupq_n_f32(static_cast(1.f)); - auto vec_res_value2_f = vdupq_n_f32(static_cast(1.f)); - auto vec_res_value3_f = vdupq_n_f32(static_cast(1.f)); - auto vec_res_value4_f = vdupq_n_f32(static_cast(1.f)); - - typename wrapper::traits::neon_vector::type vec_res_value = { 0 }; - if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::MIN || op == ReductionOperation::MAX) - { - vec_res_value = wrapper::vdup_n(*reinterpret_cast(input.ptr()), wrapper::traits::vector_128_tag{}); - } - - uint32x4x4_t vec_res_idx{ { 0 } }; - execute_window_loop(in_slice, [&](const Coordinates & id) - { - const auto vec_elements = wrapper::vloadq(reinterpret_cast(input.ptr())); switch(op) { case ReductionOperation::SUM: case ReductionOperation::MEAN_SUM: + case ReductionOperation::SUM_SQUARE: { - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); - vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); - vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); - vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); + auto carry_res = wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + for(int i = 0; i < S / 4; ++i) + { + carry_res = wrapper::vpadd(carry_res, carry_res); + } + auto res = wrapper::vgetlane(carry_res, 0); + + if(op == ReductionOperation::SUM_SQUARE) + { + // Compute left-over elements + for(; x < window_end_x; ++x) + { + res += (*(input_ptr + x)) * (*(input_ptr + x)); + } + } + else + { + // Compute left-over elements + for(; x < window_end_x; ++x) + { + res += *(input_ptr + x); + } + } + + if(op == ReductionOperation::MEAN_SUM) + { + res /= in_info.dimension(0); + } + + *(reinterpret_cast(output.ptr())) = res; break; } case ReductionOperation::PROD: { - const auto offset32x4f_4 = 
vdupq_n_f32(iq_info.offset); - const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale); - - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - auto temp32x4f_1 = wrapper::vcvt(temp32x4t_1); - auto temp32x4f_2 = wrapper::vcvt(temp32x4t_2); - auto temp32x4f_3 = wrapper::vcvt(temp32x4t_3); - auto temp32x4f_4 = wrapper::vcvt(temp32x4t_4); - - //de-quantize vec_elements - temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4); - temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4); - temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4); - temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4); - - vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f); - vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f); - vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f); - vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f); + auto carry_res = wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + T res = 1; + for(int i = 0; i < S / 2; ++i) + { + res *= wrapper::vgetlane(carry_res, i); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + res *= *(input_ptr + x); + } + + *(reinterpret_cast(output.ptr())) = res; break; } case ReductionOperation::ARG_IDX_MIN: { - auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - vec_res_idx = calculate_index_quantized(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); - vec_res_value = temp_vec_res_value; + auto idx = calculate_vector_index(vec_res_idx, vec_res_value, op); + auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + if(*(input_ptr + x) < res) + { + idx = x; + res = *(input_ptr + x); + } + } + *(reinterpret_cast(output.ptr())) = idx; break; } case ReductionOperation::ARG_IDX_MAX: { - auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - vec_res_idx = calculate_index_quantized(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); - vec_res_value = temp_vec_res_value; + auto idx = calculate_vector_index(vec_res_idx, vec_res_value, op); + auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + if(*(input_ptr + x) > res) + { + idx = x; + res = *(input_ptr + x); + } + } + *(reinterpret_cast(output.ptr())) = idx; break; } case ReductionOperation::MIN: { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + res = *(input_ptr + x) < res ? *(input_ptr + x) : res; + } + *(reinterpret_cast(output.ptr())) = res; break; } case ReductionOperation::MAX: { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + res = *(input_ptr + x) > res ? 
*(input_ptr + x) : res; + } + *(reinterpret_cast(output.ptr())) = res; break; } default: ARM_COMPUTE_ERROR("Not supported"); } }, - input); + input, output); + } +}; - switch(op) - { - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::ARG_IDX_MAX: - { - auto res = calculate_vector_index_quantized(vec_res_idx, vec_res_value, op); - *(reinterpret_cast(output.ptr())) = res; - break; - } - case ReductionOperation::MIN: - { - *(output.ptr()) = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - break; - } - case ReductionOperation::MAX: - { - *(output.ptr()) = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); - break; - } - case ReductionOperation::PROD: - { - auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f); - carry_res = wrapper::vmul(carry_res, vec_res_value3_f); - carry_res = wrapper::vmul(carry_res, vec_res_value4_f); +template +struct RedOpX_quantized +{ + inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op) + { + using PromotedType = typename wrapper::traits::promote::type>::type; - float res = wrapper::vgetlane(carry_res, 0); - res *= wrapper::vgetlane(carry_res, 1); - res *= wrapper::vgetlane(carry_res, 2); - res *= wrapper::vgetlane(carry_res, 3); + const TensorInfo in_info = *(in->info()); + const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform(); - //re-quantize result - if(std::is_same::value) - { - res = quantize_qasymm8(res, iq_info); - } - else - { - res = quantize_qasymm8_signed(res, iq_info); - } + const int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast(in_window.x().start()); + const auto window_end_x = static_cast(in_window.x().end()); - *reinterpret_cast(output.ptr()) = static_cast(res); - break; - } - default: - { - auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2); - carry_res = wrapper::vadd(carry_res, vec_res_value3); - carry_res = wrapper::vadd(carry_res, vec_res_value4); + Window in_win_no_pad = in_window; + in_win_no_pad.set(Window::DimX, Window::Dimension(0, 1, 1)); - auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res)); - carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition); - auto res = static_cast(wrapper::vgetlane(carry_paddition, 0)); + Iterator input(in, in_win_no_pad); + Iterator output(out, out_window); - if(op == ReductionOperation::MEAN_SUM) - { - res /= static_cast(in_info.dimension(0)); - } - else - { - // Subtract accumulated offsets - res -= (in_info.dimension(0) - 1) * iq_info.offset; - } - *reinterpret_cast(output.ptr()) = utils::cast::saturate_cast(res); - } - } - } -}; + execute_window_loop(in_win_no_pad, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); -template -struct RedOpYZW -{ - /** NEON vector tag type. 
*/ - using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - using neon_vector = typename wrapper::traits::neon_vector::type; + auto vec_res_value1 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + auto vec_res_value2 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + auto vec_res_value3 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + auto vec_res_value4 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis, const ReductionOperation op) - { - ARM_COMPUTE_UNUSED(out_slice); + auto vec_res_value1_f = vdupq_n_f32(static_cast(1.f)); + auto vec_res_value2_f = vdupq_n_f32(static_cast(1.f)); + auto vec_res_value3_f = vdupq_n_f32(static_cast(1.f)); + auto vec_res_value4_f = vdupq_n_f32(static_cast(1.f)); - execute_window_loop(in_slice, [&](const Coordinates &) - { - neon_vector vec_res_value = { 0 }; - switch(op) + typename wrapper::traits::neon_vector::type vec_res_value = { 0 }; + + if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::MIN || op == ReductionOperation::MAX) { - case ReductionOperation::ARG_IDX_MAX: - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::MIN: - case ReductionOperation::MAX: - { - vec_res_value = wrapper::vloadq(reinterpret_cast(input.ptr())); - break; - } - case ReductionOperation::PROD: - { - vec_res_value = wrapper::vdup_n(static_cast(1.f), ExactTagType{}); - break; - } - default: - { - vec_res_value = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - break; - } + vec_res_value = wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{}); } - uint32x4x4_t vec_res_idx{ { 0 } }; - for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + uint32x4x4_t vec_res_idx{ { 0 } }; + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { - const T *in_ptr = reinterpret_cast(input.ptr() + in_info.strides_in_bytes()[axis] * dim); - const auto vec_elements = wrapper::vloadq(in_ptr); + const auto vec_elements = wrapper::vloadq(input_ptr + x); switch(op) { case ReductionOperation::SUM: case ReductionOperation::MEAN_SUM: - vec_res_value = wrapper::vadd(vec_elements, vec_res_value); - break; - case ReductionOperation::SUM_SQUARE: - vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); - break; - case ReductionOperation::PROD: - vec_res_value = wrapper::vmul(vec_elements, vec_res_value); - break; - case ReductionOperation::ARG_IDX_MIN: - { - auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); - vec_res_value = temp_vec_res_value; - break; - } - case ReductionOperation::ARG_IDX_MAX: { - auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); - vec_res_value = temp_vec_res_value; + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); + + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = 
wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); + vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); + vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); + vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); break; } - case ReductionOperation::MIN: + case ReductionOperation::PROD: { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - break; - } - case ReductionOperation::MAX: + const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset); + const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale); + + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); + + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + auto temp32x4f_1 = wrapper::vcvt(temp32x4t_1); + auto temp32x4f_2 = wrapper::vcvt(temp32x4t_2); + auto temp32x4f_3 = wrapper::vcvt(temp32x4t_3); + auto temp32x4f_4 = wrapper::vcvt(temp32x4t_4); + + //de-quantize vec_elements + temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4); + temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4); + temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4); + temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4); + + vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f); + vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f); + vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f); + vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f); + break; + } + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: { vec_res_value = wrapper::vmax(vec_elements, vec_res_value); break; @@ -819,25 +721,354 @@ struct RedOpYZW } } - if(op == ReductionOperation::MEAN_SUM) + switch(op) { - auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast(in_info.dimension(axis)), ExactTagType{})); - vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv); + case ReductionOperation::ARG_IDX_MIN: + { + auto idx = calculate_vector_index_quantized(vec_res_idx, vec_res_value, op); + auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + if(*(input_ptr + x) < res) + { + idx = x; + res = *(input_ptr + x); + } + } + *(reinterpret_cast(output.ptr())) = idx; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto idx = calculate_vector_index_quantized(vec_res_idx, vec_res_value, op); + auto res = 
static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + if(*(input_ptr + x) > res) + { + idx = x; + res = *(input_ptr + x); + } + } + *(reinterpret_cast(output.ptr())) = idx; + break; + } + case ReductionOperation::MIN: + { + auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + res = *(input_ptr + x) < res ? *(input_ptr + x) : res; + } + *(reinterpret_cast(output.ptr())) = res; + break; + } + case ReductionOperation::MAX: + { + auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + res = *(input_ptr + x) > res ? *(input_ptr + x) : res; + } + *(reinterpret_cast(output.ptr())) = res; + break; + } + case ReductionOperation::PROD: + { + auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f); + carry_res = wrapper::vmul(carry_res, vec_res_value3_f); + carry_res = wrapper::vmul(carry_res, vec_res_value4_f); + + float res = wrapper::vgetlane(carry_res, 0); + res *= wrapper::vgetlane(carry_res, 1); + res *= wrapper::vgetlane(carry_res, 2); + res *= wrapper::vgetlane(carry_res, 3); + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + //de-quantize input + if(std::is_same::value) + { + res *= dequantize_qasymm8(*(input_ptr + x), iq_info); + } + else + { + res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info); + } + } + + //re-quantize result + if(std::is_same::value) + { + res = quantize_qasymm8(res, iq_info); + } + else + { + res = quantize_qasymm8_signed(res, iq_info); + } + + *reinterpret_cast(output.ptr()) = static_cast(res); + break; + } + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + { + auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2); + carry_res = wrapper::vadd(carry_res, vec_res_value3); + carry_res = wrapper::vadd(carry_res, vec_res_value4); + + auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res)); + carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition); + auto res = static_cast(wrapper::vgetlane(carry_paddition, 0)); + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + res += *(input_ptr + x); + } + + if(op == ReductionOperation::MEAN_SUM) + { + res /= static_cast(in_info.dimension(0)); + } + else + { + // Subtract accumulated offsets + res -= (in_info.dimension(0) - 1) * iq_info.offset; + } + *reinterpret_cast(output.ptr()) = utils::cast::saturate_cast(res); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } + }, + input, output); + } +}; - if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) +template +struct RedOpYZW +{ + /** NEON vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_vector::tag_type; + using neon_vector = typename wrapper::traits::neon_vector::type; + + inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int axis, const ReductionOperation op) + { + const TensorInfo in_info = *(in->info()); + const int window_step_x = 16 / sizeof(T); + const auto window_start_x_tmp = static_cast(in_window.x().start()); + const auto window_end_x_tmp = static_cast(in_window.x().end()); + // As it split over x-axis, need to set the correct spiltted window start and end. 
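The pattern introduced above and repeated throughout this file — a vectorized main loop stepping window_step_x = 16 / sizeof(T) elements at a time, followed by a scalar "left-over" loop — is what lets these kernels drop right-hand padding and the border_size() override. The sketch below shows that loop structure in isolation; row_sum is a hypothetical helper over a plain float array, using raw NEON intrinsics rather than the library's wrapper:: layer, so it illustrates the technique rather than the kernel's actual code.

    #include <arm_neon.h>

    // Sum one row of `len` floats: whole vectors in the main loop,
    // then a scalar tail, so nothing past `in + len` is ever read.
    float row_sum(const float *in, int len)
    {
        const int   step = 4; // 16 bytes / sizeof(float), cf. window_step_x
        int         x    = 0;
        float32x4_t acc  = vdupq_n_f32(0.f);
        for(; x <= len - step; x += step)
        {
            acc = vaddq_f32(acc, vld1q_f32(in + x));
        }
        // Horizontal add of the four accumulator lanes
        float32x2_t pair = vpadd_f32(vget_low_f32(acc), vget_high_f32(acc));
        pair             = vpadd_f32(pair, pair);
        float res        = vget_lane_f32(pair, 0);
        // Compute left-over elements
        for(; x < len; ++x)
        {
            res += in[x];
        }
        return res;
    }

This mirrors the for(; x <= (window_end_x - window_step_x); x += window_step_x) main loops and the "Compute left-over elements" tails added by this patch.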
+ const auto window_start_x = static_cast(0); + const auto window_end_x = static_cast(in_window.shape().x()); + + Window in_win_no_pad = in_window; + in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x())); + Window out_win_no_pad = out_window; + out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); + + Iterator input(in, in_win_no_pad); + Iterator output(out, out_win_no_pad); + + execute_window_loop(in_win_no_pad, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { - wrapper::vstore(reinterpret_cast(output.ptr()), vec_res_idx.val[0]); -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - if(std::is_same::value) + neon_vector vec_res_value = { 0 }; + switch(op) + { + case ReductionOperation::ARG_IDX_MAX: + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::MIN: + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vloadq(input_ptr + x); + break; + } + case ReductionOperation::PROD: + { + vec_res_value = wrapper::vdup_n(static_cast(1.f), ExactTagType{}); + break; + } + default: + { + vec_res_value = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + break; + } + } + uint32x4x4_t vec_res_idx{ { 0 } }; + + for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + const T *in_ptr = reinterpret_cast(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim); + const auto vec_elements = wrapper::vloadq(in_ptr); + switch(op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + vec_res_value = wrapper::vadd(vec_elements, vec_res_value); + break; + case ReductionOperation::SUM_SQUARE: + vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); + break; + case ReductionOperation::PROD: + vec_res_value = wrapper::vmul(vec_elements, vec_res_value); + break; + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + + if(op == ReductionOperation::MEAN_SUM) { - wrapper::vstore(reinterpret_cast(output.ptr()) + 4, vec_res_idx.val[1]); + auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast(in_info.dimension(axis)), ExactTagType{})); + vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv); } + + if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) + { + wrapper::vstore(reinterpret_cast(output.ptr()) + x, vec_res_idx.val[0]); +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + if(std::is_same::value) + { + wrapper::vstore(reinterpret_cast(output.ptr()) + x + 4, vec_res_idx.val[1]); + } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + } + else + { + 
wrapper::vstore(reinterpret_cast(output.ptr() + x * sizeof(T)), vec_res_value); + } } - else + + // Compute left-over elements + for(; x < window_end_x; ++x) { - wrapper::vstore(reinterpret_cast(output.ptr()), vec_res_value); + auto res_value = 0.f; + switch(op) + { + case ReductionOperation::ARG_IDX_MAX: + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::MIN: + case ReductionOperation::MAX: + { + res_value = *(input_ptr + x); + break; + } + case ReductionOperation::PROD: + { + res_value = static_cast(1.f); + break; + } + default: + { + res_value = static_cast(0.f); + break; + } + } + + uint32_t res_idx = 0; + for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + const T *in_ptr = reinterpret_cast(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim); + + switch(op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + res_value += *in_ptr; + break; + case ReductionOperation::SUM_SQUARE: + res_value += *in_ptr * *in_ptr; + break; + case ReductionOperation::PROD: + res_value *= *in_ptr; + break; + case ReductionOperation::ARG_IDX_MIN: + { + if(*in_ptr < res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + if(*in_ptr > res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::MIN: + { + res_value = *in_ptr < res_value ? *in_ptr : res_value; + break; + } + case ReductionOperation::MAX: + { + res_value = *in_ptr > res_value ? *in_ptr : res_value; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + + if(op == ReductionOperation::MEAN_SUM) + { + res_value /= in_info.dimension(axis); + } + + if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) + { + *(reinterpret_cast(output.ptr()) + x) = res_idx; + } + else + { + *(reinterpret_cast(output.ptr() + x * sizeof(T))) = res_value; + } } }, input, output); @@ -851,51 +1082,73 @@ struct RedOpYZW_complex using ExactTagType = typename wrapper::traits::neon_vector::tag_type; using neon_vector = typename wrapper::traits::neon_vector::type; - inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int, const ReductionOperation) + inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int, const ReductionOperation) { - ARM_COMPUTE_UNUSED(out_slice); ARM_COMPUTE_ERROR_ON(axis != 2); - - const size_t stride_z = in_info.strides_in_bytes()[axis]; - - execute_window_loop(in_slice, [&](const Coordinates &) + ARM_COMPUTE_ERROR_ON(op != ReductionOperation::SUM); + + const TensorInfo in_info = *(in->info()); + const size_t stride_z = in_info.strides_in_bytes()[axis]; + const int window_step_x = 16 / sizeof(T); + const auto window_start_x_tmp = static_cast(in_window.x().start()); + const auto window_end_x_tmp = static_cast(in_window.x().end()); + // As it split over x-axis, need to set the correct spiltted window start and end. 
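The non-x reductions follow the same scheme: for every x position the functor walks the reduced axis explicitly, advancing by strides_in_bytes()[axis] per step, instead of relying on padded slices. A minimal scalar sketch of that access pattern, assuming a row-major width x height float buffer and a hypothetical reduce_y_sum helper (the kernel additionally vectorizes over x, as in the RedOpYZW main loop above):

    #include <cstddef>

    // out[x] = sum of column x over all rows of a [width x height] buffer.
    void reduce_y_sum(const float *in, float *out, std::size_t width, std::size_t height)
    {
        const std::size_t stride = width; // elements per step along y, cf. strides_in_bytes()[axis]
        for(std::size_t x = 0; x < width; ++x)
        {
            float acc = 0.f;
            for(std::size_t dim = 0; dim < height; ++dim)
            {
                acc += in[x + stride * dim]; // one element from each successive row
            }
            out[x] = acc;
        }
    }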
+ const auto window_start_x = static_cast(0); + const auto window_end_x = static_cast(in_window.shape().x()); + + Window in_win_no_pad = in_window; + in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x())); + Window out_win_no_pad = out_window; + out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); + + Iterator input(in, in_win_no_pad); + Iterator output(out, out_win_no_pad); + + execute_window_loop(in_win_no_pad, [&](const Coordinates &) { - neon_vector vec_res_value_0 = { 0 }; - neon_vector vec_res_value_1 = { 0 }; + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + neon_vector vec_res_value_0 = { 0 }; + neon_vector vec_res_value_1 = { 0 }; - vec_res_value_0 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - vec_res_value_1 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + vec_res_value_0 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + vec_res_value_1 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) - { - T *in_ptr_0; - T *in_ptr_1; - switch(axis) + T *out_ptr = reinterpret_cast(output.ptr() + 2 * x * sizeof(T)); + for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) { - case 2: - in_ptr_0 = reinterpret_cast(input.ptr() + stride_z * dim); - in_ptr_1 = reinterpret_cast(input.ptr() + 16 + stride_z * dim); - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - } - const auto vec_elements_0 = wrapper::vloadq(in_ptr_0); - const auto vec_elements_1 = wrapper::vloadq(in_ptr_1); + T *in_ptr_0 = reinterpret_cast(input.ptr() + 2 * x * sizeof(T) + stride_z * dim); + T *in_ptr_1 = reinterpret_cast(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim); - switch(op) - { - case ReductionOperation::SUM: - vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0); - vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1); - break; - default: - ARM_COMPUTE_ERROR("Not supported"); + const auto vec_elements_0 = wrapper::vloadq(in_ptr_0); + const auto vec_elements_1 = wrapper::vloadq(in_ptr_1); + + vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0); + vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1); } + + wrapper::vstore(out_ptr, vec_res_value_0); + wrapper::vstore(out_ptr + 4, vec_res_value_1); } - wrapper::vstore(reinterpret_cast(output.ptr()), vec_res_value_0); - wrapper::vstore(reinterpret_cast(output.ptr() + 16), vec_res_value_1); + // Compute left-over elements + for(; x < window_end_x; ++x) + { + auto res_value_0 = 0.f; + auto res_value_1 = 0.f; + T *out_ptr = reinterpret_cast(output.ptr() + 2 * x * sizeof(T)); + for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + T *in_ptr = reinterpret_cast(input.ptr() + 2 * x * sizeof(T) + stride_z * dim); + res_value_0 += *in_ptr; + res_value_1 += *(in_ptr + 1); + } + *out_ptr = res_value_0; + *(out_ptr + 1) = res_value_1; + } }, input, output); } @@ -904,184 +1157,346 @@ struct RedOpYZW_complex template struct RedOpYZW_quantized { - inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis, const ReductionOperation op) + inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int axis, const ReductionOperation op) { - ARM_COMPUTE_UNUSED(out_slice); + const TensorInfo 
in_info = *(in->info()); + const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform(); + using PromotedType = typename wrapper::traits::promote::type>::type; - using PromotedType = typename wrapper::traits::promote::type>::type; + const int window_step_x = 16 / sizeof(T); + const auto window_start_x_tmp = static_cast(in_window.x().start()); + const auto window_end_x_tmp = static_cast(in_window.x().end()); + // As it split over x-axis, need to set the correct spiltted window start and end. + const auto window_start_x = static_cast(0); + const auto window_end_x = static_cast(in_window.shape().x()); - const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform(); + Window in_win_no_pad = in_window; + in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x())); + Window out_win_no_pad = out_window; + out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); + + Iterator input(in, in_win_no_pad); + Iterator output(out, out_win_no_pad); - execute_window_loop(in_slice, [&](const Coordinates &) + execute_window_loop(in_win_no_pad, [&](const Coordinates &) { - uint32x4x4_t vec_res_idx{ { 0 } }; - auto vec_res_value1 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); - auto vec_res_value2 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); - auto vec_res_value3 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); - auto vec_res_value4 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); + const auto input_ptr = reinterpret_cast(input.ptr()); - auto vec_res_value1_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); - auto vec_res_value2_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); - auto vec_res_value3_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); - auto vec_res_value4_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + uint32x4x4_t vec_res_idx{ { 0 } }; + auto vec_res_value1 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); + auto vec_res_value2 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); + auto vec_res_value3 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); + auto vec_res_value4 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); - auto vec_res_value = wrapper::vloadq(reinterpret_cast(input.ptr())); + auto vec_res_value1_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); + auto vec_res_value2_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); + auto vec_res_value3_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); + auto vec_res_value4_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); + + auto vec_res_value = wrapper::vloadq(input_ptr + x); + + for(unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim) + { + const T *in_ptr = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim; + const auto vec_elements = wrapper::vloadq(in_ptr); + switch(op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + { + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = 
wrapper::vmovl(wrapper::vgethigh(vec_elements)); + + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); + vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); + vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); + vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); + break; + } + case ReductionOperation::PROD: + { + const auto offset32x4f_4 = wrapper::vdup_n(static_cast(iq_info.offset), wrapper::traits::vector_128_tag{}); + const auto scale32x4f_4 = wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{}); + + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); + + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + auto temp32x4f_1 = wrapper::vcvt(temp32x4t_1); + auto temp32x4f_2 = wrapper::vcvt(temp32x4t_2); + auto temp32x4f_3 = wrapper::vcvt(temp32x4t_3); + auto temp32x4f_4 = wrapper::vcvt(temp32x4t_4); + + //de-quantize vec_elements + temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), scale32x4f_4); + temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4); + temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4); + temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4); + + vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f); + vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f); + vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f); + vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f); + break; + } + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } - for(unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim) - { - const T *in_ptr = reinterpret_cast(input.ptr()) + in_info.strides_in_bytes()[axis] * index_dim; - const auto vec_elements = wrapper::vloadq(in_ptr); switch(op) { + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::ARG_IDX_MAX: + { + wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x), vec_res_idx.val[0]); + wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]); + 
wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]); + wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x) + 12, vec_res_idx.val[3]); + break; + } + case ReductionOperation::MIN: + case ReductionOperation::MAX: + { + wrapper::vstore(reinterpret_cast(output.ptr() + x), vec_res_value); + break; + } case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: { - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); + // Subtract offsets + auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset); - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + auto vec_res_s_value1 = wrapper::vreinterpret(vec_res_value1); + auto vec_res_s_value2 = wrapper::vreinterpret(vec_res_value2); + auto vec_res_s_value3 = wrapper::vreinterpret(vec_res_value3); + auto vec_res_s_value4 = wrapper::vreinterpret(vec_res_value4); - vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); - vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); - vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); - vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); + vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets); + vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, offsets); + vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets); + vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets); + + const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2)); + const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4)); + + combine_and_store(temp16x8t_1, temp16x8t_2, output, x); + break; + } + case ReductionOperation::MEAN_SUM: + { + const auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast(in_info.dimension(axis)), wrapper::traits::vector_128_tag{})); + vec_res_value1_f = wrapper::vmul(wrapper::vcvt(vec_res_value1), vec_width_inv); + vec_res_value2_f = wrapper::vmul(wrapper::vcvt(vec_res_value2), vec_width_inv); + vec_res_value3_f = wrapper::vmul(wrapper::vcvt(vec_res_value3), vec_width_inv); + vec_res_value4_f = wrapper::vmul(wrapper::vcvt(vec_res_value4), vec_width_inv); + + vec_res_value1 = wrapper::vcvt(vec_res_value1_f); + vec_res_value2 = wrapper::vcvt(vec_res_value2_f); + vec_res_value3 = wrapper::vcvt(vec_res_value3_f); + vec_res_value4 = wrapper::vcvt(vec_res_value4_f); + + const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2)); + const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4)); + auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); + + wrapper::vstore(reinterpret_cast(output.ptr() + x), res); break; } case ReductionOperation::PROD: { const auto offset32x4f_4 = wrapper::vdup_n(static_cast(iq_info.offset), wrapper::traits::vector_128_tag{}); - const auto scale32x4f_4 = wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{}); + const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale)); - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = 
wrapper::vmovl(wrapper::vgethigh(vec_elements)); + //re-quantize + vec_res_value1_f = wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4); + vec_res_value2_f = wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4); + vec_res_value3_f = wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4); + vec_res_value4_f = wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4); - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + vec_res_value1 = wrapper::vcvt(vec_res_value1_f); + vec_res_value2 = wrapper::vcvt(vec_res_value2_f); + vec_res_value3 = wrapper::vcvt(vec_res_value3_f); + vec_res_value4 = wrapper::vcvt(vec_res_value4_f); - auto temp32x4f_1 = wrapper::vcvt(temp32x4t_1); - auto temp32x4f_2 = wrapper::vcvt(temp32x4t_2); - auto temp32x4f_3 = wrapper::vcvt(temp32x4t_3); - auto temp32x4f_4 = wrapper::vcvt(temp32x4t_4); + const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2)); + const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4)); + auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); - //de-quantize vec_elements - temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), scale32x4f_4); - temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4); - temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4); - temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4); - - vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f); - vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f); - vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f); - vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f); - break; - } - case ReductionOperation::ARG_IDX_MIN: - { - auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); - vec_res_value = temp_vec_res_value; + wrapper::vstore(reinterpret_cast(output.ptr() + x), res); break; } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + float res_value = 0.f; + switch(op) + { case ReductionOperation::ARG_IDX_MAX: + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::MIN: + case ReductionOperation::MAX: { - auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); - vec_res_value = temp_vec_res_value; + res_value = *(input_ptr + x); break; } - case ReductionOperation::MIN: + case ReductionOperation::PROD: { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + res_value = static_cast(1.0f); break; } - case ReductionOperation::MAX: + default: { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + res_value = static_cast(0.0f); break; } - default: - ARM_COMPUTE_ERROR("Not supported"); } - } + uint32_t res_idx = 0; - if(op == ReductionOperation::MEAN_SUM) - { - const auto vec_width_inv = 
wrapper::vinv(wrapper::vdup_n(static_cast(in_info.dimension(axis)), wrapper::traits::vector_128_tag{})); - vec_res_value1_f = wrapper::vmul(wrapper::vcvt(vec_res_value1), vec_width_inv); - vec_res_value2_f = wrapper::vmul(wrapper::vcvt(vec_res_value2), vec_width_inv); - vec_res_value3_f = wrapper::vmul(wrapper::vcvt(vec_res_value3), vec_width_inv); - vec_res_value4_f = wrapper::vmul(wrapper::vcvt(vec_res_value4), vec_width_inv); - - vec_res_value1 = wrapper::vcvt(vec_res_value1_f); - vec_res_value2 = wrapper::vcvt(vec_res_value2_f); - vec_res_value3 = wrapper::vcvt(vec_res_value3_f); - vec_res_value4 = wrapper::vcvt(vec_res_value4_f); - } - else if(op == ReductionOperation::PROD) - { - const auto offset32x4f_4 = wrapper::vdup_n(static_cast(iq_info.offset), wrapper::traits::vector_128_tag{}); - const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale)); - - //re-quantize - vec_res_value1_f = wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4); - vec_res_value2_f = wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4); - vec_res_value3_f = wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4); - vec_res_value4_f = wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4); - - vec_res_value1 = wrapper::vcvt(vec_res_value1_f); - vec_res_value2 = wrapper::vcvt(vec_res_value2_f); - vec_res_value3 = wrapper::vcvt(vec_res_value3_f); - vec_res_value4 = wrapper::vcvt(vec_res_value4_f); - } - - if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) - { - wrapper::vstore(reinterpret_cast(output.ptr()), vec_res_idx.val[0]); - wrapper::vstore(reinterpret_cast(output.ptr()) + 4, vec_res_idx.val[1]); - wrapper::vstore(reinterpret_cast(output.ptr()) + 8, vec_res_idx.val[2]); - wrapper::vstore(reinterpret_cast(output.ptr()) + 12, vec_res_idx.val[3]); - } - else if(op == ReductionOperation::MIN || op == ReductionOperation::MAX) - { - wrapper::vstore(reinterpret_cast(output.ptr()), vec_res_value); - } - else - { - if(op == ReductionOperation::SUM) + for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) { - // Subtract offsets - auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset); - - auto vec_res_s_value1 = wrapper::vreinterpret(vec_res_value1); - auto vec_res_s_value2 = wrapper::vreinterpret(vec_res_value2); - auto vec_res_s_value3 = wrapper::vreinterpret(vec_res_value3); - auto vec_res_s_value4 = wrapper::vreinterpret(vec_res_value4); - - vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets); - vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, offsets); - vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets); - vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets); - - const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2)); - const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4)); - - combine_and_store(temp16x8t_1, temp16x8t_2, output); + const T *in_ptr = reinterpret_cast(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim); + switch(op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + { + res_value += *in_ptr; + break; + } + case ReductionOperation::SUM_SQUARE: + { + res_value += *in_ptr * *in_ptr; + break; + } + case ReductionOperation::PROD: + { + //de-quantize input + if(std::is_same::value) + { + res_value *= dequantize_qasymm8(*in_ptr, iq_info); + } + else + { + res_value *= 
dequantize_qasymm8_signed(*in_ptr, iq_info); + } + break; + } + case ReductionOperation::ARG_IDX_MIN: + { + if(*in_ptr < res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + if(*in_ptr > res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::MIN: + { + res_value = *in_ptr < res_value ? *in_ptr : res_value; + break; + } + case ReductionOperation::MAX: + { + res_value = *in_ptr > res_value ? *in_ptr : res_value; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } } - else - { - const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2)); - const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4)); - auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); - wrapper::vstore(reinterpret_cast(output.ptr()), res); + switch(op) + { + case ReductionOperation::MEAN_SUM: + { + int32_t res = static_cast(res_value); + res /= static_cast(in_info.dimension(axis)); + *reinterpret_cast(output.ptr() + x) = utils::cast::saturate_cast(res); + break; + } + case ReductionOperation::SUM: + { + // Subtract accumulated offsets + res_value -= (in_info.dimension(axis) - 1) * iq_info.offset; + *reinterpret_cast(output.ptr() + x) = utils::cast::saturate_cast(res_value); + break; + } + case ReductionOperation::PROD: + { + //re-quantize result + T res = 0; + if(std::is_same::value) + { + res = quantize_qasymm8(res_value, iq_info); + } + else + { + res = quantize_qasymm8_signed(res_value, iq_info); + } + *(reinterpret_cast(output.ptr() + x)) = res; + break; + } + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::ARG_IDX_MAX: + { + *(reinterpret_cast(output.ptr() + x * 4)) = res_idx; + break; + } + default: + *(reinterpret_cast(output.ptr() + x)) = res_value; } } - }, input, output); } @@ -1235,41 +1650,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u return Status{}; } - -std::tuple validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis, ReductionOperation op) -{ - // Calculate output shape and set if empty - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis); - - // Output auto initialization if not yet initialized - const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX); - DataType output_data_type = is_arg_min_max ? DataType::S32 : input->data_type(); - auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); - - unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type()); - - // Configure kernel window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - - return std::make_tuple(err, win); -} } // namespace NEReductionOperationKernel::NEReductionOperationKernel() - : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::SUM_SQUARE), _border_size() -{ -} - -BorderSize NEReductionOperationKernel::border_size() const + : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::SUM_SQUARE) { - return _border_size; } void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op) @@ -1278,26 +1663,30 @@ void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); - unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type()); - _input = input; _output = output; - _border_size = (axis == 0) ? BorderSize(0, num_elems_processed_per_iteration - (input->info()->dimension(0) % num_elems_processed_per_iteration), 0, 0) : BorderSize(); _op = op; _reduction_axis = axis; // Configure kernel window - auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis, op); - - ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + Coordinates coord; + coord.set_num_dimensions(input->info()->num_dimensions()); + input->info()->set_valid_region(ValidRegion(coord, input->info()->tensor_shape())); + Window win = calculate_max_window(*input->info(), Steps()); + INEKernel::configure(win); - INEKernel::configure(std::get<1>(win_config)); + // Calculate output shape and set if empty + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis); + // Output auto initialization if not yet initialized + const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX); + DataType output_data_type = is_arg_min_max ? 
DataType::S32 : input->info()->data_type(); + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); + output->info()->set_valid_region(ValidRegion(coord, output_shape)); } Status NEReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); - ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis, op))); return Status{}; } diff --git a/arm_compute/core/NEON/kernels/NEReductionOperationKernel.h b/src/core/NEON/kernels/NEReductionOperationKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NEReductionOperationKernel.h rename to src/core/NEON/kernels/NEReductionOperationKernel.h index 180697f364..dfc105adae 100644 --- a/arm_compute/core/NEON/kernels/NEReductionOperationKernel.h +++ b/src/core/NEON/kernels/NEReductionOperationKernel.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEREDUCTIONOPERATIONKERNEL_H #define ARM_COMPUTE_NEREDUCTIONOPERATIONKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -81,14 +81,12 @@ class NEReductionOperationKernel : public INEKernel // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; - BorderSize border_size() const override; private: const ITensor *_input; ITensor *_output; unsigned int _reduction_axis; ReductionOperation _op; - BorderSize _border_size; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEREDUCTIONOPERATIONKERNEL_H */ diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp index 2881161d7f..b334a11227 100644 --- a/src/core/NEON/kernels/NERemapKernel.cpp +++ b/src/core/NEON/kernels/NERemapKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,15 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NERemapKernel.h" +#include "src/core/NEON/kernels/NERemapKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/ScaleHelpers.h" +#include "src/core/helpers/WindowHelpers.h" #include #include @@ -175,6 +178,8 @@ void NERemapKernel::remap_nearest(const Window &window) void NERemapKernel::remap_bilinear(const Window &window) { + using namespace scale_helpers; + // Don't increment in X and Y direction for the input tensor // A pointer to the start of this plane is needed as base for the precomputed offsets Window win_in(window); diff --git a/arm_compute/core/NEON/kernels/NERemapKernel.h b/src/core/NEON/kernels/NERemapKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NERemapKernel.h rename to src/core/NEON/kernels/NERemapKernel.h index 34c80a38d9..8fe1ba5855 100644 --- a/arm_compute/core/NEON/kernels/NERemapKernel.h +++ b/src/core/NEON/kernels/NERemapKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. 
+ * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_NEREMAPKERNEL_H #define ARM_COMPUTE_NEREMAPKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEReorgLayerKernel.cpp b/src/core/NEON/kernels/NEReorgLayerKernel.cpp index 317bc25967..0dcb439665 100644 --- a/src/core/NEON/kernels/NEReorgLayerKernel.cpp +++ b/src/core/NEON/kernels/NEReorgLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEReorgLayerKernel.h" +#include "src/core/NEON/kernels/NEReorgLayerKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -30,6 +30,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEReorgLayerKernel.h b/src/core/NEON/kernels/NEReorgLayerKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NEReorgLayerKernel.h rename to src/core/NEON/kernels/NEReorgLayerKernel.h index d751a6b24c..eac91154a1 100644 --- a/arm_compute/core/NEON/kernels/NEReorgLayerKernel.h +++ b/src/core/NEON/kernels/NEReorgLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEREORGLAYERKERNEL_H #define ARM_COMPUTE_NEREORGLAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp index 23b349b443..462404f996 100644 --- a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp +++ b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp @@ -21,18 +21,20 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
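// [Editor's note] The renames above follow one pattern repeated throughout
// this patch: kernel headers move from the public include tree into the
// internal one, e.g.
//
//     before:  #include "arm_compute/core/NEON/kernels/NEReorgLayerKernel.h"
//     after :  #include "src/core/NEON/kernels/NEReorgLayerKernel.h"
//
// Only code built inside the library can use the new path; external code is
// expected to go through the runtime functions (NEReorgLayer etc.) rather
// than instantiate kernels directly.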
*/ -#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h" +#include "src/core/NEON/kernels/NEReshapeLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include diff --git a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h b/src/core/NEON/kernels/NEReshapeLayerKernel.h similarity index 75% rename from arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h rename to src/core/NEON/kernels/NEReshapeLayerKernel.h index a4b8426e41..ecec8d9f1f 100644 --- a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h +++ b/src/core/NEON/kernels/NEReshapeLayerKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_NERESHAPELAYERKERNEL_H #define ARM_COMPUTE_NERESHAPELAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/NEON/INESimpleKernel.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/NEON/INESimpleKernel.h" namespace arm_compute { @@ -40,6 +40,18 @@ class NEReshapeLayerKernel : public INEKernel { return "NEReshapeLayerKernel"; } + /** Default constructor */ + NEReshapeLayerKernel() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEReshapeLayerKernel(const NEReshapeLayerKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEReshapeLayerKernel &operator=(const NEReshapeLayerKernel &) = delete; + /** Allow instances of this class to be moved */ + NEReshapeLayerKernel(NEReshapeLayerKernel &&) = default; + /** Allow instances of this class to be moved */ + NEReshapeLayerKernel &operator=(NEReshapeLayerKernel &&) = default; + /** Default destructor */ + ~NEReshapeLayerKernel() = default; /** Set the input and output info of the kernel * * @param[in] input Source tensor info. Data type supported: All diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp index 2e6135b44c..21c758053a 100644 --- a/src/core/NEON/kernels/NEReverseKernel.cpp +++ b/src/core/NEON/kernels/NEReverseKernel.cpp @@ -21,12 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEReverseKernel.h" +#include "src/core/NEON/kernels/NEReverseKernel.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" namespace arm_compute { diff --git a/arm_compute/core/NEON/kernels/NEReverseKernel.h b/src/core/NEON/kernels/NEReverseKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NEReverseKernel.h rename to src/core/NEON/kernels/NEReverseKernel.h index fda79154a0..07b547a327 100644 --- a/arm_compute/core/NEON/kernels/NEReverseKernel.h +++ b/src/core/NEON/kernels/NEReverseKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NEREVERSEKERNEL_H #define ARM_COMPUTE_NEREVERSEKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp index 94fcfe2ff8..5a6d49bf07 100644 --- a/src/core/NEON/kernels/NEScaleKernel.cpp +++ b/src/core/NEON/kernels/NEScaleKernel.cpp @@ -21,24 +21,39 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEScaleKernel.h" +#include "src/core/NEON/kernels/NEScaleKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/Rounding.h" #include "arm_compute/core/utils/misc/Utility.h" - +#include "src/core/AccessWindowStatic.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/ScaleHelpers.h" +#include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/ScaleUtils.h" +#include "support/Rounding.h" #include +#include namespace arm_compute { namespace { +inline float compute_bilinear(float a00, float a01, float a10, float a11, float dx_val, float dy_val) +{ + const float dx1_val = 1.0f - dx_val; + const float dy1_val = 1.0f - dy_val; + + const float w1 = dx1_val * dy1_val; + const float w2 = dx_val * dy1_val; + const float w3 = dx1_val * dy_val; + const float w4 = dx_val * dy_val; + return a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4; +} + Status validate_arguments(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *output, const ScaleKernelInfo &info) { @@ -48,8 +63,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *dx, const ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON(output == input); ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT); - ARM_COMPUTE_RETURN_ERROR_ON(!info.use_padding && info.border_mode != BorderMode::CONSTANT); ARM_COMPUTE_UNUSED(info.constant_border_value); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.use_padding, "Padding is not supported"); const DataLayout data_layout = input->data_layout(); const auto width_index = 
get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -71,7 +86,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *dx, const ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32); } - ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy)); + ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy)); if(info.interpolation_policy == InterpolationPolicy::AREA) { @@ -81,265 +96,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *dx, const return Status{}; } - -std::pair validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *dx, ITensorInfo *dy, ITensorInfo *offsets, ITensorInfo *output, - const ScaleKernelInfo &info, BorderSize border_size) -{ - bool window_changed{ false }; - Window win{}; - - constexpr unsigned int num_elems_processed_per_iteration = 16; - - // Configure kernel window - win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - - const ValidRegion &input_valid_region = input->valid_region(); - - if(offsets != nullptr) - { - AccessWindowHorizontal offsets_access(offsets, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win, offsets_access); - } - if(dx != nullptr && dy != nullptr) - { - AccessWindowHorizontal dx_access(dx, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal dy_access(dy, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win, dx_access, dy_access); - } - - // Reads can occur within the valid region of the input - AccessWindowStatic input_access(input, input_valid_region.anchor[0] - border_size.left, - input_valid_region.anchor[1] - border_size.top, - input_valid_region.anchor[0] + input_valid_region.shape[0] + border_size.right, - input_valid_region.anchor[1] + input_valid_region.shape[1] + border_size.bottom); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, calculate_valid_region_scale(*input, output->tensor_shape(), - info.interpolation_policy, info.sampling_policy, info.border_mode == BorderMode::UNDEFINED)); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} - -std::pair validate_and_configure_window_nhwc(ITensorInfo *input, ITensorInfo *output, const ScaleKernelInfo &info, BorderSize border_size) -{ - bool window_changed{ false }; - Window win{}; - - const unsigned int num_elems_processed_per_iteration = (info.use_padding && info.interpolation_policy == InterpolationPolicy::NEAREST_NEIGHBOR) ? 
16 / input->element_size() : 1; - - // Configure kernel window - win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - - if(info.use_padding) - { - AccessWindowStatic input_access(input, 0, -border_size.top, ceil_to_multiple(input->tensor_shape()[0], num_elems_processed_per_iteration), input->tensor_shape()[1]); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - window_changed = update_window_and_padding(win, input_access, output_access); - output->set_valid_region(calculate_valid_region_scale(*input, output->tensor_shape(), info.interpolation_policy, info.sampling_policy, info.border_mode == BorderMode::UNDEFINED)); - } - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} - -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *dx, ITensorInfo *dy, ITensorInfo *offsets, ITensorInfo *output, - const ScaleKernelInfo &info, BorderSize border_size) -{ - std::pair win_config; - switch(input->data_layout()) - { - case DataLayout::NCHW: - if(!info.use_padding) - { - return std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Padding required for NCHW"), Window{}); - } - win_config = validate_and_configure_window_nchw(input, dx, dy, offsets, output, info, border_size); - break; - case DataLayout::NHWC: - win_config = validate_and_configure_window_nhwc(input, output, info, border_size); - break; - default: - win_config = std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported data layout!"), Window{}); - } - - return win_config; -} - -template -inline void scale_nearest_nhwc_core(const ITensor *input, const ITensor *offsets, ITensor *output, - float hr, Window window, const Window &win_in, size_t stride_w, size_t stride_h, size_t stride_c, float sampling_offset, bool align_corners) -{ - const int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - window.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(input, win_in); - Iterator out(output, window); - - const size_t offsets_stride = stride_w / sizeof(T); - - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto in_yi = static_cast(align_corners ? 
arm_compute::utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_yi * stride_h; - int32_t x = window_start_x; - for(; x < window_end_x - window_step_x; x += window_step_x) - { - wrapper::vstore(reinterpret_cast(out.ptr()) + x, - wrapper::vloadq(reinterpret_cast(in.ptr() + offset * offsets_stride + offset_row + x * stride_c))); - } - for(; x < window_end_x; ++x) - { - *(reinterpret_cast(out.ptr()) + x) = - *(reinterpret_cast(in.ptr() + offset * offsets_stride + offset_row + x * stride_c)); - } - }, - in, out); -} - -template -inline void scale_bilinear_nhwc_core(const ITensor *input, const ITensor *offsets, const ITensor *dx, const ITensor *dy, ITensor *output, - float hr, float sampling_offset, Window window, const Window &win_in, size_t stride_w, size_t stride_h, - size_t stride_c, BorderMode border_mode, PixelValue constant_border_value, bool use_padding) -{ - Iterator in(input, win_in); - Iterator out(output, window); - - const size_t stride_w_elems = stride_w / sizeof(T); - const size_t stride_h_elems = stride_h / sizeof(T); - - const int input_width = input->info()->dimension(1); - const int input_height = input->info()->dimension(2); - - T border_value; - if(use_padding && border_mode != BorderMode::REPLICATE) - { - // configure() sets top border to 0 for BorderMode::REPLICATE and border_value is not needed in execute_window_loop() for REPLICATE - border_value = *reinterpret_cast(input->buffer() + input->info()->offset_first_element_in_bytes() - stride_w); - } - else - { - border_value = static_cast(constant_border_value.get()); - } - - auto is_valid = [](int64_t x, int64_t low_x, int64_t high_x, int64_t y, int64_t low_y, int64_t high_y) - { - return !(x < low_x || x > high_x || y < low_y || y > high_y); - }; - - int border_size = (border_mode == BorderMode::UNDEFINED) ? 0 : 1; - - const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform(); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = (*reinterpret_cast(offsets->ptr_to_element(Coordinates(id.y(), id.z())))) / static_cast(sizeof(T)); - const auto dx_scale = *reinterpret_cast(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_scale = *reinterpret_cast(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_yi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const int offset_row = in_yi * stride_h + id.x() * stride_c; - const T *in_ptr = reinterpret_cast(in.ptr() + offset * stride_w + offset_row); - - if(is_valid(offset, -border_size, input_width - 1 + border_size, in_yi, -border_size, input_height - 1 + border_size)) - { - T a00 = 0; - T a01 = 0; - T a10 = 0; - T a11 = 0; - - if(border_mode == BorderMode::CONSTANT) - { - a00 = is_valid(offset, 0, input_width - 1, in_yi, 0, input_height - 1) ? *in_ptr : border_value; - a01 = is_valid(offset + 1, 0, input_width - 1, in_yi, 0, input_height - 1) ? *(in_ptr + stride_w_elems) : border_value; - a10 = is_valid(offset, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems) : border_value; - a11 = is_valid(offset + 1, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? 
*(in_ptr + stride_h_elems + stride_w_elems) : border_value; - } - else if(border_mode == BorderMode::REPLICATE) - { - auto clamped_x = utility::clamp(offset, 0, input_width - 1); - auto clamped_x1 = utility::clamp(offset + 1, 0, input_width - 1); - auto clamped_y = utility::clamp(in_yi, 0, input_height - 1); - auto clamped_y1 = utility::clamp(in_yi + 1, 0, input_height - 1); - - a00 = *reinterpret_cast(in.ptr() + clamped_x * stride_w + clamped_y * stride_h + id.x() * stride_c); - a01 = *reinterpret_cast(in.ptr() + clamped_x1 * stride_w + clamped_y * stride_h + id.x() * stride_c); - a10 = *reinterpret_cast(in.ptr() + clamped_x * stride_w + clamped_y1 * stride_h + id.x() * stride_c); - a11 = *reinterpret_cast(in.ptr() + clamped_x1 * stride_w + clamped_y1 * stride_h + id.x() * stride_c); - } - else - { - a00 = is_valid(offset, 0, input_width - 1, in_yi, 0, input_height - 1) ? *in_ptr : 0; - a01 = is_valid(offset + 1, 0, input_width - 1, in_yi, 0, input_height - 1) ? *(in_ptr + stride_w_elems) : 0; - a10 = is_valid(offset, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems) : 0; - a11 = is_valid(offset + 1, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems + stride_w_elems) : 0; - } - - // Perform interpolation - const float dx1 = 1.0f - dx_scale; - const float dy1 = 1.0f - dy_scale; - - const float w1 = dx1 * dy1; - const float w2 = dx_scale * dy1; - const float w3 = dx1 * dy_scale; - const float w4 = dx_scale * dy_scale; - - T res = 0; - //dequantize quantized input - if(input->info()->data_type() == DataType::QASYMM8) - { - float inp00 = dequantize_qasymm8(a00, iq_info); - float inp01 = dequantize_qasymm8(a01, iq_info); - float inp10 = dequantize_qasymm8(a10, iq_info); - float inp11 = dequantize_qasymm8(a11, iq_info); - res = static_cast(quantize_qasymm8((inp00 * w1 + inp01 * w2 + inp10 * w3 + inp11 * w4), oq_info)); - } - else if(input->info()->data_type() == DataType::QASYMM8_SIGNED) - { - float inp00 = dequantize_qasymm8_signed(a00, iq_info); - float inp01 = dequantize_qasymm8_signed(a01, iq_info); - float inp10 = dequantize_qasymm8_signed(a10, iq_info); - float inp11 = dequantize_qasymm8_signed(a11, iq_info); - res = static_cast(quantize_qasymm8_signed((inp00 * w1 + inp01 * w2 + inp10 * w3 + inp11 * w4), oq_info)); - } - else - { - res = static_cast(a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4); - } - // Store result - *reinterpret_cast(out.ptr()) = res; - } - else - { - if(border_mode == BorderMode::CONSTANT) - { - *reinterpret_cast(out.ptr()) = border_value; - } - else if(border_mode == BorderMode::REPLICATE) - { - auto clamped_x = utility::clamp(offset, 0, input_width - 1); - auto clamped_y = utility::clamp(in_yi, 0, input_height - 1); - *reinterpret_cast(out.ptr()) = *reinterpret_cast(in.ptr() + clamped_x * stride_w + clamped_y * stride_h + id.x() * stride_c); - } - } - }, - in, out); -} } // namespace NEScaleKernel::NEScaleKernel() - : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr), _policy(), _border_size(1), _border_mode(), _constant_border_value(PixelValue()), - _sampling_offset(0), _use_padding(true), _align_corners(false) -{ -} - -BorderSize NEScaleKernel::border_size() const + : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr), _policy(), _border_mode(), _constant_border_value(PixelValue()), _sampling_offset(0), + _align_corners(false) { - return _border_size; } void NEScaleKernel::configure(const ITensor *input, const 
ITensor *dx, const ITensor *dy, const ITensor *offsets, @@ -365,10 +127,8 @@ void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITe _dx = dx; _dy = dy; _policy = info.interpolation_policy; - _border_size = BorderSize(1); _border_mode = info.border_mode; _constant_border_value = info.constant_border_value; - _use_padding = info.use_padding; _align_corners = info.align_corners; if(info.sampling_policy == SamplingPolicy::CENTER) @@ -377,58 +137,85 @@ void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITe } // Compute the ratio between source width/height and destination width/height - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), _align_corners); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_height), output->info()->dimension(idx_height), _align_corners); + const auto wr = scale_utils::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), _align_corners); + const auto hr = scale_utils::calculate_resize_ratio(input->info()->dimension(idx_height), output->info()->dimension(idx_height), _align_corners); + + // Area interpolation behaves as Nearest Neighbour in case of up-sampling + const auto policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : _policy; - // Add constant border only on top in case of NHWC layout - if(data_layout == DataLayout::NHWC) + if(_border_mode == BorderMode::UNDEFINED) { - _border_size = (info.border_mode != BorderMode::REPLICATE && info.interpolation_policy == InterpolationPolicy::BILINEAR && info.use_padding) ? BorderSize(1, 0, 0, 0) : BorderSize(0); + _border_mode = BorderMode::CONSTANT; + _constant_border_value = PixelValue(); } + std::string function_to_call("scale_"); + function_to_call += string_from_data_type(_input->info()->data_type()) + "_"; + function_to_call += string_from_data_layout(_input->info()->data_layout()) + "_"; + function_to_call += string_from_interpolation_policy(policy_to_use); - // Area interpolation behaves as Nearest Neighbour in case of up-sampling - const auto policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? 
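// [Editor's note] calculate_resize_ratio() returns input_size / output_size,
// so wr <= 1 and hr <= 1 means up-sampling. In that regime each output pixel
// covers at most one input pixel, so AREA interpolation degenerates to
// nearest-neighbour, which is why this ternary falls back to
// NEAREST_NEIGHBOR. Example: in = 4, out = 8 gives wr = 0.5, i.e. up-sampling.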
InterpolationPolicy::NEAREST_NEIGHBOR : _policy; + static std::map map_function = + { + { "scale_U8_NCHW_AREA_CONSTANT", &NEScaleKernel::scale_area_nchw_u8 }, - // Select interpolation function - switch(policy_to_use) + { "scale_U8_NCHW_BILINEAR", &NEScaleKernel::scale_bilinear_nchw }, + { "scale_U8_NCHW_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nchw }, + + { "scale_U8_NHWC_BILINEAR", &NEScaleKernel::scale_bilinear_nhwc }, + { "scale_U8_NHWC_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nhwc }, + + { "scale_QASYMM8_NCHW_BILINEAR", &NEScaleKernel::scale_bilinear_qasymm }, + { "scale_QASYMM8_NCHW_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nchw }, + + { "scale_QASYMM8_NHWC_BILINEAR", &NEScaleKernel::scale_bilinear_qasymm }, + { "scale_QASYMM8_NHWC_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nhwc }, + + { "scale_QASYMM8_SIGNED_NCHW_BILINEAR", &NEScaleKernel::scale_bilinear_qasymm }, + { "scale_QASYMM8_SIGNED_NCHW_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nchw }, + + { "scale_QASYMM8_SIGNED_NHWC_BILINEAR", &NEScaleKernel::scale_bilinear_qasymm }, + { "scale_QASYMM8_SIGNED_NHWC_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nhwc }, + + { "scale_S16_NCHW_BILINEAR", &NEScaleKernel::scale_bilinear_nchw }, + { "scale_S16_NCHW_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nchw }, + + { "scale_S16_NHWC_BILINEAR", &NEScaleKernel::scale_bilinear_nhwc }, + { "scale_S16_NHWC_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nhwc }, + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + { "scale_F16_NCHW_BILINEAR", &NEScaleKernel::scale_bilinear_nchw }, + { "scale_F16_NCHW_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nchw }, + + { "scale_F16_NHWC_BILINEAR", &NEScaleKernel::scale_bilinear_nhwc }, + { "scale_F16_NHWC_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nhwc }, +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + + { "scale_F32_NCHW_BILINEAR", &NEScaleKernel::scale_bilinear_nchw }, + { "scale_F32_NCHW_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nchw }, + + { "scale_F32_NHWC_BILINEAR", &NEScaleKernel::scale_bilinear_nhwc }, + { "scale_F32_NHWC_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nhwc }, + }; + auto it = map_function.find(function_to_call); + if(it != map_function.end()) { - case InterpolationPolicy::NEAREST_NEIGHBOR: - { - _func = (data_layout == DataLayout::NCHW) ? &NEScaleKernel::scale_nearest_nchw : &NEScaleKernel::scale_nhwc; - break; - } - case InterpolationPolicy::BILINEAR: - { - _func = (data_layout == DataLayout::NCHW) ? &NEScaleKernel::scale_bilinear_nchw : &NEScaleKernel::scale_nhwc; - break; - } - case InterpolationPolicy::AREA: - { - _func = &NEScaleKernel::scale_area_nchw; - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported interpolation mode"); + _func = it->second; } // Configure window - std::pair win_config = validate_and_configure_window(input->info(), - dx != nullptr ? dx->info() : nullptr, - dy != nullptr ? dy->info() : nullptr, - offsets != nullptr ? 
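// [Editor's note] The map above replaces the old switch on interpolation
// policy with string-keyed dispatch: a key such as "scale_F32_NHWC_BILINEAR"
// is assembled from data type, layout and policy, then looked up to select a
// member-function pointer. A minimal standalone sketch of the idea
// (hypothetical types):
//
//     #include <map>
//     #include <string>
//     struct Scaler
//     {
//         using Fn = void (Scaler::*)();
//         void bilinear_f32() {}
//         Fn   _func{ nullptr };
//         void pick(const std::string &dt, const std::string &layout, const std::string &policy)
//         {
//             static const std::map<std::string, Fn> table = {
//                 { "scale_F32_NHWC_BILINEAR", &Scaler::bilinear_f32 },
//             };
//             const auto it = table.find("scale_" + dt + "_" + layout + "_" + policy);
//             if(it != table.end())
//             {
//                 _func = it->second; // unmatched keys leave _func null, as in the kernel
//             }
//         }
//     };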
offsets->info() : nullptr, - output->info(), - info, border_size()); - - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); + Window win = calculate_max_window(*output->info(), Steps()); + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + INEKernel::configure(win); } +template void NEScaleKernel::scale_nearest_nchw(const Window &window) { - const size_t input_stride = _input->info()->strides_in_bytes()[1]; + const size_t in_stride_x = _input->info()->dimension(0) + _input->info()->padding().left + _input->info()->padding().right; // Compute the ratio between source height and destination height - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners); + const auto hr = scale_utils::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners); // Don't increment in X and Y direction for the input tensor // A pointer to the start of this plane is needed as base for the precomputed offsets @@ -449,216 +236,24 @@ void NEScaleKernel::scale_nearest_nchw(const Window &window) Iterator in(_input, win_in); Iterator out(_output, window); Iterator offsets(_offsets, win_off); - - switch(_input->info()->data_type()) + execute_window_loop(window, [&](const Coordinates & id) { - case DataType::QASYMM8_SIGNED: - { - int8x16_t tmp = vdupq_n_s8(0); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast(offsets.ptr()); - const uint8_t *const in_ptr = in.ptr(); - - const auto in_yi = static_cast(_align_corners ? arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr)); - const int in_yi_clamped = std::min(static_cast(_input->info()->dimension(1)), std::max(in_yi, -1)); - ARM_COMPUTE_ERROR_ON(in_yi_clamped < -1 || in_yi_clamped > static_cast(_input->info()->dimension(1))); - const int offset_row = in_yi_clamped * input_stride; - - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[0] + offset_row], tmp, 0); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[1] + offset_row], tmp, 1); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[2] + offset_row], tmp, 2); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[3] + offset_row], tmp, 3); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[4] + offset_row], tmp, 4); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[5] + offset_row], tmp, 5); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[6] + offset_row], tmp, 6); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[7] + offset_row], tmp, 7); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[8] + offset_row], tmp, 8); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[9] + offset_row], tmp, 9); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[10] + offset_row], tmp, 10); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[11] + offset_row], tmp, 11); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[12] + offset_row], tmp, 12); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[13] + offset_row], tmp, 13); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[14] + offset_row], tmp, 14); - tmp = vsetq_lane_s8(in_ptr[offsets_ptr[15] + offset_row], tmp, 15); - - vst1q_s8(reinterpret_cast(out.ptr()), tmp); - }, - in, offsets, out); - break; - } - case DataType::QASYMM8: - case DataType::U8: - { - uint8x16_t tmp = vdupq_n_u8(0); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast(offsets.ptr()); - 
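// [Editor's note] Both the old and the new paths lean on tensors precomputed
// by the NEScale function: 'offsets' holds, for every destination column, the
// index of the source column, and 'dx'/'dy' hold the bilinear fractions. A
// sketch of how such offsets could be produced (hypothetical helper mirroring
// the sampling math used in this kernel; not the library's actual code):
//
//     #include <cmath>
//     #include <vector>
//     std::vector<int> make_offsets(int out_w, float wr, float sampling_offset)
//     {
//         std::vector<int> offs(out_w);
//         for(int x = 0; x < out_w; ++x)
//         {
//             // map destination column to source column, same form as in_yi below
//             offs[x] = static_cast<int>(std::floor((x + sampling_offset) * wr - sampling_offset));
//         }
//         return offs;
//     }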
const uint8_t *const in_ptr = in.ptr(); - - const auto in_yi = static_cast(_align_corners ? arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr)); - const int in_yi_clamped = std::min(static_cast(_input->info()->dimension(1)), std::max(in_yi, -1)); - ARM_COMPUTE_ERROR_ON(in_yi_clamped < -1 || in_yi_clamped > static_cast(_input->info()->dimension(1))); - const int offset_row = in_yi_clamped * input_stride; - - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[0] + offset_row], tmp, 0); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[1] + offset_row], tmp, 1); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[2] + offset_row], tmp, 2); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[3] + offset_row], tmp, 3); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[4] + offset_row], tmp, 4); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[5] + offset_row], tmp, 5); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[6] + offset_row], tmp, 6); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[7] + offset_row], tmp, 7); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[8] + offset_row], tmp, 8); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[9] + offset_row], tmp, 9); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[10] + offset_row], tmp, 10); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[11] + offset_row], tmp, 11); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[12] + offset_row], tmp, 12); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[13] + offset_row], tmp, 13); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[14] + offset_row], tmp, 14); - tmp = vsetq_lane_u8(in_ptr[offsets_ptr[15] + offset_row], tmp, 15); - - vst1q_u8(out.ptr(), tmp); - }, - in, offsets, out); - break; - } - case DataType::S16: - { - int16x8x2_t tmp = - { - { - vdupq_n_s16(0), - vdupq_n_s16(0) - } - }; - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast(offsets.ptr()); - const auto in_yi = static_cast(_align_corners ? 
arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr)); - const int offset_row = in_yi * input_stride; - - tmp.val[0] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0); - tmp.val[0] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[0], 1); - tmp.val[0] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 2); - tmp.val[0] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[0], 3); - tmp.val[0] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 4); - tmp.val[0] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[0], 5); - tmp.val[0] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 6); - tmp.val[0] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[0], 7); - - tmp.val[1] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0); - tmp.val[1] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[1], 1); - tmp.val[1] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 2); - tmp.val[1] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[1], 3); - tmp.val[1] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 4); - tmp.val[1] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[1], 5); - tmp.val[1] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 6); - tmp.val[1] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[1], 7); - - vst2q_s16(reinterpret_cast(out.ptr()), tmp); - }, - in, offsets, out); - break; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - float16x8x2_t tmp = - { - { - vdupq_n_f16(0), - vdupq_n_f16(0) - } - }; - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast(offsets.ptr()); - const auto in_yi = static_cast(_align_corners ? 
arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr)); - const int offset_row = in_yi * input_stride; - - tmp.val[0] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0); - tmp.val[0] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[0], 1); - tmp.val[0] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 2); - tmp.val[0] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[0], 3); - tmp.val[0] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 4); - tmp.val[0] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[0], 5); - tmp.val[0] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 6); - tmp.val[0] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[0], 7); - - tmp.val[1] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0); - tmp.val[1] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[1], 1); - tmp.val[1] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 2); - tmp.val[1] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[1], 3); - tmp.val[1] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 4); - tmp.val[1] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[1], 5); - tmp.val[1] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 6); - tmp.val[1] = vsetq_lane_f16(*reinterpret_cast(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[1], 7); - - vst2q_f16(reinterpret_cast<__fp16 *>(out.ptr()), tmp); - }, - in, offsets, out); - break; - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - { - float32x4x4_t tmp = - { - { - vdupq_n_f32(0), - vdupq_n_f32(0), - vdupq_n_f32(0), - vdupq_n_f32(0) - } - }; - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast(offsets.ptr()); - const auto in_yi = static_cast(_align_corners ? 
arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr)); - const int offset_row = in_yi * input_stride; - - tmp.val[0] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0); - tmp.val[0] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 1); - tmp.val[0] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 2); - tmp.val[0] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 3); - - tmp.val[1] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0); - tmp.val[1] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 1); - tmp.val[1] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 2); - tmp.val[1] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 3); - - tmp.val[2] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[2], 0); - tmp.val[2] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[2], 1); - tmp.val[2] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[2], 2); - tmp.val[2] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[2], 3); - - tmp.val[3] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[3], 0); - tmp.val[3] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[3], 1); - tmp.val[3] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[3], 2); - tmp.val[3] = vsetq_lane_f32(*reinterpret_cast(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[3], 3); - - vst4q_f32(reinterpret_cast(out.ptr()), tmp); - }, - in, offsets, out); - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - } + const auto offsets_ptr = reinterpret_cast(offsets.ptr()); + const auto in_yi = static_cast(_align_corners ? 
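// [Editor's note] The hand-unrolled vsetq_lane gathers deleted above (one
// block per data type) collapse into this single templated scalar loop: the
// loads are data-dependent gathers, so building vectors lane by lane bought
// little over scalar code while forcing padded borders. The row selection,
// extracted as a standalone sketch (std::lround stands in for
// round_half_away_from_zero; both round halves away from zero):
//
//     #include <cmath>
//     int src_row(int y, float sampling_offset, float hr, bool align_corners)
//     {
//         const float fy = (y + sampling_offset) * hr;
//         return align_corners ? static_cast<int>(std::lround(fy))
//                              : static_cast<int>(std::floor(fy));
//     }
//
// e.g. hr = 2.0f, sampling_offset = 0.5f, y = 3 maps to source row 7.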
utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr)); + const int32_t offset_row = in_yi * in_stride_x; + *reinterpret_cast(out.ptr()) = *(reinterpret_cast(in.ptr()) + offsets_ptr[0] + offset_row); + }, + in, offsets, out); } +template void NEScaleKernel::scale_bilinear_nchw(const Window &window) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::F32); - // Compute the ratio between source height and destination height - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners); + const auto hr = scale_utils::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners); + Window win_off; + win_off.set(Window::DimX, window.x()); + win_off.set(Window::DimY, window.y()); // Don't increment in X and Y direction for the input tensor // A pointer to the start of this plane is needed as base for the precomputed offsets @@ -666,10 +261,6 @@ void NEScaleKernel::scale_bilinear_nchw(const Window &window) win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - Window win_off; - win_off.set(Window::DimX, window.x()); - win_off.set(Window::DimY, window.y()); - for(size_t d = Window::DimZ; d < _offsets->info()->num_dimensions(); ++d) { win_off.set(d, Window::Dimension(0, 0, 0)); @@ -681,272 +272,75 @@ void NEScaleKernel::scale_bilinear_nchw(const Window &window) Iterator dx(_dx, win_off); Iterator dy(_dy, win_off); - /* Input image stride */ - const size_t in_stide_in_bytes = _input->info()->strides_in_bytes()[1]; - const size_t in_stride = in_stide_in_bytes / _input->info()->element_size(); - - const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform(); + const int32_t in_dim_w = _input->info()->dimension(0); + const int32_t in_dim_h = _input->info()->dimension(1); + const int32_t in_stride_w = in_dim_w + _input->info()->padding().left + _input->info()->padding().right; - switch(_input->info()->data_type()) + if(_border_mode == BorderMode::CONSTANT) { - case DataType::QASYMM8_SIGNED: - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast(offsets.ptr()); - const auto dx_ptr = reinterpret_cast(dx.ptr()); - const auto dy_ptr = reinterpret_cast(dy.ptr()); - const auto in_ptr = reinterpret_cast(in.ptr()); - - const int in_yi = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset); - const int offset_row = in_yi * in_stide_in_bytes; - - int8x8_t tmp0 = vdup_n_s8(0); - - tmp0 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0], iq_info, oq_info), tmp0, 0); - tmp0 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1], iq_info, oq_info), tmp0, 1); - tmp0 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2], iq_info, oq_info), tmp0, 2); - tmp0 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3], iq_info, oq_info), tmp0, 3); - tmp0 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4], 
iq_info, oq_info), tmp0, 4); - tmp0 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5], iq_info, oq_info), tmp0, 5); - tmp0 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6], iq_info, oq_info), tmp0, 6); - tmp0 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7], iq_info, oq_info), tmp0, 7); - - int8x8_t tmp1 = vdup_n_s8(0); - - tmp1 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8], iq_info, oq_info), tmp1, 0); - tmp1 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9], iq_info, oq_info), tmp1, 1); - tmp1 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10], iq_info, oq_info), tmp1, 2); - tmp1 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11], iq_info, oq_info), tmp1, 3); - tmp1 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12], iq_info, oq_info), tmp1, 4); - tmp1 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13], iq_info, oq_info), tmp1, 5); - tmp1 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14], iq_info, oq_info), tmp1, 6); - tmp1 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15], iq_info, oq_info), tmp1, 7); - - vst1q_s8(reinterpret_cast(out.ptr()), vcombine_s8(tmp0, tmp1)); - }, - in, offsets, dx, dy, out); - break; - } - case DataType::QASYMM8: - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast(offsets.ptr()); - const auto dx_ptr = reinterpret_cast(dx.ptr()); - const auto dy_ptr = reinterpret_cast(dy.ptr()); - const auto in_ptr = reinterpret_cast(in.ptr()); - - const int in_yi = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset); - const int offset_row = in_yi * in_stide_in_bytes; - - uint8x8_t tmp0 = vdup_n_u8(0); - - tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0], iq_info, oq_info), tmp0, 0); - tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1], iq_info, oq_info), tmp0, 1); - tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2], iq_info, oq_info), tmp0, 2); - tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3], iq_info, oq_info), tmp0, 3); - tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4], iq_info, oq_info), tmp0, 4); - tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5], iq_info, oq_info), tmp0, 5); - tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6], iq_info, oq_info), tmp0, 6); - tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7], iq_info, oq_info), tmp0, 7); - - uint8x8_t tmp1 = 
vdup_n_u8(0); - - tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8], iq_info, oq_info), tmp1, 0); - tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9], iq_info, oq_info), tmp1, 1); - tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10], iq_info, oq_info), tmp1, 2); - tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11], iq_info, oq_info), tmp1, 3); - tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12], iq_info, oq_info), tmp1, 4); - tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13], iq_info, oq_info), tmp1, 5); - tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14], iq_info, oq_info), tmp1, 6); - tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15], iq_info, oq_info), tmp1, 7); - - vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1)); - }, - in, offsets, dx, dy, out); - break; - } - case DataType::U8: - { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast(offsets.ptr()); - const auto dx_ptr = reinterpret_cast(dx.ptr()); - const auto dy_ptr = reinterpret_cast(dy.ptr()); - const auto in_ptr = reinterpret_cast(in.ptr()); - - const int in_yi = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset); - const int offset_row = in_yi * in_stide_in_bytes; - - uint8x8_t tmp0 = vdup_n_u8(0); - - tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0); - tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1); - tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2); - tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3); - tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4); - tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5); - tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6); - tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7); - - uint8x8_t tmp1 = vdup_n_u8(0); - - tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0); - tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1); - tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2); - tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3); - tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4); - tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), 
tmp1, 5);
-                tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6);
-                tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7);
-
-                vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
-            },
-            in, offsets, dx, dy, out);
-            break;
-        }
-        case DataType::S16:
-        {
-            execute_window_loop(window, [&](const Coordinates & id)
-            {
-                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
-                const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
-                const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
-
-                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
-                const int offset_row = in_yi * in_stide_in_bytes;
-
-                int16x8x2_t tmp =
-                {
-                    {
-                        vdupq_n_s16(0),
-                        vdupq_n_s16(0)
-                    }
-                };
-
-                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0);
-                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[0], 1);
-                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 2);
-                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[0], 3);
-                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 4);
-                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[0], 5);
-                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 6);
-                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[0], 7);
-
-                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0);
-                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[1], 1);
-                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 2);
-                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[1], 3);
-                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 4);
-                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[1], 5);
-                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 6);
-                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[1], 7);
-
-                vst2q_s16(reinterpret_cast<int16_t *>(out.ptr()), tmp);
-            },
-            in, offsets, dx, dy, out);
-            break;
-        }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-        {
-            execute_window_loop(window, [&](const Coordinates & id)
-            {
-                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
-                const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
-                const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
-
-                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
-                const int offset_row = in_yi * in_stide_in_bytes;
-
-                float16x8x2_t tmp =
-                {
-                    {
-                        vdupq_n_f16(0),
-                        vdupq_n_f16(0)
-                    }
-                };
-
-                tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0);
-                tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[0], 1);
-                tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 2);
-                tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[0], 3);
-                tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 4);
-                tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[0], 5);
-                tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 6);
-                tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[0], 7);
-
-                tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0);
-                tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[1], 1);
-                tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 2);
-                tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[1], 3);
-                tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 4);
-                tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[1], 5);
-                tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 6);
-                tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[1], 7);
-
-                vst2q_f16(reinterpret_cast<__fp16 *>(out.ptr()), tmp);
-            },
-            in, offsets, dx, dy, out);
-            break;
-        }
+        using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        using ConstType = T;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-        case DataType::F32:
+        const T const_border_value = static_cast<T>(_constant_border_value.get<ConstType>());
+        execute_window_loop(window, [&](const Coordinates & id)
         {
-            execute_window_loop(window, [&](const Coordinates & id)
-            {
-                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
-                const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
-                const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
-
-                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
-                const int offset_row = in_yi * in_stide_in_bytes;
-
-                float32x4x4_t tmp =
-                {
-                    {
-                        vdupq_n_f32(0),
-                        vdupq_n_f32(0),
-                        vdupq_n_f32(0),
-                        vdupq_n_f32(0)
-                    }
-                };
-
-                tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0);
-                tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 1);
-                tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 2);
-                tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 3);
-
-                tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0);
-                tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 1);
-                tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 2);
-                tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 3);
-
-                tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[2], 0);
-                tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[2], 1);
-                tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[2], 2);
-                tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[2], 3);
-
-                tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[3], 0);
-                tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[3], 1);
-                tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[3], 2);
-                tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[3], 3);
-
-                vst4q_f32(reinterpret_cast<float *>(out.ptr()), tmp);
-            },
-            in, offsets, dx, dy, out);
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not supported");
-            break;
+            const int32_t index_h       = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
+            const auto    index_w       = *(reinterpret_cast<const int32_t *>(offsets.ptr()));
+            const auto    dx_val        = *(reinterpret_cast<const float *>(dx.ptr()));
+            const auto    dy_val        = *(reinterpret_cast<const float *>(dy.ptr()));
+            const auto    pixel_row_ptr = reinterpret_cast<const T *>(in.ptr());
+
+            const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + index_h * in_stride_w)) : const_border_value;
+            const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ?
+                             (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w)) : const_border_value;
+            const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h
+                              && index_h < in_dim_h - 1) ?
+                             (*(pixel_row_ptr + index_w + index_h * in_stride_w + in_stride_w)) :
+                             const_border_value;
+            const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h
+                              && index_h < in_dim_h - 1) ?
+                             (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w + in_stride_w)) :
+                             const_border_value;
+
+            *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(compute_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+        },
+        in, offsets, dx, dy, out);
+    }
+    else if(_border_mode == BorderMode::REPLICATE)
+    {
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            const int  index_h       = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
+            const auto index_w       = *(reinterpret_cast<const int32_t *>(offsets.ptr()));
+            const auto dx_val        = *(reinterpret_cast<const float *>(dx.ptr()));
+            const auto dy_val        = *(reinterpret_cast<const float *>(dy.ptr()));
+            const auto pixel_row_ptr = reinterpret_cast<const T *>(in.ptr());
+
+            auto clamped_x  = utility::clamp(index_w, 0, in_dim_w - 1);
+            auto clamped_x1 = utility::clamp(index_w + 1, 0, in_dim_w - 1);
+            auto clamped_y  = utility::clamp(index_h, 0, in_dim_h - 1);
+            auto clamped_y1 = utility::clamp(index_h + 1, 0, in_dim_h - 1);
+
+            const auto a00 = *(pixel_row_ptr + clamped_x + clamped_y * in_stride_w);
+            const auto a01 = *(pixel_row_ptr + clamped_x1 + clamped_y * in_stride_w);
+            const auto a10 = *(pixel_row_ptr + clamped_x + clamped_y1 * in_stride_w);
+            const auto a11 = *(pixel_row_ptr + clamped_x1 + clamped_y1 * in_stride_w);
+
+            *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(compute_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+        },
+        in, offsets, dx, dy, out);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Not implemented");
+    }
 }
 
-void NEScaleKernel::scale_area_nchw(const Window &window)
+void NEScaleKernel::scale_area_nchw_u8(const Window &window)
 {
+    using namespace scale_helpers;
+
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8);
 
     // Don't increment in width/height/channels for the input tensor
@@ -959,8 +353,8 @@ void NEScaleKernel::scale_area_nchw(const Window &window)
     Iterator in(_input, win_in);
     Iterator out(_output, window);
 
-    const auto wr = arm_compute::scale_utils::calculate_resize_ratio(_input->info()->dimension(0), _output->info()->dimension(0), _align_corners);
-    const auto hr = arm_compute::scale_utils::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners);
+    const auto wr = scale_utils::calculate_resize_ratio(_input->info()->dimension(0), _output->info()->dimension(0), _align_corners);
+    const auto hr = scale_utils::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners);
     const auto w = _input->info()->dimension(0);
     const auto h = _input->info()->dimension(1);
     const size_t in_stride = _input->info()->strides_in_bytes()[1];
@@ -994,123 +388,232 @@ void NEScaleKernel::scale_area_nchw(const Window &window)
     in, out);
 }
 
-void NEScaleKernel::scale_nhwc(const Window &window)
+template <typename T>
+void NEScaleKernel::scale_nearest_nhwc(const Window &window)
 {
-    // Get data layout and width/height indices
-    const DataLayout data_layout  = DataLayout::NHWC;
-    const int        idx_channels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-    const int        idx_width    = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int        idx_height   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const size_t in_stride_c  = _input->info()->dimension(0) + _input->info()->padding().left + _input->info()->padding().right;
+    const size_t in_stride_w  = _input->info()->dimension(1) + _input->info()->padding().top + _input->info()->padding().bottom;
+    const size_t in_stride_wc = in_stride_w * in_stride_c;
+    const size_t in_dim_h     = _input->info()->dimension(2);
+
+    // Compute the ratio between source height and destination height
+    const auto hr             = scale_utils::calculate_resize_ratio(in_dim_h, _output->info()->dimension(2), _align_corners);
+    const auto window_start_x = static_cast<int32_t>(window.x().start());
+    const auto window_end_x   = static_cast<int32_t>(window.x().end());
+    const int  window_step_x  = 16 / sizeof(T);
-    const size_t input_stride_w = _input->info()->strides_in_bytes()[idx_width];
-    const size_t input_stride_h = _input->info()->strides_in_bytes()[idx_height];
-    const size_t input_stride_c = _input->info()->strides_in_bytes()[idx_channels];
+    Window win(window);
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    Iterator out(_output, win);
+    const uint8_t     *in_ptr_start        = _input->buffer() + _input->info()->offset_first_element_in_bytes();
+    const unsigned int in_stride_bytes_hwc = _input->info()->strides_in_bytes()[3];
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const int32_t offset     = *reinterpret_cast<const int32_t *>(_offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+        const auto    in_hi      = static_cast<int>(_align_corners ? utils::rounding::round_half_away_from_zero((id.z() + _sampling_offset) * hr) : std::floor((id.z() + _sampling_offset) * hr));
+        const int     offset_row = in_hi * in_stride_wc;
+        int32_t       x          = window_start_x;
+        const T      *in_ptr     = reinterpret_cast<const T *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+
+        for(; x <= window_end_x - window_step_x; x += window_step_x)
+        {
+            wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x,
+                            wrapper::vloadq(in_ptr + offset + offset_row + x));
+        }
+        for(; x < window_end_x; ++x)
+        {
+            *(reinterpret_cast<T *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
+        }
+    },
+    out);
+}
+
+template <typename T>
+void NEScaleKernel::scale_bilinear_nhwc(const Window &window)
+{
     // Compute the ratio between source height and destination height
-    const auto hr = arm_compute::scale_utils::calculate_resize_ratio(_input->info()->dimension(idx_height), _output->info()->dimension(idx_height), _align_corners);
+    const auto hr = scale_utils::calculate_resize_ratio(_input->info()->dimension(2), _output->info()->dimension(2), _align_corners);
 
-    // Don't increment in width/height/channels for the input tensor
+    Iterator  out(_output, window);
+    const int in_stride_c  = _input->info()->dimension(0) + _input->info()->padding().left + _input->info()->padding().right;
+    const int in_dim_w     = _input->info()->dimension(1);
+    const int in_dim_h     = _input->info()->dimension(2);
+    const int in_stride_wc = in_stride_c * (in_dim_w + _input->info()->padding().top + _input->info()->padding().bottom);
+
+    // Don't increment in Y and Z direction for the input tensor
+    // A pointer to the start of this plane is needed as base for the precomputed offsets
     Window win_in(window);
-    win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
     win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
     win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+    Iterator in(_input, win_in);
 
-    switch(_input->info()->data_type())
+    if(_border_mode == BorderMode::CONSTANT)
     {
-        case DataType::QASYMM8_SIGNED:
-        {
-            if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
-            {
-                scale_nearest_nhwc_core<int8_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners);
-            }
-            else
-            {
-                scale_bilinear_nhwc_core<int8_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
-                                                 window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode, _constant_border_value, _use_padding);
-            }
-            break;
-        }
-        case DataType::QASYMM8:
-        case DataType::U8:
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        using ConstType = T;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        const T const_border_value = static_cast<T>(_constant_border_value.get<ConstType>());
+        execute_window_loop(window, [&](const Coordinates & id)
         {
-            if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
-            {
-                scale_nearest_nhwc_core<uint8_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners);
-            }
-            else
-            {
-                scale_bilinear_nhwc_core<uint8_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
-                                                  window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode, _constant_border_value, _use_padding);
-            }
-            break;
-        }
-        case DataType::S16:
+            const auto    offset = *reinterpret_cast<const int32_t *>(_offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+            const auto    dx_val = *reinterpret_cast<const float *>(_dx->ptr_to_element(Coordinates(id.y(), id.z())));
+            const auto    dy_val = *reinterpret_cast<const float *>(_dy->ptr_to_element(Coordinates(id.y(), id.z())));
+            const int32_t in_hi  = std::floor((id.z() + _sampling_offset) * hr - _sampling_offset);
+            const T      *in_ptr = reinterpret_cast<const T *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
+
+            const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
+            const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
+            const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
+            const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
+
+            *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(compute_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+        },
+        in, out);
+    }
+    else if(_border_mode == BorderMode::REPLICATE)
+    {
+        execute_window_loop(window, [&](const Coordinates & id)
         {
-            if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
-            {
-                scale_nearest_nhwc_core<int16_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners);
-            }
-            else
-            {
-                scale_bilinear_nhwc_core<int16_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
-                                                  window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode, _constant_border_value, _use_padding);
-            }
-            break;
-        }
+            const auto offset = *reinterpret_cast<const int32_t *>(_offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+            const auto dx_val = *reinterpret_cast<const float *>(_dx->ptr_to_element(Coordinates(id.y(), id.z())));
+            const auto dy_val = *reinterpret_cast<const float *>(_dy->ptr_to_element(Coordinates(id.y(), id.z())));
+            const int  in_hi  = std::floor((id.z() + _sampling_offset) * hr - _sampling_offset);
+
+            auto clamped_w  = utility::clamp(offset, 0, in_dim_w - 1);
+            auto clamped_w1 = utility::clamp(offset + 1, 0, in_dim_w - 1);
+            auto clamped_h  = utility::clamp(in_hi, 0, in_dim_h - 1);
+            auto clamped_h1 = utility::clamp(in_hi + 1, 0, in_dim_h - 1);
+
+            const auto a00 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
+            const auto a01 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
+            const auto a10 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
+            const auto a11 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
+
+            *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(compute_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+        },
+        in, out);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Not implemented");
+    }
+}
+
+template <typename T>
+void NEScaleKernel::scale_bilinear_qasymm(const Window &window)
+{
+    // Get data layout and width/height indices
+    const DataLayout data_layout = _input->info()->data_layout();
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    // Compute the ratio between source height and destination height
+    const auto hr = scale_utils::calculate_resize_ratio(_input->info()->dimension(idx_height), _output->info()->dimension(idx_height), _align_corners);
+    Window     win_off;
+    win_off.set(Window::DimX, Window::Dimension(0, 0, 0));
+    win_off.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+    // Don't increment in X and Y direction for the input tensor
+    // A pointer to the start of this plane is needed as base for the precomputed offsets
+    Window win_in(window);
+    win_in.set(idx_width, Window::Dimension(0, 0, 0));
+    win_in.set(idx_height, Window::Dimension(0, 0, 0));
+
+    for(size_t d = Window::DimZ; d < _offsets->info()->num_dimensions(); ++d)
+    {
+        win_off.set(d, Window::Dimension(0, 0, 0));
+    }
+
+    Iterator in(_input, win_in);
+    Iterator out(_output, window);
+
+    const int32_t in_dim_w = _input->info()->dimension(idx_width);
+    const int32_t in_dim_h = _input->info()->dimension(idx_height);
+    const int32_t stride_w = _input->info()->strides_in_bytes()[idx_width];
+    const int32_t stride_h = _input->info()->strides_in_bytes()[idx_height];
+
+    const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform();
+    const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform();
+
+    if(_border_mode == BorderMode::CONSTANT)
+    {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-        {
-            if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
-            {
-                scale_nearest_nhwc_core<float16_t>(_input, _offsets, _output, hr,
-                                                   window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners);
-            }
-            else
-            {
-                scale_bilinear_nhwc_core<float16_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
-                                                    window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode, _constant_border_value, _use_padding);
-            }
-            break;
-        }
+        using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        using ConstType = T;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-        case DataType::F32:
+        const T const_border_value = static_cast<T>(_constant_border_value.get<ConstType>());
+        execute_window_loop(window, [&](const Coordinates & id)
         {
-            if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
-            {
-                scale_nearest_nhwc_core<float>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners);
-            }
-            else
-            {
-                scale_bilinear_nhwc_core<float>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
-                                                window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode, _constant_border_value, _use_padding);
-            }
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not supported");
-            break;
+            const int32_t index_h       = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset);
+            const int32_t index_w       = *(reinterpret_cast<const int32_t *>(_offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+            const auto    dx_val        = *(reinterpret_cast<const float *>(_dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+            const auto    dy_val        = *(reinterpret_cast<const float *>(_dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+            const auto    pixel_row_ptr = reinterpret_cast<const T *>(in.ptr());
+
+            const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ?
+                             (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) :
+                             const_border_value;
+            const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ?
+                             (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) :
+                             const_border_value;
+            const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ?
+                             (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) :
+                             const_border_value;
+            const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ?
+                             (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) :
+                             const_border_value;
+
+            const float inp00 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info);
+            const float inp01 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info);
+            const float inp10 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info);
+            const float inp11 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info);
+            *reinterpret_cast<T *>(out.ptr()) = Qasymm8QuantizationHelper<T>::quantize(compute_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
+        },
+        in, out);
+    }
+    else if(_border_mode == BorderMode::REPLICATE)
+    {
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            const int     index_h       = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset);
+            const int32_t index_w       = *(reinterpret_cast<const int32_t *>(_offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+            const auto    dx_val        = *(reinterpret_cast<const float *>(_dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+            const auto    dy_val        = *(reinterpret_cast<const float *>(_dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+            const auto    pixel_row_ptr = reinterpret_cast<const T *>(in.ptr());
+
+            auto clamped_w  = utility::clamp(index_w, 0, in_dim_w - 1);
+            auto clamped_w1 = utility::clamp(index_w + 1, 0, in_dim_w - 1);
+            auto clamped_h  = utility::clamp(index_h, 0, in_dim_h - 1);
+            auto clamped_h1 = utility::clamp(index_h + 1, 0, in_dim_h - 1);
+
+            const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h);
+            const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h);
+            const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h);
+            const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h);
+
+            const float inp00 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info);
+            const float inp01 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info);
+            const float inp10 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info);
+            const float inp11 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info);
+            *reinterpret_cast<T *>(out.ptr()) = Qasymm8QuantizationHelper<T>::quantize(compute_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
+        },
+        in, out);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Not implemented");
+    }
+}
 
 Status NEScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *output, const ScaleKernelInfo &info)
 {
-    BorderSize border_size(1);
-    if(input->data_layout() == DataLayout::NHWC)
-    {
-        border_size = (info.border_mode == BorderMode::CONSTANT && info.interpolation_policy == InterpolationPolicy::BILINEAR) ? BorderSize(1, 0, 0, 0) : BorderSize(0);
-    }
-
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
-                                                              dx != nullptr ? dx->clone().get() : nullptr,
-                                                              dy != nullptr ? dy->clone().get() : nullptr,
-                                                              offsets != nullptr ? offsets->clone().get() : nullptr,
-                                                              output->clone().get(),
-                                                              info, border_size)
-                                .first);
-
     return Status{};
 }
diff --git a/arm_compute/core/NEON/kernels/NEScaleKernel.h b/src/core/NEON/kernels/NEScaleKernel.h
similarity index 86%
rename from arm_compute/core/NEON/kernels/NEScaleKernel.h
rename to src/core/NEON/kernels/NEScaleKernel.h
index a2328b13bc..a3786db5b7 100644
--- a/arm_compute/core/NEON/kernels/NEScaleKernel.h
+++ b/src/core/NEON/kernels/NEScaleKernel.h
@@ -25,7 +25,7 @@
 #define ARM_COMPUTE_NESCALEKERNEL_H
 
 #include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
 
 namespace arm_compute
 {
@@ -83,34 +83,44 @@ class NEScaleKernel : public INEKernel
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
 
 private:
-    /** function to perform scale using nearest interpolation on the given window */
-    void scale_nearest_nchw(const Window &window);
-    /** function to perform scale using bilinear interpolation on the given window */
-    void scale_bilinear_nchw(const Window &window);
     /** function to perform scale using area interpolation on the given window
      *
     * @note Used only in case down-sampling.
     */
-    void scale_area_nchw(const Window &window);
-    /** function to perform scale on the given window */
-    void scale_nhwc(const Window &window);
-    /** Scale function to use for the particular interpolation type passed to configure() */
-    void (NEScaleKernel::*_func)(const Window &window);
+    void scale_area_nchw_u8(const Window &window);
+
+    /** function to perform scale using bilinear interpolation on the given window */
+    template <typename T>
+    void scale_bilinear_nchw(const Window &window);
+    /** function to perform scale using bilinear interpolation on the given window */
+    template <typename T>
+    void scale_bilinear_nhwc(const Window &window);
+    /** function to perform scale using bilinear interpolation on the given window */
+    template <typename T>
+    void scale_bilinear_qasymm(const Window &window);
+
+    /** function to perform scale using nearest neighbour on the given window */
+    template <typename T>
+    void scale_nearest_nchw(const Window &window);
+    /** function to perform scale using nearest neighbour on the given window */
+    template <typename T>
+    void scale_nearest_nhwc(const Window &window);
+
+    /** Scale function to use for the particular configuration */
+    using ScaleFunctionPtr = void (NEScaleKernel::*)(const Window &window);
+    ScaleFunctionPtr _func;
 
     const ITensor *_offsets;
    const ITensor *_dx;
    const ITensor *_dy;
    const ITensor *_input;
    ITensor       *_output;
    InterpolationPolicy _policy;
-    BorderSize          _border_size;
    BorderMode          _border_mode;
    PixelValue          _constant_border_value;
    float               _sampling_offset;
-    bool                _use_padding;
    bool                _align_corners;
 };
 } // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEScharr3x3Kernel.cpp b/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
index dcc9362cf0..58b8caa2b6 100644
--- a/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
*/ -#include "arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h" +#include "src/core/NEON/kernels/NEScharr3x3Kernel.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Error.h" @@ -29,6 +29,8 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h b/src/core/NEON/kernels/NEScharr3x3Kernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h rename to src/core/NEON/kernels/NEScharr3x3Kernel.h index 7e1fdb5d9e..920410ebb3 100644 --- a/arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h +++ b/src/core/NEON/kernels/NEScharr3x3Kernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NESCHARR3x3KERNEL_H #define ARM_COMPUTE_NESCHARR3x3KERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp index 86e8233e0f..9cf9b98a0c 100644 --- a/src/core/NEON/kernels/NESelectKernel.cpp +++ b/src/core/NEON/kernels/NESelectKernel.cpp @@ -21,17 +21,19 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NESelectKernel.h" +#include "src/core/NEON/kernels/NESelectKernel.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "utils/TypePrinter.h" #include @@ -229,7 +231,7 @@ void NESelectKernel::configure(const ITensor *c, const ITensor *x, const ITensor Status NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(c, x, y); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(c, x, y); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(x); ARM_COMPUTE_RETURN_ERROR_ON(x->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, y); diff --git a/arm_compute/core/NEON/kernels/NESelectKernel.h b/src/core/NEON/kernels/NESelectKernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NESelectKernel.h rename to src/core/NEON/kernels/NESelectKernel.h index bb8695f598..f7142feff8 100644 --- a/arm_compute/core/NEON/kernels/NESelectKernel.h +++ b/src/core/NEON/kernels/NESelectKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_NESELECTKERNEL_H #define ARM_COMPUTE_NESELECTKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NESobel3x3Kernel.cpp b/src/core/NEON/kernels/NESobel3x3Kernel.cpp index eb9d3c3020..ecf6b59c29 100644 --- a/src/core/NEON/kernels/NESobel3x3Kernel.cpp +++ b/src/core/NEON/kernels/NESobel3x3Kernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NESobel3x3Kernel.h" +#include "src/core/NEON/kernels/NESobel3x3Kernel.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Error.h" @@ -29,6 +29,8 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NESobel3x3Kernel.h b/src/core/NEON/kernels/NESobel3x3Kernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NESobel3x3Kernel.h rename to src/core/NEON/kernels/NESobel3x3Kernel.h index 66a13c4c26..2c3eaf5eb7 100644 --- a/arm_compute/core/NEON/kernels/NESobel3x3Kernel.h +++ b/src/core/NEON/kernels/NESobel3x3Kernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NESOBEL3x3KERNEL_H #define ARM_COMPUTE_NESOBEL3x3KERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NESobel5x5Kernel.cpp b/src/core/NEON/kernels/NESobel5x5Kernel.cpp index fc8ccc803d..5a66b1f364 100644 --- a/src/core/NEON/kernels/NESobel5x5Kernel.cpp +++ b/src/core/NEON/kernels/NESobel5x5Kernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NESobel5x5Kernel.h" +#include "src/core/NEON/kernels/NESobel5x5Kernel.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Error.h" @@ -30,6 +30,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NESobel5x5Kernel.h b/src/core/NEON/kernels/NESobel5x5Kernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NESobel5x5Kernel.h rename to src/core/NEON/kernels/NESobel5x5Kernel.h index 02029b6a47..bd5eb29296 100644 --- a/arm_compute/core/NEON/kernels/NESobel5x5Kernel.h +++ b/src/core/NEON/kernels/NESobel5x5Kernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NESOBEL5x5KERNEL_H #define ARM_COMPUTE_NESOBEL5x5KERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NESobel7x7Kernel.cpp b/src/core/NEON/kernels/NESobel7x7Kernel.cpp index 95ab12b6cd..835b333a10 100644 --- a/src/core/NEON/kernels/NESobel7x7Kernel.cpp +++ b/src/core/NEON/kernels/NESobel7x7Kernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NESobel7x7Kernel.h" +#include "src/core/NEON/kernels/NESobel7x7Kernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -30,6 +30,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NESobel7x7Kernel.h b/src/core/NEON/kernels/NESobel7x7Kernel.h similarity index 98% rename from arm_compute/core/NEON/kernels/NESobel7x7Kernel.h rename to src/core/NEON/kernels/NESobel7x7Kernel.h index 0e8b82c96a..c5a3899bab 100644 --- a/arm_compute/core/NEON/kernels/NESobel7x7Kernel.h +++ b/src/core/NEON/kernels/NESobel7x7Kernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NESOBEL7x7KERNEL_H #define ARM_COMPUTE_NESOBEL7x7KERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp index bc5b0c0696..97797cefde 100644 --- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp +++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp @@ -21,21 +21,23 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h" +#include "src/core/NEON/kernels/NESoftmaxLayerKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/NEMath.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/SaturateCast.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/SaturateCast.h" #include #include diff --git a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h b/src/core/NEON/kernels/NESoftmaxLayerKernel.h similarity index 89% rename from arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h rename to src/core/NEON/kernels/NESoftmaxLayerKernel.h index e80cd222c5..adc2e57258 100644 --- a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h +++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H #define ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/NEON/INESimpleKernel.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/NEON/INESimpleKernel.h" namespace arm_compute { @@ -41,6 +41,16 @@ class NELogits1DMaxKernel : public INESimpleKernel } /** Default constructor */ NELogits1DMaxKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NELogits1DMaxKernel(const NELogits1DMaxKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NELogits1DMaxKernel &operator=(const NELogits1DMaxKernel &) = delete; + /** Allow instances of this class to be moved */ + NELogits1DMaxKernel(NELogits1DMaxKernel &&) = default; + /** Allow instances of this class to be moved */ + NELogits1DMaxKernel &operator=(NELogits1DMaxKernel &&) = default; + /** Default destructor */ + ~NELogits1DMaxKernel() = default; /** Set the input and output tensors. * * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp index e2fe88cc0e..673eace3c1 100644 --- a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp +++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp @@ -21,14 +21,17 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h" +#include "src/core/NEON/kernels/NESpaceToBatchLayerKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + #include #include @@ -38,15 +41,16 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *padddings, const ITensorInfo *output) +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, padddings, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(padddings->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(padddings->tensor_shape()[1] != block_info->tensor_shape()[0]); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{ 2 }); + ARM_COMPUTE_RETURN_ERROR_ON(paddings->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{ 2, 2 }); // Validate output if initialized if(output->total_size() != 0) @@ -55,6 +59,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] != output->tensor_shape()[idx_channel]); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); } return Status{}; @@ -64,22 +69,14 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); + ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1); // Validate output if initialized if(output->total_size() != 0) { - const DataLayout data_layout = input->data_layout(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - const int idx_batch = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] < padding_left.x() + padding_right.y()); - ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_width] + padding_left.x() + padding_right.x()) % block_shape_x != 0); - ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] + padding_left.y() + padding_right.y()) 
% block_shape_y != 0); - ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] != output->tensor_shape()[idx_channel]); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0); + TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(input, block_shape_x, block_shape_y, padding_left, padding_right); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); } @@ -95,7 +92,7 @@ NESpaceToBatchLayerKernel::NESpaceToBatchLayerKernel() void NESpaceToBatchLayerKernel::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info())); _input = input; diff --git a/arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h similarity index 81% rename from arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h rename to src/core/NEON/kernels/NESpaceToBatchLayerKernel.h index b5d7c692f0..44b8cbb514 100644 --- a/arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h +++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_NESPACETOBATCHLAYERKERNEL_H #define ARM_COMPUTE_NESPACETOBATCHLAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -55,8 +55,8 @@ class NESpaceToBatchLayerKernel : public INEKernel /** Initialise the kernel's inputs and output. * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. - * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32 - * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32 + * @param[in] block_shape 1-D tensor with shape [M]. Supported M: 2. Data types supported: S32 + * @param[in] paddings 2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32 * @param[out] output Tensor output. Data types supported: same as @p input */ void configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output); @@ -65,16 +65,16 @@ class NESpaceToBatchLayerKernel : public INEKernel * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. * @param[in] block_shape_x Block shape x value. * @param[in] block_shape_y Block shape y value. - * @param[in] padding_left The left padding of the output tensor. - * @param[in] padding_right The right padding of the output tensor. + * @param[in] padding_left The padding at the beginning of every dimension of the output tensor. + * @param[in] padding_right The padding at the end of every dimension of the output tensor. * @param[out] output Tensor output. Data types supported: same as @p input */ void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel * * @param[in] input Tensor input. 
Supported tensor rank: 4. Data types supported: All. - * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32 - * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32 + * @param[in] block_shape 1-D tensor with shape [M]. Supported M: 2. Data types supported: S32 + * @param[in] paddings 2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32 * @param[in] output Tensor output. Data types supported: same as @p input * * @return a status @@ -85,8 +85,8 @@ class NESpaceToBatchLayerKernel : public INEKernel * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. * @param[in] block_shape_x Block shape x value. * @param[in] block_shape_y Block shape y value. - * @param[in] padding_left The left padding of the output tensor. - * @param[in] padding_right The right padding of the output tensor. + * @param[in] padding_left The padding at the beginning of every dimension of the output tensor. + * @param[in] padding_right The padding at the end of every dimension of the output tensor. * @param[in] output Tensor output. Data types supported: same as @p input * * @return a status @@ -98,8 +98,8 @@ class NESpaceToBatchLayerKernel : public INEKernel private: const ITensor *_input; /**< Source tensor */ - const ITensor *_block_shape; /**< Block shape tensor */ - const ITensor *_paddings; /**< Paddings tensor */ + const ITensor *_block_shape; /**< Block shape tensor for dynamic evaluation */ + const ITensor *_paddings; /**< Paddings tensor for dynamic evaluation */ ITensor *_output; /**< Destination tensor */ DataLayout _data_layout; /**< Data layout to be used at run-time */ diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp index b342cd2047..7687c50c40 100644 --- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp +++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,14 +21,17 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h" +#include "src/core/NEON/kernels/NESpaceToDepthLayerKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + #include #include diff --git a/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h rename to src/core/NEON/kernels/NESpaceToDepthLayerKernel.h index 11443e02c5..953b68a401 100644 --- a/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h +++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_NESPACETODEPTHLAYERKERNEL_H #define ARM_COMPUTE_NESPACETODEPTHLAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEStackLayerKernel.cpp b/src/core/NEON/kernels/NEStackLayerKernel.cpp index 1d44be60a0..55170a169a 100644 --- a/src/core/NEON/kernels/NEStackLayerKernel.cpp +++ b/src/core/NEON/kernels/NEStackLayerKernel.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEStackLayerKernel.h" +#include "src/core/NEON/kernels/NEStackLayerKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -33,6 +33,8 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" using namespace arm_compute; using namespace arm_compute::misc::shape_calculator; diff --git a/arm_compute/core/NEON/kernels/NEStackLayerKernel.h b/src/core/NEON/kernels/NEStackLayerKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NEStackLayerKernel.h rename to src/core/NEON/kernels/NEStackLayerKernel.h index 710a6be7f4..9b0a039b88 100644 --- a/arm_compute/core/NEON/kernels/NEStackLayerKernel.h +++ b/src/core/NEON/kernels/NEStackLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,8 +25,8 @@ #ifndef ARM_COMPUTE_NESTACKLAYERKERNEL_H #define ARM_COMPUTE_NESTACKLAYERKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.cpp b/src/core/NEON/kernels/NEStridedSliceKernel.cpp index 243a60f249..ac04a1076d 100644 --- a/src/core/NEON/kernels/NEStridedSliceKernel.cpp +++ b/src/core/NEON/kernels/NEStridedSliceKernel.cpp @@ -21,18 +21,19 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h" +#include "src/core/NEON/kernels/NEStridedSliceKernel.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Window.h" - #include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/helpers/bit_ops.h" +#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/bit_ops.h" namespace arm_compute { diff --git a/arm_compute/core/NEON/kernels/NEStridedSliceKernel.h b/src/core/NEON/kernels/NEStridedSliceKernel.h similarity index 99% rename from arm_compute/core/NEON/kernels/NEStridedSliceKernel.h rename to src/core/NEON/kernels/NEStridedSliceKernel.h index be55fd75de..9ce517417d 100644 --- a/arm_compute/core/NEON/kernels/NEStridedSliceKernel.h +++ b/src/core/NEON/kernels/NEStridedSliceKernel.h @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_NE_STRIDED_SLICE_KERNEL_H #define ARM_COMPUTE_NE_STRIDED_SLICE_KERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" #include diff --git a/src/core/NEON/kernels/NETableLookupKernel.cpp b/src/core/NEON/kernels/NETableLookupKernel.cpp index d26a0eedb5..19ce7f0352 100644 --- a/src/core/NEON/kernels/NETableLookupKernel.cpp +++ b/src/core/NEON/kernels/NETableLookupKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h" +#include "src/core/NEON/kernels/NETableLookupKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" diff --git a/arm_compute/core/NEON/kernels/NETableLookupKernel.h b/src/core/NEON/kernels/NETableLookupKernel.h similarity index 95% rename from arm_compute/core/NEON/kernels/NETableLookupKernel.h rename to src/core/NEON/kernels/NETableLookupKernel.h index 58bfdbeec2..7937999b46 100644 --- a/arm_compute/core/NEON/kernels/NETableLookupKernel.h +++ b/src/core/NEON/kernels/NETableLookupKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NETABLELOOKUPKERNEL_H #define ARM_COMPUTE_NETABLELOOKUPKERNEL_H -#include "arm_compute/core/NEON/INESimpleKernel.h" +#include "src/core/NEON/INESimpleKernel.h" namespace arm_compute { @@ -49,6 +49,8 @@ class NETableLookupKernel : public INESimpleKernel NETableLookupKernel(NETableLookupKernel &&) = default; /** Allow instances of this class to be moved */ NETableLookupKernel &operator=(NETableLookupKernel &&) = default; + /** Default destructor */ + ~NETableLookupKernel() = default; /** Initialise the kernel's input, lut and output. * * @param[in] input An input tensor. Data types supported: U8/S16. 
diff --git a/src/core/NEON/kernels/NEThresholdKernel.cpp b/src/core/NEON/kernels/NEThresholdKernel.cpp index 6b291fdcd6..183bb8db5c 100644 --- a/src/core/NEON/kernels/NEThresholdKernel.cpp +++ b/src/core/NEON/kernels/NEThresholdKernel.cpp @@ -21,14 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEThresholdKernel.h" +#include "src/core/NEON/kernels/NEThresholdKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { diff --git a/arm_compute/core/NEON/kernels/NEThresholdKernel.h b/src/core/NEON/kernels/NEThresholdKernel.h similarity index 90% rename from arm_compute/core/NEON/kernels/NEThresholdKernel.h rename to src/core/NEON/kernels/NEThresholdKernel.h index daad47dbda..6b3b3866b0 100644 --- a/arm_compute/core/NEON/kernels/NEThresholdKernel.h +++ b/src/core/NEON/kernels/NEThresholdKernel.h @@ -25,8 +25,8 @@ #define ARM_COMPUTE_NETHRESHOLDKERNEL_H #include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -40,14 +40,18 @@ class NEThresholdKernel : public INEKernel { return "NEThresholdKernel"; } - /** Constructor - * Initialize all the pointers to nullptr and parameters to zero. - */ + /** Default constructor */ NEThresholdKernel(); /** Prevent instances of this class from being copied (As this class contains pointers) */ NEThresholdKernel(const NEThresholdKernel &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ NEThresholdKernel &operator=(const NEThresholdKernel &) = delete; + /** Allow instances of this class to be moved */ + NEThresholdKernel(NEThresholdKernel &&) = default; + /** Allow instances of this class to be moved */ + NEThresholdKernel &operator=(NEThresholdKernel &&) = default; + /** Default destructor */ + ~NEThresholdKernel() = default; /** Initialise the kernel's input, output and threshold parameters. * * @param[in] input An input tensor. Data type supported: U8 diff --git a/src/core/NEON/kernels/NETileKernel.cpp b/src/core/NEON/kernels/NETileKernel.cpp index cc7655a479..94256dc12d 100644 --- a/src/core/NEON/kernels/NETileKernel.cpp +++ b/src/core/NEON/kernels/NETileKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/NEON/kernels/NETileKernel.h" +#include "src/core/NEON/kernels/NETileKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -30,6 +30,8 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" namespace arm_compute { diff --git a/arm_compute/core/NEON/kernels/NETileKernel.h b/src/core/NEON/kernels/NETileKernel.h similarity index 94% rename from arm_compute/core/NEON/kernels/NETileKernel.h rename to src/core/NEON/kernels/NETileKernel.h index 7a3039adc9..8dfea8bc2f 100644 --- a/arm_compute/core/NEON/kernels/NETileKernel.h +++ b/src/core/NEON/kernels/NETileKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_NETILEKERNEL_H #define ARM_COMPUTE_NETILEKERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { @@ -44,7 +44,9 @@ class NETileKernel : public INEKernel NETileKernel(NETileKernel &&) = default; /** Allow instances of this class to be moved */ NETileKernel &operator=(NETileKernel &&) = default; - const char *name() const override + /** Default destructor */ + ~NETileKernel() = default; + const char *name() const override { return "NETileKernel"; } diff --git a/src/core/NEON/kernels/NETransposeKernel.cpp b/src/core/NEON/kernels/NETransposeKernel.cpp index 7118e45f1e..134831be4c 100644 --- a/src/core/NEON/kernels/NETransposeKernel.cpp +++ b/src/core/NEON/kernels/NETransposeKernel.cpp @@ -21,16 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NETransposeKernel.h" +#include "src/core/NEON/kernels/NETransposeKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/AccessWindowTranspose.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/AccessWindowTranspose.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include diff --git a/arm_compute/core/NEON/kernels/NETransposeKernel.h b/src/core/NEON/kernels/NETransposeKernel.h similarity index 97% rename from arm_compute/core/NEON/kernels/NETransposeKernel.h rename to src/core/NEON/kernels/NETransposeKernel.h index 1507a1c1a4..73d2098fb3 100644 --- a/arm_compute/core/NEON/kernels/NETransposeKernel.h +++ b/src/core/NEON/kernels/NETransposeKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,7 +24,7 @@
#ifndef ARM_COMPUTE_NETRANSPOSEKERNEL_H
#define ARM_COMPUTE_NETRANSPOSEKERNEL_H

-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"

namespace arm_compute
{
diff --git a/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp b/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
index 02cf1334ac..cbdec50a42 100644
--- a/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
@@ -21,17 +21,19 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#include "arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h"
+#include "src/core/NEON/kernels/NEUpsampleLayerKernel.h"

-#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"

#include <arm_neon.h>
diff --git a/arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h b/src/core/NEON/kernels/NEUpsampleLayerKernel.h
similarity index 98%
rename from arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h
rename to src/core/NEON/kernels/NEUpsampleLayerKernel.h
index a1278ea307..7ff797a9f8 100644
--- a/arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h
+++ b/src/core/NEON/kernels/NEUpsampleLayerKernel.h
@@ -24,7 +24,7 @@
#ifndef ARM_COMPUTE_NEUPSAMPLELAYERKERNEL_H
#define ARM_COMPUTE_NEUPSAMPLELAYERKERNEL_H

-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"

namespace arm_compute
{
diff --git a/src/core/NEON/kernels/NEWarpKernel.cpp b/src/core/NEON/kernels/NEWarpKernel.cpp
index d8191dce53..1ae076153b 100644
--- a/src/core/NEON/kernels/NEWarpKernel.cpp
+++ b/src/core/NEON/kernels/NEWarpKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -21,9 +21,8 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
+#include "src/core/NEON/kernels/NEWarpKernel.h"

-#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -31,6 +30,10 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/ScaleHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"

#include <cstddef>
@@ -184,7 +187,7 @@ void NEWarpAffineKernel<interpolation>::warp_undefined(const Window &window)
            *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
            break;
        case InterpolationPolicy::BILINEAR:
-            *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, x0, y0);
+            *out.ptr() = scale_helpers::pixel_bilinear_c1(in.ptr(), stride, x0, y0);
            break;
        default:
            ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -271,7 +274,7 @@ void NEWarpAffineKernel<interpolation>::warp_constant(const Window &window)
            *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
            break;
        case InterpolationPolicy::BILINEAR:
-            *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, x0, y0);
+            *out.ptr() = scale_helpers::pixel_bilinear_c1(in.ptr(), stride, x0, y0);
            break;
        default:
            ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -386,7 +389,7 @@ void NEWarpAffineKernel<interpolation>::warp_replicate(const Window &window)
            *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
            break;
        case InterpolationPolicy::BILINEAR:
-            *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, x0, y0);
+            *out.ptr() = scale_helpers::pixel_bilinear_c1(in.ptr(), stride, x0, y0);
            break;
        default:
            ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -519,7 +522,7 @@ void NEWarpPerspectiveKernel<interpolation>::warp_undefined(const Window &window)
            *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
            break;
        case InterpolationPolicy::BILINEAR:
-            *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, xn, yn);
+            *out.ptr() = scale_helpers::pixel_bilinear_c1(in.ptr(), stride, xn, yn);
            break;
        default:
            ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -620,7 +623,7 @@ void NEWarpPerspectiveKernel<interpolation>::warp_constant(const Window &window)
            *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
            break;
        case InterpolationPolicy::BILINEAR:
-            *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, xn, yn);
+            *out.ptr() = scale_helpers::pixel_bilinear_c1(in.ptr(), stride, xn, yn);
            break;
        default:
            ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -752,7 +755,7 @@ void NEWarpPerspectiveKernel<interpolation>::warp_replicate(const Window &window)
            *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
            break;
        case InterpolationPolicy::BILINEAR:
-            *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, xn, yn);
+            *out.ptr() = scale_helpers::pixel_bilinear_c1(in.ptr(), stride, xn, yn);
            break;
        default:
            ARM_COMPUTE_ERROR("Interpolation not supported");
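The only functional change in the warp kernels is that pixel_bilinear_c1 now lives in the new scale_helpers namespace (src/core/helpers/ScaleHelpers.h). For reference, a minimal scalar sketch of what single-channel bilinear sampling of this kind computes; names and layout assumptions are mine, not the library's exact implementation:

    #include <cmath>
    #include <cstddef>
    #include <cstdint>

    // Bilinear interpolation of a single-channel U8 image at (x, y):
    // blend the four neighbouring texels by their fractional distances.
    inline uint8_t bilinear_c1_sketch(const uint8_t *first_pixel_ptr, size_t stride, float x, float y)
    {
        const int   xi = static_cast<int>(std::floor(x));
        const int   yi = static_cast<int>(std::floor(y));
        const float dx = x - xi;
        const float dy = y - yi;

        const uint8_t *base = first_pixel_ptr + yi * stride + xi;
        const float tl = base[0],      tr = base[1];
        const float bl = base[stride], br = base[stride + 1];

        return static_cast<uint8_t>(tl * (1 - dx) * (1 - dy) + tr * dx * (1 - dy)
                                  + bl * (1 - dx) * dy       + br * dx * dy);
    }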
diff --git a/arm_compute/core/NEON/kernels/NEWarpKernel.h b/src/core/NEON/kernels/NEWarpKernel.h
similarity index 97%
rename from arm_compute/core/NEON/kernels/NEWarpKernel.h
rename to src/core/NEON/kernels/NEWarpKernel.h
index 21fc7b2df1..2c4cb55e3c 100644
--- a/arm_compute/core/NEON/kernels/NEWarpKernel.h
+++ b/src/core/NEON/kernels/NEWarpKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,8 +24,8 @@
#ifndef ARM_COMPUTE_NEWARPKERNEL_H
#define ARM_COMPUTE_NEWARPKERNEL_H

-#include "arm_compute/core/NEON/INEKernel.h"
#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"

#include <array>
#include <cstdint>
@@ -47,6 +47,8 @@ class INEWarpKernel : public INEKernel
    INEWarpKernel(INEWarpKernel &&) = default;
    /** Allow instances of this class to be moved */
    INEWarpKernel &operator=(INEWarpKernel &&) = default;
+    /** Default destructor */
+    ~INEWarpKernel() = default;
    /** Initialise the kernel's input, output and border mode.
     *
     * @param[in] input Source tensor. Data type supported: U8.
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index 6a74914ff7..118655b755 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -21,10 +21,12 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
+#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"

#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"

namespace arm_compute
{
diff --git a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h b/src/core/NEON/kernels/NEWeightsReshapeKernel.h
similarity index 99%
rename from arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
rename to src/core/NEON/kernels/NEWeightsReshapeKernel.h
index 8cb3ed8796..9678b79fda 100644
--- a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.h
@@ -24,7 +24,7 @@
#ifndef ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H
#define ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H

-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"

namespace arm_compute
{
diff --git a/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
index 171f5965a5..b5afeed1f6 100644
--- a/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
@@ -21,18 +21,20 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#include "arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h"
+#include "src/core/NEON/kernels/NEWidthConcatenateLayerKernel.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"

#include <cstdint>
diff --git a/arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.h
similarity index 98%
rename from arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h
rename to src/core/NEON/kernels/NEWidthConcatenateLayerKernel.h
index 64d741deab..81b4cbed9e 100644
--- a/arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h
+++ b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.h
@@ -25,8 +25,8 @@
#ifndef ARM_COMPUTE_NEWIDTHCONCATENATELAYERKERNEL_H
#define ARM_COMPUTE_NEWIDTHCONCATENATELAYERKERNEL_H

-#include "arm_compute/core/NEON/INEKernel.h"
#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"

namespace arm_compute
{
diff --git a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
index bfe97bfbdb..211ebdec90 100644
--- a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
@@ -23,16 +23,18 @@
 */
#include "src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"

-#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/NEON/kernels/convolution/common/utils.hpp"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/MemorySupport.h"

#include "src/core/NEON/kernels/convolution/winograd/winograd_layer.hpp"
diff --git a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
index 94df4f6952..2b87e512dc 100644
--- a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
+++ b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
@@ -24,9 +24,9 @@
#ifndef ARM_COMPUTE_NEGEMMWINOGRADCONVOLUTIONLAYERKERNEL_H
#define ARM_COMPUTE_NEGEMMWINOGRADCONVOLUTIONLAYERKERNEL_H

-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/NEON/kernels/convolution/common/convolution.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/tensor.hpp"
+#include "src/core/NEON/INEKernel.h"
+#include "src/core/NEON/kernels/convolution/common/convolution.hpp"
+#include "src/core/NEON/kernels/convolution/common/tensor.hpp"

#include "src/core/NEON/kernels/convolution/winograd/winograd_layer.hpp"
diff --git a/src/core/NEON/kernels/NEYOLOLayerKernel.cpp b/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
index b61633dc30..33bcc20d39 100644
--- a/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
@@ -21,19 +21,22 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#include "arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h"
+#include "src/core/NEON/kernels/NEYOLOLayerKernel.h"

-#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/NEON/NEMath.h"
-#include "arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"

#include <arm_neon.h>
diff --git a/arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h b/src/core/NEON/kernels/NEYOLOLayerKernel.h
similarity index 99%
rename from arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h
rename to src/core/NEON/kernels/NEYOLOLayerKernel.h
index 8795e4aa56..806cf9cc09 100644
--- a/arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h
+++ b/src/core/NEON/kernels/NEYOLOLayerKernel.h
@@ -24,7 +24,7 @@
#ifndef ARM_COMPUTE_NEYOLOLAYERKERNEL_H
#define ARM_COMPUTE_NEYOLOLAYERKERNEL_H

-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"

namespace arm_compute
{
diff --git a/src/core/NEON/kernels/activation/impl/fp16_neon_activation.cpp b/src/core/NEON/kernels/activation/impl/fp16_neon_activation.cpp
new file mode 100644
index 0000000000..58e1cfcf23
--- /dev/null
+++ b/src/core/NEON/kernels/activation/impl/fp16_neon_activation.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#include "src/core/NEON/NEMath.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/common/StdTypes.h" +#include "src/core/common/Validate.h" + +#include +#include +#include + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +#ifndef __aarch64__ +inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &mask) +{ + auto int_in = vreinterpretq_u16_f16(in); + return vreinterpretq_f16_u16(wrapper::vand(int_in, mask)); +} +#endif /* __arch64__ */ +} // namespace + +void fp16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + /** NEON vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + constexpr int window_step_x = 8; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + // In case of non-aarch64, a small delta value is added to the input + // to prevent NAN values caused by zeros in inputs to SQRT. + // In case of aarh64, we call vsqrt directly, so we don't use delta. +#ifndef __aarch64__ + const auto delta = wrapper::vdup_n(static_cast((1e-7), ExactTagType {}); +#endif /* __aarch64 */ + + const auto const_1 = wrapper::vdup_n(static_cast(1.f), ExactTagType {}); + const auto const_0 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + const auto const_6 = wrapper::vdup_n(static_cast(6.f), ExactTagType{}); + const auto const_3 = wrapper::vdup_n(static_cast(3.f), ExactTagType{}); + const auto const_inv_6 = wrapper::vdup_n(static_cast(0.166666667f), ExactTagType{}); + + const auto va = wrapper::vdup_n(static_cast(act_info.a()), ExactTagType{}); + const auto vb = wrapper::vdup_n(static_cast(act_info.b()), ExactTagType{}); + const auto a = static_cast(act_info.a()); + const auto b = static_cast(act_info.b()); + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + wrapper::traits::neon_bitvector_t tmp; + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + switch(act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = wrapper::vabs(vin); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = wrapper::vmla(vb, va, vin); + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin)))); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = wrapper::vmax(const_0, vin); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = wrapper::vmin(va, wrapper::vmax(vb, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, 
wrapper::vmul(va, vin)); + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin))); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1))); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: +#ifdef __aarch64__ + tmp = wrapper::vsqrt(vin); +#else /* aarch64 */ + { + const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0, ExactTagType{})); + tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask)))); + tmp = mask_float_vector(tmp, wrapper::vnot(bitmask)); + } +#endif /* aarch64 */ + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = wrapper::vmul(vin, vin); + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin))); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = vin; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3))))); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float16_t in = *(reinterpret_cast(input_ptr + x)); + float16_t tmp; + switch(act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = std::abs(in); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = a * in + b; + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = static_cast(1) / (static_cast(1) + std::exp(-in)); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = std::max(static_cast(0), in); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = std::min(a, std::max(static_cast(0), in)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = std::min(a, std::max(b, in)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = (in > 0) ? in : a * in; + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = std::log(static_cast(1) + std::exp(in)); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = (in >= 0) ? 
in : a * (std::exp(in) - 1); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: + tmp = std::sqrt(in); + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = in * in; + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = a * std::tanh(b * in); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = in; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; + } + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute + +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ \ No newline at end of file diff --git a/src/core/NEON/kernels/activation/impl/fp32_neon_activation.cpp b/src/core/NEON/kernels/activation/impl/fp32_neon_activation.cpp new file mode 100644 index 0000000000..610db05224 --- /dev/null +++ b/src/core/NEON/kernels/activation/impl/fp32_neon_activation.cpp @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/common/StdTypes.h" +#include "src/core/common/Validate.h" + +#include +#include +#include + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +#ifndef __aarch64__ +inline float32x4_t mask_float_vector(const float32x4_t &in, const uint32x4_t &mask) +{ + auto int_in = vreinterpretq_u32_f32(in); + return vreinterpretq_f32_u32(wrapper::vand(int_in, mask)); +} +#endif /* __arch64__ */ +} // namespace + +void fp32_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + /** NEON vector tag type. 
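All five new activation implementations share one traversal scheme: the execution window is collapsed, the X dimension is walked in vector-width steps (window_step_x lanes per iteration), and a scalar tail loop finishes whatever does not fill a full vector. A reduced sketch of that pattern, using plain floats and RELU as a stand-in for the intrinsics:

    #include <algorithm>

    // Shared traversal pattern of the activation kernels: a vectorised body
    // stepping `step` lanes at a time, then a scalar loop for the tail.
    void activation_pattern_sketch(const float *src, float *dst, int len)
    {
        constexpr int step = 4; // lanes in a NEON float32x4_t
        int x = 0;
        for(; x <= len - step; x += step)
        {
            // Vectorised body: in the real kernels this is a single
            // load / op / store on `step` lanes at once.
            for(int lane = 0; lane < step; ++lane)
            {
                dst[x + lane] = std::max(0.f, src[x + lane]); // e.g. RELU
            }
        }
        for(; x < len; ++x) // left-over elements, scalar path
        {
            dst[x] = std::max(0.f, src[x]);
        }
    }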
diff --git a/src/core/NEON/kernels/activation/impl/fp32_neon_activation.cpp b/src/core/NEON/kernels/activation/impl/fp32_neon_activation.cpp
new file mode 100644
index 0000000000..610db05224
--- /dev/null
+++ b/src/core/NEON/kernels/activation/impl/fp32_neon_activation.cpp
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/StdTypes.h"
+#include "src/core/common/Validate.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+#ifndef __aarch64__
+inline float32x4_t mask_float_vector(const float32x4_t &in, const uint32x4_t &mask)
+{
+    auto int_in = vreinterpretq_u32_f32(in);
+    return vreinterpretq_f32_u32(wrapper::vand(int_in, mask));
+}
+#endif /* __aarch64__ */
+} // namespace
+
+void fp32_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+    /** NEON vector tag type. */
+    using ExactTagType = typename arm_compute::wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
+
+    constexpr int window_step_x  = 4;
+    const auto    window_start_x = static_cast<int>(window.x().start());
+    const auto    window_end_x   = static_cast<int>(window.x().end());
+    const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
+
+    // In case of non-aarch64, a small delta value is added to the input
+    // to prevent NAN values caused by zeros in inputs to SQRT.
+    // In case of aarch64, we call vsqrt directly, so we don't use delta.
+#ifndef __aarch64__
+    const auto delta = wrapper::vdup_n(static_cast<float>(1e-24), ExactTagType {});
+#endif /* __aarch64__ */
+    const auto const_1     = wrapper::vdup_n(static_cast<float>(1.f), ExactTagType {});
+    const auto const_0     = wrapper::vdup_n(static_cast<float>(0.f), ExactTagType{});
+    const auto const_6     = wrapper::vdup_n(static_cast<float>(6.f), ExactTagType{});
+    const auto const_3     = wrapper::vdup_n(static_cast<float>(3.f), ExactTagType{});
+    const auto const_inv_6 = wrapper::vdup_n(static_cast<float>(0.166666667f), ExactTagType{});
+
+    const auto va = wrapper::vdup_n(static_cast<float>(act_info.a()), ExactTagType{});
+    const auto vb = wrapper::vdup_n(static_cast<float>(act_info.b()), ExactTagType{});
+    const auto a  = static_cast<float>(act_info.a());
+    const auto b  = static_cast<float>(act_info.b());
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
+    {
+        const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<float *>(output.ptr());
+
+        wrapper::traits::neon_bitvector_t<float, wrapper::traits::BitWidth::W128> tmp;
+
+        // Compute S elements per iteration
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+            const auto vin = wrapper::vloadq(input_ptr + x);
+            switch(act)
+            {
+                case ActivationLayerInfo::ActivationFunction::ABS:
+                    tmp = wrapper::vabs(vin);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::LINEAR:
+                    tmp = wrapper::vmla(vb, va, vin);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+                    tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin))));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::RELU:
+                    tmp = wrapper::vmax(const_0, vin);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+                    tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+                    tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+                    tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+                    tmp = wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::ELU:
+                    tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::SQRT:
+#ifdef __aarch64__
+                    tmp = wrapper::vsqrt(vin);
+#else /* aarch64 */
+                {
+                    const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0.f, ExactTagType{}));
+                    tmp                = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask))));
+                    tmp                = mask_float_vector(tmp, wrapper::vnot(bitmask));
+                }
+#endif /* aarch64 */
+                    break;
+                case ActivationLayerInfo::ActivationFunction::SQUARE:
+                    tmp = wrapper::vmul(vin, vin);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::TANH:
+                    tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::IDENTITY:
+                    tmp = vin;
+                    break;
+                case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+                    tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3)))));
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+            wrapper::vstore(output_ptr + x, tmp);
+        }
+
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            const float in = *(reinterpret_cast<const float *>(input_ptr + x));
+            float       tmp;
+            switch(act)
+            {
+                case ActivationLayerInfo::ActivationFunction::ABS:
+                    tmp = std::abs(in);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::LINEAR:
+                    tmp = a * in + b;
+                    break;
+                case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+                    tmp = static_cast<float>(1) / (static_cast<float>(1) + std::exp(-in));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::RELU:
+                    tmp = std::max(static_cast<float>(0), in);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+                    tmp = std::min(a, std::max(static_cast<float>(0), in));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+                    tmp = std::min(a, std::max(b, in));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+                    tmp = (in > 0) ? in : a * in;
+                    break;
+                case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+                    tmp = std::log(static_cast<float>(1) + std::exp(in));
+                    break;
+                case ActivationLayerInfo::ActivationFunction::ELU:
+                    tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::SQRT:
+                    tmp = std::sqrt(in);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::SQUARE:
+                    tmp = in * in;
+                    break;
+                case ActivationLayerInfo::ActivationFunction::TANH:
+                    tmp = a * std::tanh(b * in);
+                    break;
+                case ActivationLayerInfo::ActivationFunction::IDENTITY:
+                    tmp = in;
+                    break;
+                case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+                    tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f);
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+            *(output_ptr + x) = tmp;
+        }
+    },
+    input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
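One detail worth spelling out from both float paths: on non-aarch64 builds there is no vector square-root instruction, so SQRT is computed as the reciprocal of the reciprocal square root, vinv(vinvsqrt(x)). That pipeline produces NaN for lanes that are exactly zero, so a tiny delta is mixed into those lanes first and the bitmask then forces them back to zero. A scalar sketch of the same idea:

    #include <cmath>

    // Sketch of the armv7 SQRT workaround: sqrt(x) via 1/rsqrt(x) breaks at
    // x == 0 (rsqrt(0) is inf, and the reciprocal-estimate path yields NaN),
    // so zero inputs get a tiny delta and are masked back to zero afterwards.
    float sqrt_via_rsqrt_sketch(float x)
    {
        const bool  is_zero = (x == 0.f);
        const float delta   = is_zero ? 1e-24f : 0.f;
        const float result  = 1.f / (1.f / std::sqrt(x + delta)); // stands in for vinv(vinvsqrt(...))
        return is_zero ? 0.f : result; // stands in for the bitmask step
    }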
diff --git a/src/core/NEON/kernels/activation/impl/list.h b/src/core/NEON/kernels/activation/impl/list.h
new file mode 100644
index 0000000000..3b48ee3e22
--- /dev/null
+++ b/src/core/NEON/kernels/activation/impl/list.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H
+#define SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_ACTIVATION_KERNEL(func_name) \
+    void func_name(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+
+DECLARE_ACTIVATION_KERNEL(qasymm8_neon_activation);
+DECLARE_ACTIVATION_KERNEL(qasymm8_signed_neon_activation);
+DECLARE_ACTIVATION_KERNEL(qsymm16_neon_activation);
+DECLARE_ACTIVATION_KERNEL(fp16_neon_activation);
+DECLARE_ACTIVATION_KERNEL(fp32_neon_activation);
+
+#undef DECLARE_ACTIVATION_KERNEL
+} // namespace cpu
+} // namespace arm_compute
+
+#endif /* SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H */
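list.h gives every data-type implementation one common signature, so a caller can hold them behind a single function-pointer type. The macro expansion, plus a hypothetical selection helper (this table is a sketch of the idea, not the library's actual dispatch code):

    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/Window.h"
    #include "src/core/NEON/kernels/activation/impl/list.h"

    namespace arm_compute
    {
    // DECLARE_ACTIVATION_KERNEL(fp32_neon_activation) expands to:
    //   void fp32_neon_activation(const ITensor *src, ITensor *dst,
    //                             const ActivationLayerInfo &act_info, const Window &window);
    using ActivationKernelPtr = void (*)(const ITensor *, ITensor *, const ActivationLayerInfo &, const Window &);

    // Hypothetical helper: pick the implementation matching the tensor's data type.
    inline ActivationKernelPtr select_activation_kernel_sketch(DataType dt)
    {
        switch(dt)
        {
            case DataType::F32:            return &cpu::fp32_neon_activation;
            case DataType::QASYMM8:        return &cpu::qasymm8_neon_activation;
            case DataType::QASYMM8_SIGNED: return &cpu::qasymm8_signed_neon_activation;
            case DataType::QSYMM16:        return &cpu::qsymm16_neon_activation;
            default:                       return nullptr;
        }
    }
    } // namespace arm_compute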
diff --git a/src/core/NEON/kernels/activation/impl/qasymm8_neon_activation.cpp b/src/core/NEON/kernels/activation/impl/qasymm8_neon_activation.cpp
new file mode 100644
index 0000000000..8a398fb531
--- /dev/null
+++ b/src/core/NEON/kernels/activation/impl/qasymm8_neon_activation.cpp
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/StdTypes.h"
+#include "src/core/common/Validate.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void qasymm8_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+    constexpr int window_step_x  = 16;
+    const auto    window_start_x = static_cast<int>(window.x().start());
+    const auto    window_end_x   = static_cast<int>(window.x().end());
+    const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
+
+    const UniformQuantizationInfo qi_in    = src->info()->quantization_info().uniform();
+    const UniformQuantizationInfo qi_out   = dst->info()->quantization_info().uniform();
+    const qasymm8x16_t            va       = vdupq_n_u8(quantize_qasymm8(act_info.a(), qi_in));
+    const qasymm8x16_t            vb       = vdupq_n_u8(quantize_qasymm8(act_info.b(), qi_in));
+    const qasymm8_t               a        = quantize_qasymm8(act_info.a(), qi_in);
+    const qasymm8_t               b        = quantize_qasymm8(act_info.b(), qi_in);
+    const qasymm8_t               const_0  = quantize_qasymm8(0.f, qi_in);
+    const qasymm8x16_t            vconst_0 = vdupq_n_u8(const_0);
+    const auto                    vconst_1 = vdupq_n_f32(1.f);
+    const float32x4_t             va_f32   = vdupq_n_f32(act_info.a());
+    const float32x4_t             vb_f32   = vdupq_n_f32(act_info.b());
+    const float                   a_f32    = act_info.a();
+    const float                   b_f32    = act_info.b();
+    const auto                    const_6_f32     = vdupq_n_f32(6.f);
+    const auto                    const_0_f32     = vdupq_n_f32(0.f);
+    const auto                    const_3_f32     = vdupq_n_f32(3.f);
+    const auto                    const_inv_6_f32 = vdupq_n_f32(0.166666667f);
+
+    // Initialise scale/offset for re-quantization
+    float       s  = qi_in.scale / qi_out.scale;
+    float       o  = -qi_in.offset * s + qi_out.offset;
+    float32x4_t vs = vdupq_n_f32(s);
+    float32x4_t vo = vdupq_n_f32(o);
+
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
+    {
+        const auto input_ptr  = reinterpret_cast<const qasymm8_t *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<qasymm8_t *>(output.ptr());
+
+        wrapper::traits::neon_bitvector_t<qasymm8_t, wrapper::traits::BitWidth::W128> tmp;
+
+        // Compute S elements per iteration
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+            const auto vin = wrapper::vloadq(input_ptr + x);
+            if(act == ActivationLayerInfo::ActivationFunction::RELU)
+            {
+                // Perform activation
+                tmp = vmaxq_u8(vconst_0, vin);
+                // Re-quantize to new output space
+                tmp = vmlaq_qasymm8(tmp, vs, vo);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+            {
+                // Perform activation
+                tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin));
+                // Re-quantize to new output space
+                tmp = vmlaq_qasymm8(tmp, vs, vo);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+            {
+                // Perform activation
+                tmp = vminq_u8(va, vmaxq_u8(vb, vin));
+                // Re-quantize to new output space
+                tmp = vmlaq_qasymm8(tmp, vs, vo);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+            {
+                // De-quantize
+                const auto vin_deq = vdequantize(vin, qi_in);
+                // Perform activation
+                const float32x4x4_t tmp_dep =
+                {
+                    {
+                        wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
+                        wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
+                        wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))),
+                        wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))),
+                    }
+                };
+                // Re-quantize to new output space
+                tmp = vquantize(tmp_dep, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::TANH)
+            {
+                // De-quantize
+                const auto vin_deq = vdequantize(vin, qi_in);
+                // Perform activation
+                const float32x4x4_t tmp_dep =
+                {
+                    {
+                        wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
+                        wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
+                        wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))),
+                        wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))),
+                    }
+                };
+                // Re-quantize to new output space
+                tmp = vquantize(tmp_dep, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+            {
+                // De-quantize
+                const auto vin_deq = vdequantize(vin, qi_in);
+                // Perform activation
+                const float32x4x4_t tmp_dep =
+                {
+                    {
+                        wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
+                        wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
+                        wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
+                        wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
+                    }
+                };
+                // Re-quantize to new output space
+                tmp = vquantize(tmp_dep, qi_out);
+            }
+            else
+            {
+                ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+            wrapper::vstore(output_ptr + x, tmp);
+        }
+
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            qasymm8_t in  = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
+            qasymm8_t tmp = 0;
+            if(act == ActivationLayerInfo::ActivationFunction::RELU)
+            {
+                tmp = std::max(const_0, in);
+                tmp = utility::clamp<int32_t, qasymm8_t>(tmp * s + o);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+            {
+                tmp = std::min(a, std::max(const_0, in));
+                tmp = utility::clamp<int32_t, qasymm8_t>(tmp * s + o);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+            {
+                tmp = std::min(a, std::max(b, in));
+                tmp = utility::clamp<int32_t, qasymm8_t>(tmp * s + o);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+            {
+                float tmp_f = dequantize_qasymm8(in, qi_in);
+                tmp_f       = 1.f / (1.f + std::exp(-tmp_f));
+                tmp         = quantize_qasymm8(tmp_f, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::TANH)
+            {
+                float tmp_f = dequantize_qasymm8(in, qi_in);
+                tmp_f       = a_f32 * std::tanh(b_f32 * tmp_f);
+                tmp         = quantize_qasymm8(tmp_f, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+            {
+                float tmp_f = dequantize_qasymm8(in, qi_in);
+                tmp_f       = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
+                tmp         = quantize_qasymm8(tmp_f, qi_out);
+            }
+            else
+            {
+                ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+            *(output_ptr + x) = tmp;
+        }
+    },
+    input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
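The scale/offset pair that the quantized ReLU variants feed into vmlaq_qasymm8 comes from folding dequantize-then-requantize into one affine step: with real = (q_in - off_in) * scale_in and q_out = real / scale_out + off_out, substitution gives q_out = q_in * s + o where s = scale_in / scale_out and o = off_out - off_in * s, exactly the code's -qi_in.offset * s + qi_out.offset. A worked scalar sketch of the same fold:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Fold dequantize + requantize into q_out = q_in * s + o.
    // scale_in/offset_in describe the input space, scale_out/offset_out the output.
    uint8_t requantize_sketch(uint8_t q_in, float scale_in, int offset_in, float scale_out, int offset_out)
    {
        const float s = scale_in / scale_out;
        const float o = -offset_in * s + offset_out;
        const int   q = static_cast<int>(std::lround(q_in * s + o));
        return static_cast<uint8_t>(std::min(255, std::max(0, q))); // clamp to the U8 range
    }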
diff --git a/src/core/NEON/kernels/activation/impl/qasymm8_signed_neon_activation.cpp b/src/core/NEON/kernels/activation/impl/qasymm8_signed_neon_activation.cpp
new file mode 100644
index 0000000000..bfab07c8e3
--- /dev/null
+++ b/src/core/NEON/kernels/activation/impl/qasymm8_signed_neon_activation.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/StdTypes.h"
+#include "src/core/common/Validate.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace cpu
+{
+void qasymm8_signed_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+{
+    constexpr int window_step_x  = 16;
+    const auto    window_start_x = static_cast<int>(window.x().start());
+    const auto    window_end_x   = static_cast<int>(window.x().end());
+    const ActivationLayerInfo::ActivationFunction act = act_info.activation();
+
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
+
+    const UniformQuantizationInfo qi_in    = src->info()->quantization_info().uniform();
+    const UniformQuantizationInfo qi_out   = dst->info()->quantization_info().uniform();
+    const qasymm8x16_signed_t     va       = vdupq_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in));
+    const qasymm8x16_signed_t     vb       = vdupq_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in));
+    const qasymm8_signed_t        a        = quantize_qasymm8_signed(act_info.a(), qi_in);
+    const qasymm8_signed_t        b        = quantize_qasymm8_signed(act_info.b(), qi_in);
+    const qasymm8_signed_t        const_0  = quantize_qasymm8_signed(0.f, qi_in);
+    const qasymm8x16_signed_t     vconst_0 = vdupq_n_s8(const_0);
+    const auto                    vconst_1 = vdupq_n_f32(1.f);
+    const float32x4_t             va_f32   = vdupq_n_f32(act_info.a());
+    const float32x4_t             vb_f32   = vdupq_n_f32(act_info.b());
+    const float                   a_f32    = act_info.a();
+    const float                   b_f32    = act_info.b();
+    const auto                    const_6_f32     = vdupq_n_f32(6.f);
+    const auto                    const_0_f32     = vdupq_n_f32(0.f);
+    const auto                    const_3_f32     = vdupq_n_f32(3.f);
+    const auto                    const_inv_6_f32 = vdupq_n_f32(0.166666667f);
+
+    // Initialise scale/offset for re-quantization
+    float       s  = qi_in.scale / qi_out.scale;
+    float       o  = -qi_in.offset * s + qi_out.offset;
+    float32x4_t vs = vdupq_n_f32(s);
+    float32x4_t vo = vdupq_n_f32(o);
+
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
+    {
+        const auto input_ptr  = reinterpret_cast<const qasymm8_signed_t *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<qasymm8_signed_t *>(output.ptr());
+
+        wrapper::traits::neon_bitvector_t<qasymm8_signed_t, wrapper::traits::BitWidth::W128> tmp;
+
+        // Compute S elements per iteration
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+            const auto vin = wrapper::vloadq(input_ptr + x);
+            if(act == ActivationLayerInfo::ActivationFunction::RELU)
+            {
+                // Perform activation
+                tmp = vmaxq_s8(vconst_0, vin);
+                // Re-quantize to new output space
+                tmp = vmlaq_qasymm8_signed(tmp, vs, vo);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+            {
+                // Perform activation
+                tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin));
+                // Re-quantize to new output space
+                tmp = vmlaq_qasymm8_signed(tmp, vs, vo);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+            {
+                // Perform activation
+                tmp = vminq_s8(va, vmaxq_s8(vb, vin));
+                // Re-quantize to new output space
+                tmp = vmlaq_qasymm8_signed(tmp, vs, vo);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+            {
+                // De-quantize
+                const auto vin_deq = vdequantize(vin, qi_in);
+                // Perform activation
+                const float32x4x4_t tmp_dep =
+                {
+                    {
+                        wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
+                        wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
+                        wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))),
+                        wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))),
+                    }
+                };
+                // Re-quantize to new output space
+                tmp = vquantize_signed(tmp_dep, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::TANH)
+            {
+                // De-quantize
+                const auto vin_deq = vdequantize(vin, qi_in);
+                // Perform activation
+                const float32x4x4_t tmp_dep =
+                {
+                    {
+                        wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
+                        wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
+                        wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))),
+                        wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))),
+                    }
+                };
+                // Re-quantize to new output space
+                tmp = vquantize_signed(tmp_dep, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+            {
+                // De-quantize
+                const auto vin_deq = vdequantize(vin, qi_in);
+                // Perform activation
+                const float32x4x4_t tmp_dep =
+                {
+                    {
+                        wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
+                        wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
+                        wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
+                        wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
+                    }
+                };
+                // Re-quantize to new output space
+                tmp = vquantize_signed(tmp_dep, qi_out);
+            }
+            else
+            {
+                ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+            wrapper::vstore(output_ptr + x, tmp);
+        }
+
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            qasymm8_signed_t in  = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
+            qasymm8_signed_t tmp = 0;
+            if(act == ActivationLayerInfo::ActivationFunction::RELU)
+            {
+                tmp = std::max(const_0, in);
+                tmp = utility::clamp<int32_t, qasymm8_signed_t>(tmp * s + o);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+            {
+                tmp = std::min(a, std::max(const_0, in));
+                tmp = utility::clamp<int32_t, qasymm8_signed_t>(tmp * s + o);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+            {
+                tmp = std::min(a, std::max(b, in));
+                tmp = utility::clamp<int32_t, qasymm8_signed_t>(tmp * s + o);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+            {
+                float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+                tmp_f       = 1.f / (1.f + std::exp(-tmp_f));
+                tmp         = quantize_qasymm8_signed(tmp_f, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::TANH)
+            {
+                float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+                tmp_f       = a_f32 * std::tanh(b_f32 * tmp_f);
+                tmp         = quantize_qasymm8_signed(tmp_f, qi_out);
+            }
+            else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+            {
+                float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+                tmp_f       = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
+                tmp         = quantize_qasymm8_signed(tmp_f, qi_out);
+            }
+            else
+            {
+                ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+            *(output_ptr + x) = tmp;
+        }
+    },
+    input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
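The QSYMM16 file that follows only supports LOGISTIC and TANH, and its (de)quantization is symmetric: a scale but no offset, with results clamped to the int16 range. A hedged scalar sketch of the round-trip that its leftover loop performs (assuming symmetric quantization semantics as described):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // QSYMM16 round-trip: dequantize (scale only), activate in float, requantize.
    int16_t qsymm16_tanh_sketch(int16_t q_in, float scale_in, float scale_out, float a, float b)
    {
        const float x = q_in * scale_in;      // dequantize_qsymm16
        const float y = a * std::tanh(b * x); // the TANH activation
        const int   q = static_cast<int>(std::lround(y / scale_out));
        return static_cast<int16_t>(std::min(32767, std::max(-32768, q))); // quantize_qsymm16
    }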
+ */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/experimental/Types.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/NESymm.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/common/StdTypes.h" +#include "src/core/common/Validate.h" + +#include +#include +#include + +namespace arm_compute +{ +namespace cpu +{ +void qsymm16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + constexpr int window_step_x = 8; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); + const auto vconst_1 = vdupq_n_f32(1.f); + const float32x4_t va_f32 = vdupq_n_f32(act_info.a()); + const float32x4_t vb_f32 = vdupq_n_f32(act_info.b()); + const float a_f32 = act_info.a(); + const float b_f32 = act_info.b(); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + wrapper::traits::neon_bitvector_t tmp; + ARM_COMPUTE_UNUSED(tmp); + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = vdequantize_int16(vin, qi_in.scale); + // Perform activation + const float32x4x2_t tmp_dep = + { + { + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), + } + }; + // Re-quantize to new output space + tmp = vquantize_int16(tmp_dep, qi_out.scale); + } + else if(act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + const auto vin_deq = vdequantize_int16(vin, qi_in.scale); + // Perform activation + const float32x4x2_t tmp_dep = + { + { + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), + } + }; + // Re-quantize to new output space + tmp = vquantize_int16(tmp_dep, qi_out.scale); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + qsymm16_t in = *(reinterpret_cast(input_ptr + x)); + qsymm16_t tmp = 0; + if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + float tmp_f = dequantize_qsymm16(in, qi_in.scale); + tmp_f = 1.f / (1.f + std::exp(-tmp_f)); + tmp = quantize_qsymm16(tmp_f, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::TANH) + { + float tmp_f = dequantize_qsymm16(in, qi_in.scale); + tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); + tmp = quantize_qsymm16(tmp_f, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported 
activation function"); + } + *(output_ptr + x) = tmp; + } + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/core/NEON/kernels/arm_gemm/convolver.hpp b/src/core/NEON/kernels/arm_gemm/convolver.hpp new file mode 100644 index 0000000000..879d95f5bb --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/convolver.hpp @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include "convolution_parameters.hpp" + +#include +#include +#include +#include + +namespace arm_gemm { + +// Class to assist with convolution calculations. +// +// This is framed as a hierarchy of objects: +// +// - Top level object which depends only on convolution parameters. This sets up std::vectors for the padding and +// kernel offset arrays. From this you can request: +// +// - Mid level object (e.g. instantiated at start of 'ConvolutionInterleave'). This holds specifics about the +// input tensor, and the desired column range. Calculations specific to this can be done once when this is set +// up. From this you can request: +// +// - Low level object (instantiated for each range of rows). This contains methods to actually populate a row +// pointer array. + + +template +class convolver { +private: + const ConvolutionParameters m_params; + + // Vector of padding data + const std::vector m_pad_row; + + // X/Y offsets for each kernel position + std::vector m_kernel_y; + std::vector m_kernel_x; + + class column_handler { + private: + const convolver &m_parent; + + // Base/stride of input image + const T * const m_input_base; + const size_t m_input_stride; + + // Starting kernel point and channel offset within that point + const unsigned int m_start_pos; + const unsigned int m_start_offset; + + // Total length to process, rounded length of each input channel block. 
+
+template<typename T>
+class convolver {
+private:
+    const ConvolutionParameters m_params;
+
+    // Vector of padding data
+    const std::vector<T> m_pad_row;
+
+    // X/Y offsets for each kernel position
+    std::vector<int> m_kernel_y;
+    std::vector<int> m_kernel_x;
+
+    class column_handler {
+    private:
+        const convolver &m_parent;
+
+        // Base/stride of input image
+        const T * const m_input_base;
+        const size_t    m_input_stride;
+
+        // Starting kernel point and channel offset within that point
+        const unsigned int m_start_pos;
+        const unsigned int m_start_offset;
+
+        // Total length to process, rounded length of each input channel block.
+        const unsigned int m_length;
+        const unsigned int m_rounded_stringlen;
+
+        class row_handler {
+        private:
+            const convolver      &m_convolver;
+            const column_handler &m_parent;
+
+            // These variables track progress through the current block of rows
+            unsigned int m_start_output_y=0;
+            unsigned int m_start_output_x=0;
+
+            unsigned int m_length_remaining=0;
+            unsigned int m_current_pos=0;
+
+            unsigned int m_active_height=0;
+
+        public:
+            row_handler(const column_handler &parent, unsigned int start_row, unsigned int active_height) :
+                m_convolver(parent.m_parent),
+                m_parent(parent),
+                m_start_output_y(start_row / m_convolver.m_params.output_width),
+                m_start_output_x(start_row % m_convolver.m_params.output_width),
+                m_length_remaining(m_parent.m_length),
+                m_current_pos(m_parent.m_start_pos),
+                m_active_height(active_height) { }
+
+            bool finished() const {
+                return (m_length_remaining == 0);
+            }
+
+            std::tuple<unsigned int, unsigned int> next_block(const T ** const row_ptr) {
+                if (finished()) {
+                    return std::make_tuple(0, 0);
+                }
+
+                // "in_width" is the amount of data that will be read in (copied)
+                // "out_width" is the total amount of data that will be produced (including padding)
+                unsigned int offset    = (m_current_pos == m_parent.m_start_pos) ? m_parent.m_start_offset : 0;
+                unsigned int in_width  = std::min(m_length_remaining, static_cast<unsigned int>(m_convolver.m_params.input_channels) - offset);
+                unsigned int out_width = std::min(m_length_remaining, m_parent.m_rounded_stringlen - offset);
+
+                unsigned int output_y = m_start_output_y;
+                unsigned int output_x = m_start_output_x;
+
+                for (unsigned int row=0; row<m_active_height; row++) {
+                    int input_y = (output_y * m_convolver.m_params.output_stride_h) + m_convolver.m_kernel_y[m_current_pos];
+                    int input_x = (output_x * m_convolver.m_params.output_stride_w) + m_convolver.m_kernel_x[m_current_pos];
+
+                    if (input_y < 0 || input_y >= m_convolver.m_params.input_height || input_x < 0 || input_x >= m_convolver.m_params.input_width) {
+                        row_ptr[row] = m_convolver.m_pad_row.data();
+                    } else {
+                        row_ptr[row] = m_parent.m_input_base + ((input_y * m_convolver.m_params.input_width) + input_x) * m_parent.m_input_stride;
+                    }
+
+                    output_x++;
+                    if (output_x == m_convolver.m_params.output_width) {
+                        output_y++;
+                        output_x=0;
+                    }
+                }
+
+                m_current_pos++;
+                m_length_remaining-=out_width;
+
+                return std::make_tuple(in_width, offset);
+            }
+        }; // end of "row handler" class
+
+    public:
+        column_handler(const convolver &parent, const T *input_base, size_t input_stride,
+                       unsigned int k_start, unsigned int k_end, unsigned int rounded_stringlen)
+            : m_parent(parent), m_input_base(input_base), m_input_stride(input_stride),
+              m_start_pos(k_start / rounded_stringlen),
+              m_start_offset(k_start % rounded_stringlen),
+              m_length(k_end - k_start),
+              m_rounded_stringlen(rounded_stringlen) { }
+
+        row_handler process_rows(unsigned int start_row, unsigned int active_height) const {
+            return row_handler(*this, start_row, active_height);
+        }
+    }; // end of "column handler" class
+
+public:
+    convolver(ConvolutionParameters params) :
+        m_params(params), m_pad_row(params.input_channels, static_cast<T>(params.padding_value)),
+        m_kernel_y(params.kernel_width * params.kernel_height, 0),
+        m_kernel_x(params.kernel_width * params.kernel_height, 0) {
+
+        // Kernel points are addressed across, then down (assumed weight layout is WHIO)
+        for (unsigned int ky=0; ky<params.kernel_height; ky++) {
+            for (unsigned int kx=0; kx<params.kernel_width; kx++) {
+                const unsigned int n = (ky * params.kernel_width) + kx;
+                m_kernel_y[n] = ky - params.padding_top;
+                m_kernel_x[n] = kx - params.padding_left;
+            }
+        }
+    }
+
+    column_handler process_columns(const T *input_base, size_t input_stride,
+                                   unsigned int k_start, unsigned int k_end, unsigned int rounded_stringlen) const {
+        return column_handler(*this, input_base, input_stride, k_start, k_end, rounded_stringlen);
+    }
+};
+
+} // namespace arm_gemm
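Not part of the patch: each gemm_*.cpp table below pairs a kernel name with an optional support predicate, an optional cycle estimate, and a factory lambda; selection walks the table and keeps the cheapest supported entry. A reduced sketch of that shape, where Args and Impl are stand-ins for the real GemmArgs/GemmImplementation:

#include <cstdint>
#include <cstdio>
#include <functional>
#include <vector>

struct Args { unsigned int Ksize; };

struct Impl {
    const char *name;
    std::function<bool(const Args &)> is_supported;       // nullptr => always supported
    std::function<uint64_t(const Args &)> cycle_estimate; // nullptr => cost 0 (preferred)
};

static const Impl *pick(const std::vector<Impl> &table, const Args &args) {
    const Impl *best = nullptr;
    uint64_t best_cost = UINT64_MAX;
    for (const auto &i : table) {
        if (i.is_supported && !i.is_supported(args)) continue; // skip unsupported entries
        const uint64_t cost = i.cycle_estimate ? i.cycle_estimate(args) : 0;
        if (cost < best_cost) { best = &i; best_cost = cost; }
    }
    return best;
}

int main() {
    std::vector<Impl> table = {
        { "big_k_kernel", [](const Args &a) { return a.Ksize > 4; }, nullptr },
        { "fallback", nullptr, [](const Args &a) { return a.Ksize * 10ull; } },
    };
    std::printf("%s\n", pick(table, Args{2})->name); // -> "fallback"
    return 0;
}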
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
--- a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
 static const GemmImplementation<bfloat16, float> gemm_bf16_methods[] = {
 #ifdef V8P6_BF
-# ifdef __ARM_FEATURE_SVE
-{
-    GemmMethod::GEMM_HYBRID,
-    "hybrid_bf16fp32_mmla_6VLx2",
-    [](const GemmArgs &args) { return (args._Ksize>=8); },
-    [](const GemmArgs &args) { return ((args._Msize <= 4) && (args._Nsize <= hybrid_bf16fp32_mmla_6VLx2::out_width())); },
-    [](const GemmArgs &args) { return new GemmHybrid<hybrid_bf16fp32_mmla_6VLx2, bfloat16, float>(args); }
-},
-{
-    GemmMethod::GEMM_HYBRID,
-    "hybrid_bf16fp32_mmla_8VLx2",
-    [](const GemmArgs &args) { return (args._Ksize>=8); },
-    [](const GemmArgs &args) { return (args._Msize <= 4); },
-    [](const GemmArgs &args) { return new GemmHybrid<hybrid_bf16fp32_mmla_8VLx2, bfloat16, float>(args); }
-},
-{
-    GemmMethod::GEMM_HYBRID,
-    "hybrid_bf16fp32_mmla_4VLx4",
-    [](const GemmArgs &args) { return (args._Ksize>=8); },
-    [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); },
-    [](const GemmArgs &args) { return new GemmHybrid<hybrid_bf16fp32_mmla_4VLx4, bfloat16, float>(args); }
-},
-{
-    GemmMethod::GEMM_HYBRID,
-    "hybrid_bf16fp32_dot_4VLx4",
-    [](const GemmArgs &args) { return (args._Ksize>=8); },
-    [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); },
-    [](const GemmArgs &args) { return new GemmHybrid<hybrid_bf16fp32_dot_4VLx4, bfloat16, float>(args); }
-},
+#ifdef __ARM_FEATURE_SVE
 { // gemm_bf16_interleaved
     GemmMethod::GEMM_INTERLEAVED,
-    "interleaved_bf16fp32_mmla_3VLx8",
+    "sve_interleaved_bf16fp32_mmla_8x3VL",
     [](const GemmArgs &args) { return (args._Ksize>4); },
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<interleaved_bf16fp32_mmla_3VLx8, bfloat16, float>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, bfloat16, float>(args); }
+},
+{
+    GemmMethod::GEMM_HYBRID,
+    "sve_hybrid_bf16fp32_dot_6x4VL",
+    nullptr,
+    [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); },
+    [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_bf16fp32_dot_6x4VL, bfloat16, float>(args); }
 },
 { // gemm_bf16_interleaved
     GemmMethod::GEMM_INTERLEAVED,
-    "interleaved_bf16fp32_dot_3VLx8",
+    "sve_interleaved_bf16fp32_dot_8x3VL",
     [](const GemmArgs &args) { return (args._Ksize>2); },
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<interleaved_bf16fp32_dot_3VLx8, bfloat16, float>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_dot_8x3VL, bfloat16, float>(args); }
 },
 # endif // SVE
 { // gemm_bf16_interleaved
     GemmMethod::GEMM_INTERLEAVED,
-    "interleaved_bf16fp32_mmla_12x8",
+    "a64_interleaved_bf16fp32_mmla_8x12",
     [](const GemmArgs &args) { return (args._Ksize>4); },
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<interleaved_bf16fp32_mmla_12x8, bfloat16, float>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, bfloat16, float>(args); }
+},
+{
+    GemmMethod::GEMM_HYBRID,
+    "a64_hybrid_bf16fp32_dot_6x16",
+    nullptr,
+    nullptr,
+    [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_bf16fp32_dot_6x16, bfloat16, float>(args); }
 },
 { // gemm_bf16_interleaved
     GemmMethod::GEMM_INTERLEAVED,
-    "interleaved_bf16fp32_dot_12x8",
+    "a64_interleaved_bf16fp32_dot_8x12",
     [](const GemmArgs &args) { return (args._Ksize>2); },
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<interleaved_bf16fp32_dot_12x8, bfloat16, float>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_bf16fp32_dot_8x12, bfloat16, float>(args); }
 },
 #endif // V8P6_BF
 #ifdef __aarch64__
 {
     GemmMethod::GEMM_INTERLEAVED,
-    "sgemm_12x8",
+    "a64_sgemm_8x12",
     nullptr,
     nullptr,
-    [](const GemmArgs &args) { return new GemmInterleaved<sgemm_12x8, bfloat16, float>(args); }
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_sgemm_8x12, bfloat16, float>(args); }
 },
 #elif defined(__arm__)
 {
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
index 91012218e5..75524fff97 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
@@ -29,15 +29,17 @@
 #include "gemm_common.hpp"
 #include "gemm_hybrid.hpp"
+#include "gemm_hybrid_indirect.hpp"
 #include "gemm_implementation.hpp"
 #include "gemm_interleaved.hpp"
 #include "gemm_interleaved_pretransposed_2d.hpp"
 #include "kernels/a32_sgemm_8x6.hpp"
-#include "kernels/a64_hgemm_24x8.hpp"
-#include "kernels/a64_sgemm_12x8.hpp"
-#include "kernels/sve_hybrid_fp16_mla_4VLx4.hpp"
-#include "kernels/sve_interleaved_fp16_mla_3VLx8.hpp"
+#include "kernels/a64_hgemm_8x24.hpp"
+#include "kernels/a64_hybrid_fp16_mla_6x32.hpp"
+#include
"kernels/a64_sgemm_8x12.hpp" +#include "kernels/sve_hybrid_fp16_mla_6x4VL.hpp" +#include "kernels/sve_interleaved_fp16_mla_8x3VL.hpp" namespace arm_gemm { @@ -45,61 +47,51 @@ static const GemmImplementation<__fp16, __fp16> gemm_fp16_methods[] = { #if defined(__ARM_FEATURE_SVE) { GemmMethod::GEMM_HYBRID, - "hybrid_fp16_mla_4VLx4", - [](const GemmArgs &args) { return (args._Ksize >= 8); }, + "sve_hybrid_fp16_mla_6x4VL", + nullptr, [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, - [](const GemmArgs &args) { return new GemmHybrid(args); } + [](const GemmArgs &args) { return new GemmHybridIndirect(args); } }, { GemmMethod::GEMM_INTERLEAVED, - "interleaved_fp16_mla_3VLx8", + "sve_interleaved_fp16_mla_8x3VL", [](const GemmArgs &args) { return (args._Ksize > 4); }, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif #if defined(__aarch64__) && (defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS)) -{ - GemmMethod::GEMM_INTERLEAVED_2D, - "hgemm_24x8_2d", +GemmImplementation<__fp16, __fp16>::with_estimate( + GemmMethod::GEMM_HYBRID, + "a64_hybrid_fp16_mla_6x32", #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC [](const GemmArgs &args) { return args._ci->has_fp16(); }, #else nullptr, #endif - [](const GemmArgs &args) { return args._maxthreads >= 8; }, - [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d(args); } -}, -{ + [](const GemmArgs &args) { return GemmHybridIndirect::estimate_cycles(args, cls_a64_hybrid_fp16_mla_6x32::get_performance_parameters(args._ci)); }, + [](const GemmArgs &args) { return new GemmHybridIndirect(args); } +), +GemmImplementation<__fp16, __fp16>::with_estimate( GemmMethod::GEMM_INTERLEAVED, - "hgemm_24x8_1d", + "a64_hgemm_8x24", #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC [](const GemmArgs &args) { return args._ci->has_fp16(); }, #else nullptr, #endif - nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } -}, - + [](const GemmArgs &args) { return GemmInterleaved::estimate_cycles(args, cls_a64_hgemm_8x24::get_performance_parameters(args._ci)); }, + [](const GemmArgs &args) { return new GemmInterleaved(args); } +), #endif // aarch64 && FP16 #ifdef __aarch64__ -//Pretranpose, 2D split -{ - GemmMethod::GEMM_INTERLEAVED_2D, - "sgemm_12x8_2d", - nullptr, - [](const GemmArgs &args) { return args._maxthreads >= 8; }, - [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d(args); } -}, -//Tranpose, 1D split, with blockmanager { GemmMethod::GEMM_INTERLEAVED, - "sgemm_12x8_1d", - nullptr, + "a64_sgemm_8x12", nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return !args._ci->has_fp16(); }, + [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #elif defined(__arm__) { diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp index ddb438f06c..e9e335f500 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp @@ -24,6 +24,7 @@ #include "arm_gemm.hpp" #include "gemm_common.hpp" #include "gemm_hybrid.hpp" +#include "gemm_hybrid_indirect.hpp" #include "gemm_implementation.hpp" #include "gemm_interleaved.hpp" #include "gemm_interleaved_pretransposed_2d.hpp" @@ -31,127 +32,130 @@ #include "gemv_pretransposed.hpp" #include "kernels/a32_sgemm_8x6.hpp" -#include 
"kernels/a64_hybrid_fp32_mla_16x4.hpp" -#include "kernels/a64_hybrid_fp32_mla_4x8.hpp" -#include "kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp" -#include "kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp" -#include "kernels/a64_sgemm_12x8.hpp" -#include "kernels/a64_sgemv_pretransposed.hpp" +#include "kernels/a64_gemv_fp32_mla_32.hpp" +#include "kernels/a64_hybrid_fp32_mla_6x16.hpp" +#include "kernels/a64_hybrid_fp32_mla_8x4.hpp" +#include "kernels/a64_sgemm_8x12.hpp" +#include "kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp" +#include "kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp" -#include "kernels/sve_hybrid_fp32_mla_4VLx4.hpp" -#include "kernels/sve_hybrid_fp32_mmla_4VLx4.hpp" -#include "kernels/sve_interleaved_fp32_mla_3VLx8.hpp" -#include "kernels/sve_interleaved_fp32_mmla_3VLx8.hpp" -#include "kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp" +#include "kernels/sve_gemv_fp32_mla_8VL.hpp" +#include "kernels/sve_hybrid_fp32_mla_6x4VL.hpp" +#include "kernels/sve_hybrid_fp32_mla_8x1VL.hpp" +#include "kernels/sve_interleaved_fp32_mla_8x3VL.hpp" +#include "kernels/sve_interleaved_fp32_mmla_8x3VL.hpp" +#include "kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp" namespace arm_gemm { static const GemmImplementation gemm_fp32_methods[] = { +// GEMV cases - starting with 'gemv_batched' wrapper to turn batched GEMV into GEMM. { GemmMethod::GEMV_BATCHED, "gemv_batched", - [](const GemmArgs &args) { return (args._Msize==1) && (args._nbatches>1); }, + [](const GemmArgs &args) { return args._Msize==1 && args._nbatches>1 && !args._indirect_input; }, nullptr, [](const GemmArgs &args) { return new GemvBatched(args); } }, #ifdef __aarch64__ +#ifdef __ARM_FEATURE_SVE { - GemmMethod::GEMV_PRETRANSPOSED, - "sgemv_pretransposed", - [](const GemmArgs &args) { return (args._Msize==1 && args._nbatches==1); }, + GemmMethod::GEMM_HYBRID, + "sve_gemv_fp32_mla_8VL", + [](const GemmArgs &args) { return args._Msize==1 && args._nbatches==1 && !args._indirect_input; }, nullptr, - [](const GemmArgs &args) { return new GemvPretransposed(args); } + [](const GemmArgs &args) { return new GemvPretransposed(args); } }, -#if defined(__ARM_FEATURE_SVE) && defined(MMLA_FP32) +#endif { GemmMethod::GEMM_HYBRID, - "hybrid_fp32_mmla_4VLx4", - [](const GemmArgs &args) { return (args._Ksize >= 4); }, - [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, - [](const GemmArgs &args) { return new GemmHybrid(args); } + "a64_gemv_fp32_mla_32", + [](const GemmArgs &args) { return args._Msize==1 && args._nbatches==1 && !args._indirect_input; }, + nullptr, + [](const GemmArgs &args) { return new GemvPretransposed(args); } }, + +// MMLA next due to higher throughput (SVE only) +#if defined(__ARM_FEATURE_SVE) && defined(MMLA_FP32) { GemmMethod::GEMM_INTERLEAVED, - "interleaved_fp32_mmla_3VLx8", + "sve_interleaved_fp32_mmla_8x3VL", [](const GemmArgs &args) { return (args._Ksize>4); }, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif // __ARM_FEATURE_SVE && MMLA_FP32 #ifdef __ARM_FEATURE_SVE -// SVE smallk / hybrid methods +// SVE smallk / hybrid methods { GemmMethod::GEMM_HYBRID, - "smallK_hybrid_fp32_mla_1VLx8", - [](const GemmArgs &args) { return (args._Ksize <= 24); }, + "sve_smallK_hybrid_fp32_mla_8x1VL", + [](const GemmArgs &args) { return args._Ksize <= 24 && !args._indirect_input; }, nullptr, - [](const GemmArgs &args) { return new GemmHybrid(args); } + [](const 
GemmArgs &args) { return new GemmHybrid(args); } }, { GemmMethod::GEMM_HYBRID, - "hybrid_fp32_mla_4VLx4", - [](const GemmArgs &args) { return (args._Ksize >= 4); }, + "sve_hybrid_fp32_mla_8x1VL", + nullptr, + [](const GemmArgs &args) { return (args._Nsize < 12); }, + [](const GemmArgs &args) { return new GemmHybridIndirect(args); } +}, +{ + GemmMethod::GEMM_HYBRID, + "sve_hybrid_fp32_mla_6x4VL", + nullptr, [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, - [](const GemmArgs &args) { return new GemmHybrid(args); } + [](const GemmArgs &args) { return new GemmHybridIndirect(args); } }, #endif // __ARM_FEATURE_SVE // NEON hybrid methods { GemmMethod::GEMM_HYBRID, - "smallK_hybrid_fp32_mla_4x8", - [](const GemmArgs &args) { return (args._Ksize <= 8) && (args._Nsize % 4)==0; }, + "a64_smallK_hybrid_fp32_mla_8x4", + [](const GemmArgs &args) { return args._Ksize <= 8 && (args._Nsize % 4)==0 && !args._indirect_input; }, nullptr, - [](const GemmArgs &args) { return new GemmHybrid(args); } + [](const GemmArgs &args) { return new GemmHybrid(args); } }, { GemmMethod::GEMM_HYBRID, - "smallK_hybrid_fp32_mla_4x6", - [](const GemmArgs &args) { return (args._Ksize > 8) && (args._Ksize <= 16) && (args._Nsize % 4)==0; }, + "a64_smallK_hybrid_fp32_mla_6x4", + [](const GemmArgs &args) { return (args._Ksize > 8 && args._Ksize <= 16) && (args._Nsize % 4)==0 && !args._indirect_input; }, nullptr, - [](const GemmArgs &args) { return new GemmHybrid(args); } + [](const GemmArgs &args) { return new GemmHybrid(args); } }, { GemmMethod::GEMM_HYBRID, - "hybrid_fp32_mla_4x8_normal", - [](const GemmArgs &args) { return (args._Ksize >= 4); }, + "a64_hybrid_fp32_mla_8x4", + nullptr, [](const GemmArgs &args) { return (args._Nsize < 12); }, - [](const GemmArgs &args) { return new GemmHybrid(args); } + [](const GemmArgs &args) { return new GemmHybridIndirect(args); } }, GemmImplementation::with_estimate( GemmMethod::GEMM_HYBRID, - "hybrid_fp32_mla_16x4", - [](const GemmArgs &args) { return (args._Ksize >= 4); }, - [](const GemmArgs &args) { return GemmHybrid::estimate_cycles(args, hybrid_fp32_mla_16x4::get_performance_parameters(args._ci)); }, - [](const GemmArgs &args) { return new GemmHybrid(args); } + "a64_hybrid_fp32_mla_6x16", + nullptr, + [](const GemmArgs &args) { return GemmHybridIndirect::estimate_cycles(args, cls_a64_hybrid_fp32_mla_6x16::get_performance_parameters(args._ci)); }, + [](const GemmArgs &args) { return new GemmHybridIndirect(args); } ), - #ifdef __ARM_FEATURE_SVE { GemmMethod::GEMM_INTERLEAVED, - "interleaved_fp32_mla_3VLx8", + "sve_interleaved_fp32_mla_8x3VL", [](const GemmArgs &args) { return (args._Ksize>4); }, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved(args); } + [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif // __ARM_FEATURE_SVE -// Pretranposed, 2D split -GemmImplementation::with_estimate( - GemmMethod::GEMM_INTERLEAVED_2D, - "sgemm_12x8_2d", - nullptr, - [](const GemmArgs &args) { return GemmInterleavedPretransposed2d::estimate_cycles(args, sgemm_12x8::get_performance_parameters(args._ci)); }, - [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d(args); } -), -// 1D split (with pretransposed or not) GemmImplementation::with_estimate( GemmMethod::GEMM_INTERLEAVED, - "sgemm_12x8_1d", + "a64_sgemm_8x12", nullptr, - [](const GemmArgs &args) { return GemmInterleaved::estimate_cycles(args, sgemm_12x8::get_performance_parameters(args._ci)); 
},
-    [](const GemmArgs &args) { return new GemmInterleaved<sgemm_12x8, float, float>(args); }
+    [](const GemmArgs &args) { return GemmInterleaved<cls_a64_sgemm_8x12, float, float>::estimate_cycles(args, cls_a64_sgemm_8x12::get_performance_parameters(args._ci)); },
+    [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_sgemm_8x12, float, float>(args); }
 ),
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
index 7a983ed6ac..d702cffce1 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
@@ -77,51 +77,43 @@ class GemmHybrid : public GemmCommon<To, Tr> {
             return args._cfg->inner_block_size;
         }

-        const unsigned int L1_size = args._ci->get_L1_cache_size();
+        // Target block size (512 for FP32, scaling for other types).  Don't block until size reaches 1.5X this.
+        unsigned int target_block_size = 2048 / sizeof(To);

-        // k_block: Find out how much of the larger array can be loaded into half the cache.
-        // This should account for associative caches.
-        unsigned int k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+        if (args._Ksize >= ((3 * target_block_size) / 2)) {
+            unsigned int target_blocks = iceildiv(args._Ksize, target_block_size);

-        // Needs to be (at least a single) multiple of the K unroll level.
-        k_block /= strategy::k_unroll();
-        k_block = std::max(k_block, 1U) * strategy::k_unroll();
+            unsigned int block_size = iceildiv(args._Ksize, target_blocks);

-        // Now tune to presented problem size; this is how many blocks we need.
-        unsigned int numk_blocks = iceildiv(args._Ksize, k_block);
+            block_size = roundup(block_size, strategy::k_unroll());

-        // So divide the space equally into that many blocks.
-        k_block = iceildiv(args._Ksize, numk_blocks);
-
-        // And round UP to the K unroll level required.
-        k_block = roundup(k_block, strategy::k_unroll());
+            return block_size;
+        }

-        return k_block;
+        return args._Ksize;
     }

+    // New N blocking strategy: if it's narrow, or much taller than it is wide, do the full width.  Otherwise do a
+    // single block.
     static unsigned int compute_n_block(const GemmArgs &args) {
         if (args._cfg && args._cfg->outer_block_size) {
             return args._cfg->outer_block_size;
         }

-        const unsigned int k_block = compute_k_block(args);
-        const unsigned int L2_size = args._ci->get_L2_cache_size();
-
-        // n_block: Work out how many rows (of length k_block) will fit in the L2
-        // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
-        unsigned int n_block = (((L2_size * 9) / 10) - (k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
-                               (sizeof(Toi) * k_block);
+        if (args._Nsize <= 64) {
+            return args._Nsize;
+        }

-        // Needs to be (at least a single) multiple of the kernel output width.
-        n_block /= strategy::out_width();
-        n_block = std::max(n_block, 1U) * strategy::out_width();
+        if ((args._Msize / args._Nsize) > 155) {
+            return args._Nsize;
+        }

-        // And tune to the presented problem size.
-        unsigned int numblocks = iceildiv(args._Nsize, n_block);
-        n_block = iceildiv(args._Nsize, numblocks);
-        n_block = roundup(n_block, strategy::out_width());
+        // Go slightly wider if thread count and depth are small.
+        if ((args._Ksize <= 128) && (args._maxthreads <= 16)) {
+            return strategy::out_width() * 3;
+        }

-        return n_block;
+        return strategy::out_width();
     }

 public:
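Not part of the patch: the replacement compute_k_block() above targets ~2KB of operand data per block (512 fp32 values) and only starts splitting K once it reaches 1.5x that target. A self-contained restatement for the fp32 case, with the roundup helpers inlined:

#include <cstdio>

static unsigned int iceildiv(unsigned int a, unsigned int b) { return (a + b - 1) / b; }
static unsigned int roundup(unsigned int a, unsigned int b) { return iceildiv(a, b) * b; }

// Mirrors the patch's logic: target 512 elements per block, split only beyond 768.
static unsigned int k_block_fp32(unsigned int K, unsigned int k_unroll) {
    const unsigned int target = 2048 / sizeof(float); // 512 elements
    if (K >= (3 * target) / 2) {                      // i.e. K >= 768: split evenly
        return roundup(iceildiv(K, iceildiv(K, target)), k_unroll);
    }
    return K; // small K: keep a single block
}

int main() {
    // K=1000 -> 2 blocks -> iceildiv(1000, 2) = 500, already a multiple of k_unroll=4
    std::printf("%u\n", k_block_fp32(1000, 4)); // -> 500
    return 0;
}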
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
new file mode 100644
index 0000000000..eede1a4f76
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
@@ -0,0 +1,621 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <alloca.h>
+
+#include <cassert>
+#include <vector>
+
+#include "arm_gemm.hpp"
+#include "bias_adder.hpp"
+#include "convolver.hpp"
+#include "ndrange.hpp"
+#include "performance_parameters.hpp"
+#include "transform.hpp"
+#include "utils.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+#ifndef UNUSED
+#define __I_DEFINED_UNUSED
+#define UNUSED(x)  ((void)(x))
+#endif
+
+namespace arm_gemm {
+
+namespace {
+
+// We need to invoke the kernel differently for quantizing and non-quantizing cases, so here is a shim class to do
+// that.
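// Editor's sketch (not part of the patch): the shim below is tag dispatch -- a class
// template specialized on the output-stage type, so one driver loop can call either the
// plain kernel or the separate row-sum + requantize sequence. Reduced to its essentials
// with stand-in types and hypothetical names:
//
//     struct Nothing {};
//     struct Requantize32 {};
//
//     template<typename OutputStage, bool SeparateQuantize = false>
//     struct run_kernel_shim {
//         static void run() { /* plain kernel call, bias + activation fused */ }
//     };
//
//     template<>
//     struct run_kernel_shim<Requantize32, true> {
//         static void run() { /* kernel into scratch, then row sums, then requantize */ }
//     };
//
// run_kernel_shim<Nothing>::run() and run_kernel_shim<Requantize32, true>::run() then
// compile to different call sequences from the same driver loop.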
+ +template +class run_hybrid_kernel { +public: + template + static void run ( +#ifdef CYCLE_PROFILING + profiler &prof, +#endif + const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg A_arg, unsigned int M, unsigned int N, + unsigned int kern_k, const To *b_ptr, IndirectOutputArg output_arg, const Tr *bias_ptr, Activation act, bool accumulate, + const OutputStage &os, const int32_t *col_bias, unsigned int n_0 ); +}; + +template<> +template +void run_hybrid_kernel::run( +#ifdef CYCLE_PROFILING + profiler &prof, +#endif + const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg A_arg, unsigned int M, unsigned int N, + unsigned int kern_k, const To *b_ptr, IndirectOutputArg output_arg, const Tr *bias_ptr, Activation act, bool accumulate, + const Nothing &, const int32_t *, unsigned int) { +#ifdef CYCLE_PROFILING + auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width())); +#endif + UNUSED(kern_k); + + strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, output_arg, bias_ptr, act, accumulate); +} + +template<> +template +void run_hybrid_kernel::run( +#ifdef CYCLE_PROFILING + profiler &prof, +#endif + const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg A_arg, unsigned int M, unsigned int N, + unsigned int kern_k, const To *b_ptr, IndirectOutputArg output_arg, const Tr *, Activation, bool, + const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) { +#ifdef CYCLE_PROFILING + auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width())); +#endif + UNUSED(kern_k); + + strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, output_arg, &os, col_bias + n_0, n_0); +} + +template<> +template +void run_hybrid_kernel::run( +#ifdef CYCLE_PROFILING + profiler &prof, +#endif + const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg A_arg, unsigned int M, unsigned int N, + unsigned int kern_k, const To *b_ptr, IndirectOutputArg output_arg, const Tr *, Activation, bool, + const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) { + UNUSED(kern_k); + // On this route we will only process one kernel height at a time and will make sure this happens in the driver loop. + assert(M <= strategy::out_height()); + // We don't yet support indirect output (as the quantizer can't do it). + assert(output_arg.is_indirect == false); + + // We need a row sum buffer and intermediate output buffer. + // These go on the stack as they are not too large, using an automatic array and alloca() respectively. + int32_t row_sums[strategy::out_height()]; + typename strategy::result_type *result_buffer; + + unsigned int output_width = roundup(N, strategy::out_width()); + + result_buffer = reinterpret_cast(alloca(output_width * strategy::out_height() * sizeof(typename strategy::result_type))); + + { +#ifdef CYCLE_PROFILING + auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width())); +#endif + // Perform the GEMM, into the output buffer. 
+ strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, IndirectOutputArg(result_buffer, output_width), nullptr, Activation(), false); + } + + if (os.b_offset != 0) { +#ifdef CYCLE_PROFILING + auto p = prof.ScopedProfiler(PROFILE_ROWSUMS, (unsigned long)M * kern_k); +#endif + row_sums_indirect(num_strings, string_ptr, A_arg, M, row_sums, &os); + } else { + memset(row_sums, 0, sizeof(int32_t) * strategy::out_height()); + } + + { +#ifdef CYCLE_PROFILING + auto p = prof.ScopedProfiler(PROFILE_QUANTIZE, (unsigned long)M * N); +#endif + // Quantize + requantize_block_32(os, N, M, result_buffer, output_width, output_arg.direct.base, output_arg.direct.stride, row_sums, col_bias + n_0, n_0); + } +} + +} // anonymous namespace + +// Implementation of the GemmCommon abstract class. +template +class GemmHybridIndirect : public GemmCommon { + typedef typename strategy::operand_type Toi; + typedef typename strategy::result_type Tri; + + GemmArgs _args; + OutputStage _os = {}; + + /* Quantized support (in addition to 'output stage' above) */ + int32_t *_col_bias = nullptr; + + const unsigned int _Ktotal; + const unsigned int _rounded_Ksize; + + /* Blocking info */ + const unsigned int _k_block; + const unsigned int _n_block; + const unsigned int _Mround; + + /* Pretransposed buffer. */ + const Toi *_B_transposed=nullptr; + + /* Indirect parameters. _indirect_buf doubles as a flag to indicate that "indirect" transform should be used. */ + const To * const * const * _indirect_buf = nullptr; + + /* Convolver - only set up for convolution problems, so also doubles as a flag. */ + std::unique_ptr> _convolver = nullptr; + + // Array of pointers to output rows +// Tr * const * _output_ptrs; + + const NDRange<4> _window_range; + + unsigned int get_col_sum_size() const { + if (std::is_same::value) { + return _args._Nsize * _args._nmulti * sizeof(int32_t); + } else { + return 0; + } + } + + static unsigned int get_ktotal(const GemmArgs &args) { + return args._Ksections * roundup(args._Ksize, strategy::k_unroll()); + } + + static unsigned int compute_k_block(const GemmArgs &args) { + // Some kernels don't support accumulate mode - these can't do K blocking at all. + if (!strategy::supports_accumulate() || std::is_same::value) { + return get_ktotal(args); + } + + if (args._cfg && args._cfg->inner_block_size) { + return args._cfg->inner_block_size; + } + + // Experimental data suggests an optimal block size of 512 for FP32 (scaling accordingly for other + // datatypes); but don't divide into blocks until we hit 1.5X this size. + unsigned int target_block_size = 2048 / sizeof(To); + auto ktotal = get_ktotal(args); + + if (ktotal > ((target_block_size*3)/2)) { + unsigned int target_blocks = iceildiv(ktotal, target_block_size); + + unsigned int block_size = iceildiv(ktotal, target_blocks); + + block_size = roundup(block_size, strategy::k_unroll()); + + return block_size; + } + + return ktotal; + } + + // New N blocking strategy: if it's narrow, or much taller than it is wide, do the full width. Otherwise do a + // single block. + static unsigned int compute_n_block(const GemmArgs &args, const OutputStage os = {}) { + if (args._cfg && args._cfg->outer_block_size) { + return args._cfg->outer_block_size; + } + + if (args._Nsize <= 64) { + return args._Nsize; + } + + if ((args._Msize / args._Nsize) > 155) { + return args._Nsize; + } + + // "Asymmetric" quantizing GEMMs require a different approach - the tall skinny blocks we would otherwise + // use imply a great deal of repeated work performing the row sums. 
If row sums are involved, work out how + // much "column" parallelism is going to be required and set the block size accordingly. + if (std::is_same::value) { + const Requantize32 *qp = reinterpret_cast(&os); + + // Row sums only needed if b_offset isn't 0 + if (qp->b_offset != 0) { + // We can already parallelize across batches, multis and rows (in units of 'out_height') + int multi_row_parallelism = args._nmulti * args._nbatches * iceildiv(args._Msize, strategy::out_height()); + + // If this isn't enough, we will need to split up the columns too. + if (multi_row_parallelism < args._maxthreads) { + unsigned int columns_needed = iceildiv(args._maxthreads, multi_row_parallelism); + + unsigned int n_block = iceildiv(args._Nsize, columns_needed); + + return roundup(n_block, strategy::out_width()); + } + + // Multi/Batch/Row parallelism is enough - don't split up the columns. + return args._Nsize; + } + } + + if (args._Ksize <= 128 && args._maxthreads <= 16) { + return strategy::out_width() * 3; + } + + return strategy::out_width(); + } + +public: + GemmHybridIndirect(GemmHybridIndirect &) = delete; + GemmHybridIndirect & operator= (GemmHybridIndirect &) = delete; + + /* Constructor */ + GemmHybridIndirect(const GemmArgs &args, const OutputStage &os) + : _args(args), _os(os), _Ktotal(get_ktotal(args)), + _rounded_Ksize(roundup(args._Ksize, strategy::k_unroll())), + _k_block(compute_k_block(args)), _n_block(compute_n_block(args, os)), + _Mround(roundup(args._Msize, strategy::out_height())), + _window_range(iceildiv(args._Msize, strategy::out_height()), args._nbatches, + iceildiv(args._Nsize, _n_block), args._nmulti) + { + // We take a copy of the arguments (not a pointer or reference), but there is no lifetime requirement on the + // GemmConfig. Clear out the pointer to avoid accidents. + _args._cfg = nullptr; + } + + /* Constructor without OutputStage */ + GemmHybridIndirect(const GemmArgs &args) + : _args(args), _Ktotal(get_ktotal(args)), + _rounded_Ksize(roundup(args._Ksize, strategy::k_unroll())), + _k_block(compute_k_block(args)), _n_block(compute_n_block(args)), + _Mround(roundup(args._Msize, strategy::out_height())), + _window_range(iceildiv(args._Msize, strategy::out_height()), args._nbatches, + iceildiv(args._Nsize, _n_block), args._nmulti) + { + // We take a copy of the arguments (not a pointer or reference), but there is no lifetime requirement on the + // GemmConfig. Clear out the pointer to avoid accidents. + _args._cfg = nullptr; + } + + // Interface implementation - Compulsory functions + ndrange_t get_window_size() const override { + return { _window_range.total_size() }; + } + + // This kernel can always be dynamically scheduled. + bool supports_dynamic_scheduling() const override { + return true; + } + + // Execute + void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override { +#ifdef CYCLE_PROFILING + profiler prof; +#endif + strategy strat(_args._ci); + + std::vector in_row_ptrs; + std::vector in_row_strings; + std::vector string_lengths; + + // In convolution mode, we need input pointers. + if (_convolver) { + in_row_ptrs = std::vector(strategy::out_height() * _args._Ksections, nullptr); + in_row_strings = std::vector(_args._Ksections, nullptr); + + for (unsigned int i=0; i<_args._Ksections; i++) { + in_row_strings[i] = &(in_row_ptrs[i * strategy::out_height()]); + } + } + + // In any indirect mode, we need the string lengths. + if (_args._indirect_input) { + string_lengths = std::vector(_args._Ksections, 0); + } + + /* Make sure we've been set up correctly. 
*/ + assert(_B_transposed); + static_assert(std::is_same::value, "gemm_native: Operand types must be the same."); +// static_assert(std::is_same::value, "gemm_native: Result types must be the same."); + + /* For now, each work item implies all the K for a given output + * pixel (so we don't need to synchronize access to the output + * array). So separate the loop over K blocks here. */ + for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) { + unsigned int kmax = std::min(k0 + _k_block, _Ktotal); + unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll()); + + const bool first_pass = (k0 == 0); + const bool last_pass = (kmax == _Ktotal); + + unsigned int first_section = (k0 / _rounded_Ksize); + unsigned int first_offset = (k0 % _rounded_Ksize); + unsigned int kleft = kern_k; + unsigned int sections=0; + unsigned int offset = first_offset; + + if (_args._indirect_input) { + while (kleft) { + // When chopping into sections: the amount that goes into 'string_lengths' is the amount to be + // processed (excluding padding). But the amount we subtract from 'kleft' takes account of any + // padding applied. + string_lengths[sections] = std::min(kleft, _args._Ksize - offset); + kleft -= std::min(kleft, _rounded_Ksize - offset); + sections++; + offset=0; + } + } + + auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0)); + + if (p.done()) { + return; + } + + // Process rows either 'out_height' rows at a time, or do all valid rows at once with a single kernel call. + // The separate quantizer path only handles one block of rows at a time (as it has to store sums and intermediate results). + // THe convolution path only generates the pointers for one block of rows at a time. + const bool process_all_rows = (!SeparateQuantize && !_convolver); + + do { + const unsigned int m_start = p.dim(0) * strategy::out_height(); + const unsigned int m_end = process_all_rows ? std::min(p.dim0_max() * strategy::out_height(), _args._Msize) : std::min(m_start + strategy::out_height(), _args._Msize); +// const unsigned int m_end = std::min(m_start + strategy::out_height(), _args._Msize); + const unsigned int batch = p.dim(1); + const unsigned int n0 = p.dim(2) * _n_block; + const unsigned int nmax = std::min(n0 + _n_block, _args._Nsize); + const unsigned int multi = p.dim(3); + + const Toi *b_panel = _B_transposed + + (multi * roundup(_args._Nsize, strategy::out_width()) * _Ktotal) + + (k0 * roundup(_args._Nsize, strategy::out_width())) + + (n0 * kern_k); + + IndirectOutputArg out_arg(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc); + +#ifdef CYCLE_PROFILING + auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width())); +#endif + if (_indirect_buf) { + run_hybrid_kernel::run( +#ifdef CYCLE_PROFILING + prof, +#endif + strat, sections, string_lengths.data(), + IndirectInputArg(_indirect_buf + (multi * _args._nbatches * _args._Ksections) + (batch * _args._Ksections) + first_section, m_start, first_offset), + (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg, + (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr, + last_pass ? 
_args._act : Activation(), + !first_pass, + // Quantization parameters + _os, _col_bias+(multi * _args._Nsize), n0); + } else if (_convolver) { + auto conv_cols = _convolver->process_columns(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride), this->_lda, k0, kmax, _rounded_Ksize); + + unsigned int pos=0; + auto conv_rows = conv_cols.process_rows(m_start, m_end - m_start); + + while (!conv_rows.finished()) { + unsigned int width, conv_offset; + + assert(pos < sections); + + std::tie(width, conv_offset) = conv_rows.next_block(&(in_row_ptrs[pos * strategy::out_height()])); + + if (pos==0) { + assert(conv_offset == first_offset); + } + assert(width == string_lengths[pos]); + pos++; + } + assert(pos == sections); + + run_hybrid_kernel::run( +#ifdef CYCLE_PROFILING + prof, +#endif + strat, sections, string_lengths.data(), + IndirectInputArg(in_row_strings.data(), 0, first_offset), + (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg, + (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr, + last_pass ? _args._act : Activation(), + !first_pass, + // Quantization parameters + _os, _col_bias+(multi * _args._Nsize), n0); + } else { + // Length to process. This needs to exclude padding, but 'kmax' potentially includes it. + const unsigned int len = (std::min(_args._Ksize, kmax) - k0); + + run_hybrid_kernel::run( +#ifdef CYCLE_PROFILING + prof, +#endif + strat, 1, &len, + IndirectInputArg(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + m_start * this->_lda + k0, this->_lda), + (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg, + (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr, + last_pass ? _args._act : Activation(), + !first_pass, + // Quantization parameters + _os, _col_bias+(multi * _args._Nsize), n0); + } + } while (process_all_rows ? p.next_dim1() : p.next_dim0()); + } + } + + // Interface implementation - pretransposed + bool B_is_pretransposed() const override { + return true; + } + + bool B_pretranspose_required() const override { + return (_B_transposed==nullptr); + } + + size_t get_B_pretransposed_array_size() const override { + // Start with actual pretransposed buffer... + size_t size = roundup(_args._Nsize, strategy::out_width()) * _Ktotal * _args._nmulti * sizeof(Toi); + + // Space for result row pointers (not strictly needed any more but retained for indirect output testing) + size += _args._Msize * _args._nbatches * _args._nmulti * sizeof(const Tr *); + + if (std::is_same::value) { + size += get_col_sum_size(); + } + + return size; + } + + void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override { + if (std::is_same::value) { + _col_bias = reinterpret_cast(in_buffer); + + Requantize32 *qp_ptr = reinterpret_cast(&_os); + + for (unsigned int i=0; i<_args._nmulti; i++) { + // The input is assumed not to have any padding between sections, so straightforward Ksize * Ksections computation gets the total size. 
+ compute_col_sums(*qp_ptr, _args._Nsize, _args._Ksize * _args._Ksections, B + (i * B_multi_stride), ldb, _col_bias + (i * _args._Nsize), _args._Ksize * _args._Ksections, i, 0); + } + } + + // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0 + uintptr_t buffer_int = reinterpret_cast(in_buffer); + Toi *buffer = reinterpret_cast(buffer_int + get_col_sum_size()); + _B_transposed = buffer; + + strategy strat(_args._ci); + + for (unsigned int multi=0; multi<_args._nmulti; multi++) { + for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) { + const unsigned int kmax=std::min(k0 + _k_block, _Ktotal); + + /* Figure out the size of each block. */ + unsigned int k_size = kmax - k0; + + // We need to insert padding at the end of each K section. + // The computation needed is a little delicate - the coordinates from the block walker are expressed in + // terms of the full, padded, _Ktotal. + // But we need to transform each section with reference to the original, unpadded, input, letting the + // transform pad each section as needed. + + // This is needed for computations below. + const unsigned int rounded_section_size = roundup(_args._Ksize, strategy::k_unroll()); + + // The expected output format is also an entire columns interleaved, then the next set of + // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at + // a time. + for (unsigned int x0=0; x0 < _args._Nsize; x0 += strategy::out_width() ){ + unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize); + + // Track where we are and how much work is left. + unsigned int kpos = k0; + unsigned int kleft = k_size; + + while (kleft) { + // Which section are we in? Based on the rounded-up section size. + unsigned int k_section_base = kpos / rounded_section_size; + // How far into the section are we? + unsigned int k_offset = kpos - (k_section_base * rounded_section_size); + + // We will either copy the rest of this section, or to the end of the requested length. + unsigned int k_length = std::min(_args._Ksize - k_offset, kleft); + + strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb, + x0, xmax, + (k_section_base * _args._Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length. + (k_section_base * _args._Ksize) + k_offset + k_length); // K end point - starting point plus length computed above. + + // We need to modify our position based on the ROUNDED version of what we just did. + unsigned int padded_length = roundup(k_length, strategy::k_unroll()); + + buffer += strategy::out_width() * padded_length; + + kpos += padded_length; + kleft -= padded_length; + } + } + } + } + } + + void set_pretransposed_B_data(void *in_buffer) override { + // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0 + uintptr_t buffer_int = reinterpret_cast(in_buffer); + _B_transposed = reinterpret_cast(buffer_int + get_col_sum_size()); + _col_bias = reinterpret_cast(in_buffer); + } + + // Estimate cycles for given problem given provided parameters + static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters ¶ms) { + // Note: Current hybrid kernels don't actually round up height (they + // have paths for each possible height). Might need to make this + // configurable in future. 
+        uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * args._Msize * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll());
+
+        float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;
+
+        // TODO: A bit of a kludge here: current hybrid kernels incur extra
+        // overhead where the width is not a multiple of kernel width.  It's
+        // most noticeable where the overall width is quite low, so add 15%
+        // penalty for such widths.
+        if ((args._Nsize < strategy::out_width()) || (args._Nsize > strategy::out_width() && args._Nsize < 2*strategy::out_width())) {
+            mac_cycles *= 1.15f;
+        }
+
+        uint64_t total_cycles = mac_cycles;
+
+        return total_cycles;
+    }
+
+    void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override {
+        if (std::is_same<OutputStage, Requantize32>::value) {
+            Requantize32 *qp = reinterpret_cast<Requantize32 *>(&_os);
+
+            qp->bias = bias;
+            qp->bias_multi_stride = bias_multi_stride;
+        }
+    }
+
+    void set_indirect_parameters(size_t string_len, const To * const * const *ptr) override {
+        assert(string_len == _args._Ksize);
+        _indirect_buf = ptr;
+    }
+
+    void set_convolution_parameters(ConvolutionParameters parms) override {
+        assert(parms.input_channels == _args._Ksize);
+        _convolver = std::unique_ptr<convolver<To>>(new convolver<To>(parms));
+    }
+};
+
+} // namespace arm_gemm
+
+#ifdef __I_DEFINED_UNUSED
+#undef UNUSED
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
index 915227fc29..7a5fa87ee6 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
@@ -118,18 +118,27 @@ class GemmHybridQuantized : public GemmCommon<To, Tr> {
         // n_block: Work out how many rows (of length k_block) will fit in the L2
         // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
-        unsigned int n_block = (((L2_size * 9) / 10) - (k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
-                               (sizeof(Toi) * k_block);
+        const unsigned int scaled_l2_size = (L2_size * 9) / 10;
+        const unsigned int k_block_area = k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height());
+
+        // .. if the L1 contents are bigger than the L2, just return a minimal size block.
+        if (k_block_area > scaled_l2_size) {
+            return strategy::out_width();
+        }
+
+        unsigned int n_block = (scaled_l2_size - k_block_area) / (sizeof(Toi) * k_block);

         // Needs to be (at least a single) multiple of the kernel output width.
         n_block /= strategy::out_width();
-        n_block = std::max(n_block, 1U) * strategy::out_width();
+        n_block = std::max(n_block, 1u) * strategy::out_width();

         // And tune to the presented problem size.
         unsigned int numblocks = iceildiv(args._Nsize, n_block);
         n_block = iceildiv(args._Nsize, numblocks);
         n_block = roundup(n_block, strategy::out_width());

+        assert(n_block > 0);
+
         return n_block;
     }
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp
new file mode 100644
index 0000000000..7376b5ffe3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2017-2019 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include + +#include + +#include "arm_gemm.hpp" +#include "ndrange.hpp" +#include "utils.hpp" + +#include "mergeresults.hpp" +#include "transform.hpp" + +#ifdef CYCLE_PROFILING +#include "profiler.hpp" +#endif + +namespace arm_gemm { + +// Implementation of the GemmCommon abstract class. +template +class GemmHybridQuantizedInline : public GemmCommon { + typedef typename strategy::operand_type Toi; + typedef typename strategy::result_type Tri; + + /* const properties set by constructor */ + const CPUInfo * const _ci; + + const unsigned int _Msize; + const unsigned int _Nsize; + const unsigned int _Ksize; + + const unsigned int _nbatches; + const unsigned int _nmulti; + + /* Blocking info */ + const unsigned int _k_block; + const unsigned int _n_block; + const unsigned int _Mround; + + /* Pretransposed buffer. */ + const Toi *_B_transposed=nullptr; + + const NDRange<4> _window_range; + + Requantize32 _qp; + int32_t *col_bias = nullptr; + + void *working_space = nullptr; + + unsigned int _nthreads; + + unsigned int get_col_sum_size() const { + return _Nsize * _nmulti * sizeof(int32_t); + } + + static unsigned int compute_k_block(const GemmArgs &args) { + // We don't support K blocks as we only temporarily store 32 bit results. + return args._Ksize; + + if (args._cfg && args._cfg->inner_block_size) { + return args._cfg->inner_block_size; + } + + const unsigned int L1_size = args._ci->get_L1_cache_size(); + + // k_block: Find out how much of the larger array can be loaded into half the cache. + // This should account for associative caches. + unsigned int k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height()))); + + // Needs to be (at least a single) multiple of the K unroll level. + k_block /= strategy::k_unroll(); + k_block = std::max(k_block, 1U) * strategy::k_unroll(); + + // Now tune to presented problem size; this is how many blocks we need. + unsigned int numk_blocks = iceildiv(args._Ksize, k_block); + + // So divide the space equally into that many blocks. + k_block = iceildiv(args._Ksize, numk_blocks); + + // And round UP to the K unroll level required. 
+ k_block = roundup(k_block, strategy::k_unroll()); + + return k_block; + } + + static unsigned int compute_n_block(const GemmArgs &args) { + if (args._cfg && args._cfg->outer_block_size) { + return args._cfg->outer_block_size; + } + + const unsigned int k_block = compute_k_block(args); + const unsigned int L2_size = args._ci->get_L2_cache_size(); + + // n_block: Work out how many rows (of length k_block) will fit in the L2 + // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents. + unsigned int n_block = (((L2_size * 9) / 10) - (k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) / + (sizeof(Toi) * k_block); + + // Needs to be (at least a single) multiple of the kernel output width. + n_block /= strategy::out_width(); + n_block = std::max(n_block, 1U) * strategy::out_width(); + + // And tune to the presented problem size. + unsigned int numblocks = iceildiv(args._Nsize, n_block); + n_block = iceildiv(args._Nsize, numblocks); + n_block = roundup(n_block, strategy::out_width()); + + return n_block; + } + +public: + GemmHybridQuantizedInline(GemmHybridQuantizedInline &) = delete; + GemmHybridQuantizedInline & operator= (GemmHybridQuantizedInline &) = delete; + + /* Constructor */ + GemmHybridQuantizedInline(const GemmArgs &args, const Requantize32 &qp) + : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), + _nbatches(args._nbatches), _nmulti(args._nmulti), + _k_block(compute_k_block(args)), _n_block(compute_n_block(args)), + _Mround(roundup(args._Msize, strategy::out_height())), + _window_range(iceildiv(args._Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmulti), + _qp (qp), _nthreads(args._maxthreads) { } + + // Interface implementation - Compulsory functions + ndrange_t get_window_size() const override { + return { _window_range.total_size() }; + } + + // This kernel can always be dynamically scheduled. + bool supports_dynamic_scheduling() const override { + return true; + } + + // Execute + void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override { +#ifdef CYCLE_PROFILING + profiler prof; +#endif + strategy strat(_ci); + + /* Make sure we've been set up correctly. */ + assert(_B_transposed); + static_assert(std::is_same::value, "gemm_native: Operand types must be the same."); + + /* For now, each work item implies all the K for a given output + * pixel (so we don't need to synchronize access to the output + * array). So separate the loop over K blocks here. 
*/ + for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) { + unsigned int kmax = std::min(k0 + _k_block, _Ksize); + unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll()); + + auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0)); + + if (p.done()) { + return; + } + + do { + const unsigned int m_start = p.dim(0) * strategy::out_height(); + const unsigned int m_end = std::min(p.dim0_max() * strategy::out_height(), _Msize); + const unsigned int batch = p.dim(1); + const unsigned int n0 = p.dim(2) * _n_block; + const unsigned int nmax = std::min(n0 + _n_block, _Nsize); + const unsigned int multi = p.dim(3); + + const Toi *b_panel = _B_transposed + + (multi * roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll())) + + (k0 * roundup(_Nsize, strategy::out_width())) + + (n0 * kern_k); + + { +#ifdef CYCLE_PROFILING + auto p = prof.ScopedProfiler(PROFILE_KERNEL, (m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width())); +#endif + strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + k0, this->_lda, + b_panel, + this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc, + (m_end - m_start), (nmax - n0), kmax - k0, + col_bias + (multi * _Nsize) + n0, _qp); + } + } while (p.next_dim1()); + } + } + + // Interface implementation - pretransposed + bool B_is_pretransposed() const override { + return true; + } + + bool B_pretranspose_required() const override { + return (_B_transposed==nullptr); + } + + size_t get_B_pretransposed_array_size() const override { + return get_col_sum_size() + (roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll()) * _nmulti * sizeof(Toi)); + } + + void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override { + col_bias = reinterpret_cast(in_buffer); + + for (unsigned int i=0; i<_nmulti; i++) { + compute_col_sums(_qp, _Nsize, _Ksize, B + (i * B_multi_stride), ldb, col_bias + (i * _Nsize), _Ksize, i, 0); + } + + uintptr_t buffer_int = reinterpret_cast(in_buffer); + Toi *buffer = reinterpret_cast(buffer_int + get_col_sum_size()); + _B_transposed = buffer; + strategy strat(_ci); + + for (unsigned int multi=0; multi<_nmulti; multi++) { + for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) { + const unsigned int kmax = std::min(k0 + _k_block, _Ksize); + const unsigned int k_size = roundup(kmax-k0, strategy::k_unroll()); + + for (unsigned int x0=0; x0<_Nsize; x0+=_n_block) { + const unsigned int xmax = std::min(x0+_n_block, _Nsize); + + const unsigned int size = roundup(xmax-x0, strategy::out_width()) * k_size; + + strat.transforms.PrepareB( buffer, B + (multi * B_multi_stride), ldb, + x0, xmax, k0, kmax); + + buffer += size; + } + } + } + } + + void set_pretransposed_B_data(void *in_buffer) override { + uintptr_t buffer_int = reinterpret_cast(in_buffer); + _B_transposed = reinterpret_cast(buffer_int + get_col_sum_size()); + col_bias = reinterpret_cast(in_buffer); + } + + void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override { + _qp.bias = bias; + _qp.bias_multi_stride = bias_multi_stride; + } +}; + +} // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp index 261e7d2d9c..f6a0fc5d52 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp +++ 
b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp @@ -37,9 +37,9 @@ template struct GemmImplementation { const GemmMethod method; const char * name; - std::function is_supported; - std::function cycle_estimate; - std::function *(const GemmArgs &, const OutputStage &)> instantiate; + std::function is_supported = {}; + std::function cycle_estimate = {}; + std::function *(const GemmArgs &, const OutputStage &)> instantiate = {}; bool do_is_supported(const GemmArgs &args, const OutputStage &os) const { if (is_supported != nullptr) { @@ -57,13 +57,13 @@ struct GemmImplementation { } } - GemmImplementation(const GemmImplementation &) = default; - GemmImplementation &operator= (const GemmImplementation &) = default; - GemmCommon *do_instantiate(const GemmArgs &args, const OutputStage &os) const { return instantiate(args, os); } + GemmImplementation(const GemmImplementation &) = default; + GemmImplementation & operator= (const GemmImplementation &) = default; + GemmImplementation(GemmMethod m, const char *n, std::function is_supported, std::function is_recommended, std::function *(const GemmArgs &, const OutputStage &)> instantiate) : @@ -79,9 +79,9 @@ template struct GemmImplementation { const GemmMethod method; const char * name; - std::function is_supported; - std::function cycle_estimate; - std::function *(const GemmArgs &)> instantiate; + std::function is_supported = {}; + std::function cycle_estimate = {}; + std::function *(const GemmArgs &)> instantiate = {}; bool do_is_supported(const GemmArgs &args, const Nothing &) const { if (is_supported != nullptr) { @@ -103,7 +103,6 @@ struct GemmImplementation { return instantiate(args); } - static GemmImplementation with_estimate(GemmMethod m, const char *n, std::function is_supported, std::function cycle_estimate, std::function *(const GemmArgs &)> instantiate) { @@ -116,7 +115,10 @@ struct GemmImplementation { return impl; } - GemmImplementation(GemmMethod m, const char * n) : method(m), name(n), is_supported(nullptr), cycle_estimate(nullptr), instantiate(nullptr) {} + GemmImplementation(const GemmImplementation &) = default; + GemmImplementation & operator= (const GemmImplementation &) = default; + + GemmImplementation(GemmMethod m, const char * n) : method(m), name(n) {} GemmImplementation(GemmMethod m, const char *n, std::function is_supported, std::function is_recommended, @@ -124,9 +126,6 @@ struct GemmImplementation { method(m), name(n), is_supported(is_supported), cycle_estimate( [is_recommended](const GemmArgs &args) -> uint64_t { return (is_recommended == nullptr) ? 0 : (is_recommended(args) ? 0 : UINT64_MAX); } ), instantiate(instantiate) { } - - GemmImplementation(const GemmImplementation &) = default; - GemmImplementation &operator=(const GemmImplementation &) = default; }; /* "Master" function implemented for each valid combination of types. @@ -211,6 +210,7 @@ std::vector get_compatible_kernels(const GemmArgs &args, cons for (const GemmImplementation *i = gemms; i->method != GemmMethod::DEFAULT; i++) { /* Check that this implementation supports the presented problem. 
+ if (!i->do_is_supported(args, os)) { continue; } diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp index da682330a0..a3a61959c3 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp @@ -28,17 +28,17 @@ #include "gemm_implementation.hpp" #include "gemm_interleaved.hpp" -#include "kernels/a64_gemm_s16_12x8.hpp" +#include "kernels/a64_gemm_s16_8x12.hpp" namespace arm_gemm { static const GemmImplementation<int16_t, int32_t> gemm_s16_methods[] = { { GemmMethod::GEMM_INTERLEAVED, - "gemm_s16_12x8", + "a64_gemm_s16_8x12", nullptr, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved<gemm_s16_12x8, int16_t, int32_t>(args); } + [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s16_8x12, int16_t, int32_t>(args); } }, { GemmMethod::DEFAULT,
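Each entry in these method tables pairs a kernel name with optional 'supported' and 'recommended' predicates plus a factory lambda; the selector walks the table in order until the DEFAULT sentinel. A simplified sketch of that walk (illustrative only; 'pick_gemm' is a hypothetical name, and the real selector also honours the cycle estimates produced by with_estimate()):

    template<typename Top, typename Tret>
    const GemmImplementation<Top, Tret> *pick_gemm(const GemmImplementation<Top, Tret> *table, const GemmArgs &args) {
        for (const GemmImplementation<Top, Tret> *i = table; i->method != GemmMethod::DEFAULT; i++) {
            if (!i->do_is_supported(args, Nothing())) {
                continue;   // predicate rejected this kernel for the given shape/CPU
            }
            return i;       // earlier entries are preferred, so first supported entry wins
        }
        return nullptr;
    }
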
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp index bddcc8dab1..31f225002e 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp @@ -26,21 +26,22 @@ #include "arm_gemm.hpp" #include "gemm_common.hpp" #include "gemm_hybrid.hpp" +#include "gemm_hybrid_indirect.hpp" #include "gemm_implementation.hpp" #include "gemm_interleaved.hpp" -#include "gemm_interleaved_pretransposed_2d.hpp" -#include "kernels/a64_gemm_s16_12x8.hpp" -#include "kernels/a64_gemm_s8_12x8.hpp" +#include "kernels/a64_gemm_s16_8x12.hpp" +#include "kernels/a64_gemm_s8_8x12.hpp" #include "kernels/a64_gemm_s8_4x4.hpp" -#include "kernels/a64_hybrid_s8s32_dot_16x4.hpp" -#include "kernels/a64_interleaved_s8s32_mmla_12x8.hpp" -#include "kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp" -#include "kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp" -#include "kernels/sve_hybrid_s8s32_dot_4VLx4.hpp" -#include "kernels/sve_interleaved_s8s32_dot_3VLx8.hpp" -#include "kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp" -#include "kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp" +#include "kernels/a64_hybrid_s8s32_dot_6x16.hpp" +#include "kernels/a64_interleaved_s8s32_mmla_8x12.hpp" +#include "kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp" +#include "kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp" + +#include "kernels/sve_hybrid_s8s32_dot_6x4VL.hpp" +#include "kernels/sve_interleaved_s8s32_dot_8x3VL.hpp" +#include "kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp" +#include "kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp" namespace arm_gemm { @@ -49,98 +50,84 @@ static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = { #ifdef MMLA_INT8 { GemmMethod::GEMM_INTERLEAVED, - "interleaved_s8s32_mmla_3VLx8", + "sve_interleaved_s8s32_mmla_8x3VL", [](const GemmArgs &args) { return (args._Ksize>8); }, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved<interleaved_s8s32_mmla_3VLx8, int8_t, int32_t>(args); } + [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int32_t>(args); } }, #endif { GemmMethod::GEMM_HYBRID, - "smallK_hybrid_s8s32_dot_1VLx8", - [](const GemmArgs &args) { return args._Ksize<=64; }, + "sve_smallK_hybrid_s8s32_dot_8x1VL", + [](const GemmArgs &args) { return args._Ksize<=64 && !args._indirect_input; }, nullptr, - [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_s8s32_dot_1VLx8, int8_t, int32_t>(args); } + [](const GemmArgs &args) { return new GemmHybrid<cls_sve_smallK_hybrid_s8s32_dot_8x1VL, int8_t, int32_t>(args); } }, { GemmMethod::GEMM_HYBRID, - "hybrid_s8s32_dot_4VLx4", + "sve_hybrid_s8s32_dot_6x4VL", [](const GemmArgs &args) { return args._Ksize>=16; }, [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, - [](const GemmArgs &args) { return new GemmHybrid<hybrid_s8s32_dot_4VLx4, int8_t, int32_t>(args); } + [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_s8s32_dot_6x4VL, int8_t, int32_t>(args); } }, { GemmMethod::GEMM_INTERLEAVED, - "interleaved_s8s32_dot_3VLx8", + "sve_interleaved_s8s32_dot_8x3VL", [](const GemmArgs &args) { return (args._Ksize>4); }, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved<interleaved_s8s32_dot_3VLx8, int8_t, int32_t>(args); } + [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int32_t>(args); } }, -#endif +#endif // SVE #ifdef MMLA_INT8 { GemmMethod::GEMM_INTERLEAVED, - "interleaved_s8s32_mmla_12x8", + "a64_interleaved_s8s32_mmla_8x12", [](const GemmArgs &args) { return (args._Ksize>8); }, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved<interleaved_s8s32_mmla_12x8, int8_t, int32_t>(args); } + [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, int32_t>(args); } }, #endif { GemmMethod::GEMM_HYBRID, - "smallK_hybrid_s8s32_dot_4x8", - [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32); }, + "a64_smallK_hybrid_s8s32_dot_8x4", + [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._indirect_input; }, nullptr, - [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_s8s32_dot_4x8, int8_t, int32_t>(args); } + [](const GemmArgs &args) { return new GemmHybrid<cls_a64_smallK_hybrid_s8s32_dot_8x4, int8_t, int32_t>(args); } }, { GemmMethod::GEMM_HYBRID, - "smallK_hybrid_s8s32_dot_4x6", - [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64); }, + "a64_smallK_hybrid_s8s32_dot_6x4", + [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._indirect_input; }, nullptr, - [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_s8s32_dot_4x6, int8_t, int32_t>(args); } + [](const GemmArgs &args) { return new GemmHybrid<cls_a64_smallK_hybrid_s8s32_dot_6x4, int8_t, int32_t>(args); } }, { - GemmMethod::GEMM_HYBRID, - "hybrid_s8s32_dot_16x4", - [](const GemmArgs &args) { return args._ci->has_dotprod() && args._Ksize>=16; }, - [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; }, - [](const GemmArgs &args) { return new GemmHybrid<hybrid_s8s32_dot_16x4, int8_t, int32_t>(args); } + GemmMethod::GEMM_INTERLEAVED, + "a64_gemm_s16_8x12", + nullptr, + [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53 && args._Ksize>4; }, + [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s16_8x12, int8_t, int32_t>(args); }, }, { - GemmMethod::GEMM_INTERLEAVED_2D, - "gemm_s8_12x8_2d", + GemmMethod::GEMM_HYBRID, + "a64_hybrid_s8s32_dot_6x16", [](const GemmArgs &args) { return args._ci->has_dotprod(); }, - [](const GemmArgs &args) { return (args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8); }, - [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<gemm_s8_12x8, int8_t, int32_t>(args); } + [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; }, + [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_s8s32_dot_6x16, int8_t, int32_t>(args); } }, { GemmMethod::GEMM_INTERLEAVED, - "gemm_s8_12x8_1d", + "a64_gemm_s8_8x12", [](const GemmArgs &args) { return args._ci->has_dotprod(); }, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved<gemm_s8_12x8, int8_t, int32_t>(args); } -}, -{ - GemmMethod::GEMM_INTERLEAVED, - "gemm_s16_12x8", - nullptr, - [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53; }, - [](const GemmArgs &args) { return new GemmInterleaved<gemm_s16_12x8, int8_t, int32_t>(args); }, -}, -{ - GemmMethod::GEMM_INTERLEAVED_2D, - "gemm_s8_4x4_2d", - nullptr, - [](const GemmArgs &args) { return (args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8); }, - [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<gemm_s8_4x4, int8_t, int32_t>(args); } + [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s8_8x12, int8_t, int32_t>(args); } }, { GemmMethod::GEMM_INTERLEAVED, - "gemm_s8_4x4_1d", + "a64_gemm_s8_4x4", nullptr, nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<gemm_s8_4x4, int8_t, int32_t>(args); } + [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s8_4x4, int8_t, int32_t>(args); } }, { GemmMethod::DEFAULT, diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp index c4dceef922..92c1086a5f 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp @@ -27,11 +27,12 @@ #include <algorithm> #include "arm_gemm.hpp" -#include "utils.hpp" - +#include "convolver.hpp" #include "mergeresults.hpp" #include "performance_parameters.hpp" +#include "quantized.hpp" #include "transform.hpp" +#include "utils.hpp" #ifdef CYCLE_PROFILING #include "profiler.hpp" @@ -46,12 +47,212 @@ // // This implementation interleaves the source matrices in blocks - good for // larger matrices. + namespace arm_gemm { -template<typename strategy, typename To, typename Tr> +namespace { + +// Some kernels output to a linear buffer and require a separate merge step. +// Others output directly to the matrix result. This helper class calls the +// appropriate functions, using templating to avoid calling non-existent +// functions. +template<bool MergeStep, typename OutputStage> +class kernel_and_merge { +public: + template<typename strategy, typename To, typename Tr, typename Tri, typename Tab> + static void run( +#ifdef CYCLE_PROFILING + profiler &prof, +#endif + strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel, + Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, + unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr, + const Activation &act, bool accumulate, const OutputStage &os, const int32_t *col_bias, + Tab *acc_buff); +};
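The declaration above is the standard partial-specialization dispatch trick: the primary template is only declared, and exactly one specialization is instantiated for the template arguments actually selected, so strategies never need to provide functions for paths they do not use. A toy version of the same pattern (illustrative only, not part of this patch):

    #include <cstdio>

    template<bool MergeStep> struct runner;   // primary template: declared, never defined

    template<> struct runner<true> {
        static void run() { std::printf("run kernel, then separate merge\n"); }
    };

    template<> struct runner<false> {
        static void run() { std::printf("run kernel writing output directly\n"); }
    };

    template<bool MergeStep>
    void do_work() { runner<MergeStep>::run(); }  // only the chosen specialization must compile
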
+ +// Run a kernel and call the separate merge step +template<> +template<typename strategy, typename To, typename Tr, typename Tri, typename Tab> +void kernel_and_merge<true, Nothing>::run( +#ifdef CYCLE_PROFILING + profiler &prof, +#endif + strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel, + Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, + unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr, + const Activation &act, bool accumulate, const Nothing &, const int32_t *, Tab *) +{ + const int bblocks = iceildiv(n_max - n_0, strategy::out_width()); + + { +#ifdef CYCLE_PROFILING + auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k)); +#endif + + strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k); + } + + { +#ifdef CYCLE_PROFILING + auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr))); +#endif + strat.transforms.Merge(c_ptr, c_panel, ldc, m_0, m_max, n_0, n_max, biasptr, act, accumulate); + } +} + +// Run a kernel with integrated merge +template<> +template<typename strategy, typename To, typename Tr, typename Tri, typename Tab> +void kernel_and_merge<false, Nothing>::run( +#ifdef CYCLE_PROFILING + profiler &prof, +#endif + strategy &strat, const To *a_ptr, const To *b_panel, Tri *, + Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max, + unsigned int n_0, unsigned int n_max, const Tr *biasptr, + const Activation &act, bool accumulate, const Nothing &, const int32_t *, + Tab *acc_buff) +{ +#ifdef CYCLE_PROFILING + auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k); +#endif + + // We need to offset the C pointer, but as it might be NULL (requesting output to accumulation buffer) we need + // to be careful not to offset a null pointer. + Tri *offset_c_ptr; + + if (c_ptr == nullptr) { + offset_c_ptr = nullptr; + } else { + offset_c_ptr = c_ptr + m_0 * ldc + n_0; + } + + strat.kernel(// A and B pointers are just the packed panels. + a_ptr, b_panel, + // Provide relevant part of output array and row stride. + offset_c_ptr, ldc, + // M, N, K sizes + m_max-m_0, n_max - n_0, kern_k, + // Bias, activation, accumulation. Need to offset the bias as needed. + biasptr ? biasptr + n_0 : nullptr, act, accumulate, + // Accumulation buffer. + acc_buff ); +} + +// Run a kernel with integrated merge, quantizing +template<> +template<typename strategy, typename To, typename Tr, typename Tri, typename Tab> +void kernel_and_merge<false, Requantize32>::run( +#ifdef CYCLE_PROFILING + profiler &prof, +#endif + strategy &strat, const To *a_ptr, const To *b_panel, Tri *, + Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max, + unsigned int n_0, unsigned int n_max, const Tr *, + const Activation &, bool accumulate, const Requantize32 &qp, const int32_t *col_bias, + Tab *acc_buff) +{ +#ifdef CYCLE_PROFILING + auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k); +#endif + + strat.kernel(// A and B pointers are just the packed panels. + a_ptr, b_panel, + // Provide relevant part of output array and row stride. + c_ptr + m_0 * ldc + n_0, ldc, + // M, N, K sizes + m_max-m_0, n_max - n_0, kern_k, + // Bias, activation, accumulation. Need to offset the bias as needed. + col_bias + n_0, qp, n_0, accumulate, acc_buff); +} + +// Run a kernel and call the separate quantize step +template<> +template<typename strategy, typename To, typename Tr, typename Tri, typename Tab> +void kernel_and_merge<true, Requantize32>::run( +#ifdef CYCLE_PROFILING + profiler &prof, +#endif + strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel, + Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, + unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *, + const Activation &, bool, const Requantize32 &qp, const int32_t *col_bias, + Tab *) +{ + const int bblocks = iceildiv(n_max - n_0, strategy::out_width()); + + { +#ifdef CYCLE_PROFILING + auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k)); +#endif + + strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k); + } + + { +#ifdef CYCLE_PROFILING + auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr))); +#endif + // The interleaved kernel outputs in blocks - each block is a + // row-major matrix of size out_width * out_height. The merge + // kernels are designed to deal with this but the requantizer is + // not, so we need to requantize one block at a time. + for (int i=0; i<bblocks; i++) { + const unsigned int n_start = n_0 + (strategy::out_width() * i); + const unsigned int n_end = std::min(n_start + strategy::out_width(), n_max); + + // The row sums are stored at the end of the transposed A panel. + const int32_t *row_bias = reinterpret_cast<const int32_t *>(a_ptr + strategy::out_height() * kern_k); + + requantize_block_32(qp, (n_end - n_start), (m_max-m_0), + c_panel + (i * strategy::out_width() * strategy::out_height()), strategy::out_width(), + c_ptr + m_0 * ldc + n_start, ldc, + row_bias, col_bias + n_start, n_start); + } + } +} + +// Integer GEMMs can be used in two contexts - "normal" where the full 32-bit output is required, or in +// "requantizing" context where the output will be requantized. +// +// These require different input transforms, as if we are requantizing we want to sum the rows of the A input, and +// if we are not we don't. +// +// This helper class allows the appropriate transforms to be found, without requiring kernels that don't support +// quantization to define useless "quantized" transforms. +template<typename strategy, bool quantized> +class transform_type { +public: + typedef decltype(strategy::transforms) type; +}; + +template<typename strategy> +class transform_type<strategy, true> { +public: + typedef decltype(strategy::transforms_quantized) type; +};
+ +// We need a similar trick here to figure out what type the accumulator buffer should be. +template<typename strategy, typename OutputStage> +class accumulate_buffer_type { +public: + typedef typename strategy::result_type type; +}; + +template<typename strategy> +class accumulate_buffer_type<strategy, Requantize32> { +public: + typedef int32_t type; +}; + +} // anonymous namespace + +template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing, bool MergeStep=true, bool ForceThreadColumns=false> class GemmInterleaved : public GemmCommon<To, Tr> { typedef typename strategy::operand_type Toi; typedef typename strategy::result_type Tri; + typedef typename accumulate_buffer_type<strategy, OutputStage>::type Tab; /* const properties set by constructor */ const CPUInfo * const _ci; @@ -59,10 +260,15 @@ class GemmInterleaved : public GemmCommon<To, Tr> { const unsigned int _Msize; const unsigned int _Nsize; const unsigned int _Ksize; + const unsigned int _Ksections; + const unsigned int _Ktotal; + const unsigned int _rounded_Ksize; const unsigned int _nbatches; const unsigned int _nmulti; + const bool _thread_columns; + const Activation _act; const int _maxthreads; @@ -77,30 +283,59 @@ class GemmInterleaved : public GemmCommon<To, Tr> { const Toi *_B_transposed=nullptr; void *_working_space=nullptr; + Tab *_accumulation_buffer=nullptr; + + /* Output stage */ + OutputStage _os; + + /* Quantized support (in addition to 'output stage' above) */ + int32_t *col_bias = nullptr; + + /* Indirect parameters. _indirect_buf doubles as a flag to indicate that "indirect" transform should be used. */ + const To * const * const * _indirect_buf = nullptr; + + /* Convolver - only set up for convolution problems, so also doubles as a flag. */ + std::unique_ptr<convolver<To>> _convolver = nullptr; + + unsigned int get_col_sum_size() const { + if (std::is_same<OutputStage, Requantize32>::value) { + return _Nsize * _nmulti * sizeof(int32_t); + } else { + return 0; + } + } + /* We will need to walk through the blocks of B in a few contexts, so * factor that out. */ class blockwalker { private: /* Size loops, etc. based on our parent's configuration */ - const GemmInterleaved<strategy, To, Tr> &_parent; + const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &_parent; /* K, X and multi parameters for current iteration. */ unsigned int _k0=0, _x0=0, _multi=0; + /* Range of X to iterate over - used in "ForceThreadColumns" cases */ + unsigned int _x_start=0; + unsigned int _x_end=_parent._Nsize; + unsigned int _index=0; bool _done=false; bool _newkblock=true; bool _newmulti=true; public: - blockwalker(const GemmInterleaved<strategy, To, Tr> &parent) : _parent(parent) { } + blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &parent) : _parent(parent) { } + + blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &parent, + unsigned int x_start, unsigned int x_end) : _parent(parent), _x0 (_x_start), _x_start(x_start), _x_end(x_end) { } unsigned int xmax() { - return std::min(_x0 + _parent._x_block, _parent._Nsize); + return std::min(_x0 + _parent._x_block, _x_end); } unsigned int kmax() { - return std::min(_k0 + _parent._k_block, _parent._Ksize); + return std::min(_k0 + _parent._k_block, _parent._Ktotal); } /* Advance to the next block, return false at the end. */ @@ -111,10 +346,10 @@ class GemmInterleaved : public GemmCommon<To, Tr> { _newkblock=false; _x0 += _parent._x_block; - if (_x0 >= _parent._Nsize) { - _x0=0; + if (_x0 >= _x_end) { + _x0=_x_start; _k0 += _parent._k_block; - if (_k0 >= _parent._Ksize) { + if (_k0 >= _parent._Ktotal) { _k0=0; _multi++; if (_multi >= _parent._nmulti) { @@ -138,14 +373,125 @@ class GemmInterleaved : public GemmCommon<To, Tr> { bool newkblock(void) { return _newkblock; } };
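Flattened into plain loops, advance() above visits blocks in a fixed order: x0 advances fastest, then k0, then multi, with _newkblock firing whenever k0 or multi move. A sketch of the equivalent traversal (illustrative only; the free variables stand in for the members used above):

    // Equivalent iteration order to blockwalker::advance():
    for (unsigned int multi = 0; multi < nmulti; multi++) {
        for (unsigned int k0 = 0; k0 < Ktotal; k0 += k_block) {          // newkblock() is true on entry here
            for (unsigned int x0 = x_start; x0 < x_end; x0 += x_block) {
                visit(multi, k0, x0);                                     // one blockwalker position
            }
        }
    }
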
- // A working size: One of these needed, regardless of thread count. Divided according to window. + // "k block" has two distinct uses: figuring out which iterations of K + // to actually process, but also various size/pointer computations. The + // latter needs to take account of the extra space needed for the row + // sums, if appropriate. + unsigned int get_total_k_depth() const { + unsigned int k_depth = _k_block; + + if (std::is_same<OutputStage, Requantize32>::value) { + k_depth += sizeof(int32_t) / sizeof(Toi); + } + + return k_depth; + } + + // A working size. size_t get_a_working_size() const { - return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches); + if (_thread_columns) { + // For 2D threading: allocate a buffer of one block of rows per thread + return ROUND_UP(sizeof(Toi) * get_total_k_depth() * strategy::out_height() * _maxthreads); + } else { + // For 1D threaded: one of these needed, regardless of thread count. Divided according to window. + return ROUND_UP(sizeof(Toi) * get_total_k_depth() * _Mround * _nbatches); + } } - // C working size: One needed per thread. + // C working size: One needed per thread. Not needed if there is no merge step. size_t get_c_working_size() const { - return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height()); + if (MergeStep) { + return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height()); + } else { + return 0; + } + } + + // Accumulation buffer size + size_t get_accumulation_buffer_size() const { + // We only support an accumulation buffer for non-merge cases. + if (MergeStep) { + return 0; + } + + // Check if we are actually blocking + if (_k_block == _Ktotal) { + return 0; + } + + // We are no-merge, non-quantized with active blocking: accumulation buffer needed. + size_t size_per_buffer = sizeof(Tab) * strategy::out_height() * strategy::out_width(); + size_t num_buffers = iceildiv(_Msize, strategy::out_height()) * iceildiv(_Nsize, strategy::out_width()) * _nbatches * _nmulti; + + return num_buffers * size_per_buffer; + } + + // Get pointer into accumulation buffer + Tab *get_accumulation_buffer(unsigned int M, unsigned int N, unsigned int batch, unsigned int multi) const { + // Don't do anything if there's no buffer. + if (_accumulation_buffer == nullptr) { + return nullptr; + } + + // Here we are indexing an appropriately sized pointer, so no sizeof() needed to convert to bytes. + size_t size_per_buffer = strategy::out_height() * strategy::out_width(); + + size_t buffer_rows = iceildiv(_Msize, strategy::out_height()); + size_t buffer_cols = iceildiv(_Nsize, strategy::out_width()); + size_t buffers_per_batch = (buffer_rows * buffer_cols); + size_t buffers_per_multi = buffers_per_batch * _nbatches; + + // M/N must reference the top-left corner of a block. + size_t row = M / strategy::out_height(); + assert(M % strategy::out_height() == 0); + size_t col = N / strategy::out_width(); + assert(N % strategy::out_width() == 0); + + size_t buffer_index = multi * buffers_per_multi + batch * buffers_per_batch + row * buffer_cols + col; + + return _accumulation_buffer + (buffer_index * size_per_buffer); + } + + int32_t row_sum_multiplier() const { + if (std::is_same<OutputStage, Requantize32>::value) { + const Requantize32 *qp = reinterpret_cast<const Requantize32 *>(&_os); + + return -qp->b_offset; + } + + return 0; + } + + // Heuristics to decide whether to use the 'thread columns' regime + static bool is_thread_columns(const GemmArgs &args) { + // For now, there is a template parameter to force it. + if (ForceThreadColumns) { + return true; + } + + // Never do this for single threaded cases. + if (args._maxthreads == 1) { + return false; + }
+ // How many blocks of work are available for threading on M? + int m_blocks = iceildiv(args._Msize, strategy::out_height()) * args._nbatches; + + // Use thread columns if we just can't share the work across threads with the row threading regime. + if (args._maxthreads > m_blocks) { + return true; + } + + // If the row threading regime is too wasteful (20% threshold) + if (((roundup(m_blocks, args._maxthreads) * 100) / m_blocks) > 120) { + return true; + } + + return false; + }
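A worked example of this heuristic, with hypothetical numbers: Msize=40, out_height()=8 and nbatches=1 give m_blocks=5. With maxthreads=8 there are more threads than row blocks, so thread columns is used outright; with maxthreads=4, roundup(5,4)=8, and 8*100/5=160 > 120 (more than 20% of the scheduled row blocks would be idle padding), so thread columns is chosen there too. The same decision as a standalone sketch (names are placeholders):

    bool use_thread_columns(unsigned int m_blocks, unsigned int maxthreads) {
        if (maxthreads == 1) return false;
        if (maxthreads > m_blocks) return true;                           // rows can't feed every thread
        return ((roundup(m_blocks, maxthreads) * 100) / m_blocks) > 120;  // >20% waste threshold
    }
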
+ + static unsigned int get_ktotal(const GemmArgs &args) { + return args._Ksections * roundup(args._Ksize, strategy::k_unroll()); } static unsigned int get_k_block_size(const GemmArgs &args) { @@ -153,6 +499,11 @@ class GemmInterleaved : public GemmCommon<To, Tr> { return args._cfg->inner_block_size; } + // K blocking not supported if we are requantizing. + if (std::is_same<OutputStage, Requantize32>::value) { + return get_ktotal(args); + } + const unsigned int L1_size = args._ci->get_L1_cache_size(); unsigned int k_block; @@ -165,58 +516,84 @@ class GemmInterleaved : public GemmCommon<To, Tr> { k_block = std::max(k_block, 1U) * strategy::k_unroll(); // Now tune to presented problem size; this is how many blocks we need. - unsigned int num_k_blocks = iceildiv(args._Ksize, k_block); + unsigned int num_k_blocks = iceildiv(get_ktotal(args), k_block); // So divide the space equally into that many blocks. - k_block = iceildiv(args._Ksize, num_k_blocks); + k_block = iceildiv(get_ktotal(args), num_k_blocks); // And round UP to the K unroll level required. k_block = roundup(k_block, strategy::k_unroll()); + assert(k_block > 0); + return k_block; } -public: - GemmInterleaved(GemmInterleaved &) = delete; - GemmInterleaved & operator= (GemmInterleaved &) = delete; - - /* Constructor */ - GemmInterleaved(const GemmArgs &args) - : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), - _nbatches(args._nbatches), _nmulti(args._nmulti), - _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads), - _k_block(get_k_block_size(args)) { - const unsigned int L2_size = _ci->get_L2_cache_size(); - - assert(_maxthreads > 0); + static unsigned int get_x_block_size(const GemmArgs &args) { + if (is_thread_columns(args)) { + // In 2D mode, override X block, because we will process width first. + return roundup(args._Nsize, strategy::out_width()); + } - // Work out blocking parameters, or override from provided GemmConfig - // TODO: Move outer block into a static function too. if (args._cfg && args._cfg->outer_block_size) { - _x_block = args._cfg->outer_block_size; - } else { - // x_block: Work out how many rows (of length k_block) will fit in the L2 - // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents. - _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) / - (sizeof(Toi) * _k_block); + return roundup(args._cfg->outer_block_size, strategy::out_width()); + } - // Needs to be (at least a single) multiple of the kernel output width. - _x_block /= strategy::out_width(); - _x_block = std::max(_x_block, 1U) * strategy::out_width(); + unsigned int x_block; + const unsigned int L2_size = args._ci->get_L2_cache_size(); + const unsigned int k_block = get_k_block_size(args); - // And tune to the presented problem size. - unsigned int num_x_blocks = iceildiv(_Nsize, _x_block); - _x_block = iceildiv(_Nsize, num_x_blocks); + // x_block: Work out how many rows (of length k_block) will fit in the L2 + // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents. + const unsigned int scaled_l2_size = (L2_size * 9) / 10; + const unsigned int k_block_area = k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()); - _x_block = iceildiv(_x_block, strategy::out_width()); - _x_block *= strategy::out_width(); + // .. if the L1 contents is bigger than the L2, just return a minimal size block. + if (k_block_area > scaled_l2_size) { + return strategy::out_width(); } - // Work out the rounded size of M - needed for some buffers. - _Mround = iceildiv(_Msize, strategy::out_height()); - _Mround *= strategy::out_height(); + x_block = (scaled_l2_size - k_block_area) / (sizeof(Toi) * k_block); + + // Needs to be (at least a single) multiple of the kernel output width. + x_block /= strategy::out_width(); + x_block = std::max(x_block, 1u) * strategy::out_width(); + + // And tune to the presented problem size. + unsigned int num_x_blocks = iceildiv(args._Nsize, x_block); + x_block = iceildiv(args._Nsize, num_x_blocks); + + x_block = roundup(x_block, strategy::out_width()); + + assert(x_block > 0); + + return x_block; } +public: + GemmInterleaved(GemmInterleaved &) = delete; + GemmInterleaved & operator= (GemmInterleaved &) = delete; + + /* Constructor */ + GemmInterleaved(const GemmArgs &args, const OutputStage &os) + : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), + _Ksections(args._Ksections), _Ktotal(get_ktotal(args)), + _rounded_Ksize(roundup(_Ksize, strategy::k_unroll())), + _nbatches(args._nbatches), _nmulti(args._nmulti), _thread_columns(is_thread_columns(args)), + _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads), + _k_block(get_k_block_size(args)), _x_block(get_x_block_size(args)), _Mround(roundup(args._Msize, strategy::out_height())), + _os(os) { } + + /* Constructor without OutputStage */ + GemmInterleaved(const GemmArgs &args) + : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize), + _Ksections(args._Ksections), _Ktotal(get_ktotal(args)), + _rounded_Ksize(roundup(_Ksize, strategy::k_unroll())), + _nbatches(args._nbatches), _nmulti(args._nmulti), _thread_columns(is_thread_columns(args)), + _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads), + _k_block(get_k_block_size(args)), _x_block(get_x_block_size(args)), _Mround(roundup(args._Msize, strategy::out_height())), + _os() { } + // Interface implementation - Compulsory functions // Window size: Only the last thread should do a ragged block, so dole @@ -224,8 +601,14 @@ class GemmInterleaved : public GemmCommon<To, Tr> { // not multi for now (as this would cause problems with the buffer // manager). ndrange_t get_window_size() const override { - // _Mround is a multiple of out_height by definition. - return { (_Mround / strategy::out_height()) * _nbatches }; + unsigned int row_blocks = (_Mround / strategy::out_height()) * _nbatches; + + if (_thread_columns) { + return { row_blocks, iceildiv(_Nsize, strategy::out_width()) }; + } else { + // _Mround is a multiple of out_height by definition. + return { row_blocks }; + } } // set_nthreads: pass on to buffer manager to avoid it waiting for non-existent threads. @@ -235,117 +618,262 @@ // Execute void execute(const ndcoord_t &work_range, const ndcoord_t &, int threadid) override { - const auto start = work_range.get_position(0); - const auto end = work_range.get_position_end(0); #ifdef CYCLE_PROFILING profiler prof; #endif + + /* Make sure we've been set up correctly. */
+ assert(_B_transposed); + assert(_working_space); + int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space); + + /* Align if needed */ + intptr_t working_space_v = reinterpret_cast<intptr_t>(_working_space); + if (working_space_v & 0x3f) { + intptr_t alignment_offset = 0x40 - (working_space_v & 0x3f); + working_space_bytes += alignment_offset; + } + strategy strat(_ci); - blockwalker current(*this); + const auto start = work_range.get_position(0); + const auto end = work_range.get_position_end(0); /* Translate 'start' and 'end' into a position within the batches and rows. */ const unsigned int window_per_batch = _Mround / strategy::out_height(); unsigned int batch_0 = start / window_per_batch; unsigned int batch_end = end / window_per_batch; - /* Compute the M values to operate on */ - unsigned int m_0 = (start - (batch_0 * window_per_batch)) * strategy::out_height(); - unsigned int m_max = (end - (batch_end * window_per_batch)) * strategy::out_height(); + // In ThreadColumns mode, process work one horizontal strip at a time. + // Transpose the block of needed rows at the start, then do all the work on that block. + if (_thread_columns) { + const auto start_x = work_range.get_position(1) * strategy::out_width(); + const auto end_x = std::min(work_range.get_position_end(1) * strategy::out_width(), _Nsize); - /* Make sure we've been set up correctly. */ - assert(_B_transposed); - assert(_working_space); - int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space); + Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size())); + Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()) + + (threadid * sizeof(Toi) * get_total_k_depth() * strategy::out_height())); - // Private buffers. Treat working_space as an array of C buffers - // (one per thread) first, followed by the (window-divided) A - // buffer. - // Set a_panel to the base of the A buffers - compute offsets into it based on M/batches later. - Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size())); - Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size())); + for (unsigned int multi=0; multi<_nmulti; multi++) { + for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) { + unsigned int kmax=std::min(k0+_k_block, _Ktotal); - const Toi *b_panel; - b_panel = _B_transposed; + unsigned int rounded_width = roundup(_Nsize, strategy::out_width()); - //printf("Starting GEMM loop, x_block=%d, k_block=%d\n", _x_block, _k_block); + const bool first_pass = (k0==0); + const bool last_pass = (kmax==_Ktotal); - // newkblock() is always true on the first iteration, so this will be set properly on the first loop. - int kern_k = 0; + // Figure out how many "K" the kernel will actually process. + unsigned int kern_k = roundup(kmax - k0, strategy::k_unroll()); - for (;!current.done();current.advance()) { - if (current.newkblock()) { -#ifdef CYCLE_PROFILING - auto p=prof.ScopedProfiler(PROFILE_PREPA, (end - start) * strategy::out_height() * (current.kmax()-current.k0()) * sizeof(Toi)); -#endif - for (unsigned int batch = batch_0; batch <= batch_end; batch++) { - unsigned int first_m = (batch == batch_0) ? m_0 : 0;
- unsigned int last_m = (batch == batch_end) ? m_max : _Msize; + const Toi *b_ptr = _B_transposed + (rounded_width * _Ktotal * multi) + (k0 * rounded_width) + (start_x * kern_k); - if (first_m >= last_m) - continue; + unsigned int batch = batch_0; + unsigned int start_row = (start - (batch_0 * window_per_batch)) * strategy::out_height(); - strat.transforms.PrepareA(a_panel + ((batch * _Mround + first_m) * _k_block), - this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride), - this->_lda, first_m, last_m, current.k0(), current.kmax()); - } + for (unsigned int p=start; p<end; p++) { + unsigned int end_row = std::min(start_row + strategy::out_height(), _Msize); + + // Set up transposed 'A' block + { +#ifdef CYCLE_PROFILING + auto p = prof.ScopedProfiler(PROFILE_PREPA, strategy::out_height() * (kmax-k0) * sizeof(Toi)); +#endif + // See comment above on transform_type<> class: this extracts either 'transforms' or + // 'transforms_quantized' as appropriate. + typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms; + + if (_indirect_buf != nullptr) { + transforms.PrepareA_indirect(a_panel, + _indirect_buf + (multi * _nbatches * _Ksections) + (batch * _Ksections), _Ksize, + _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier()); + } else if (_convolver) { + transforms.PrepareA_convolution(a_panel, + this->_Aptr + (batch * this->_A_batch_stride) + (multi * this->_A_multi_stride), + this->_lda, *_convolver, _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier()); + } else { + transforms.PrepareA(a_panel, + this->_Aptr + (batch * this->_A_batch_stride) + (multi * this->_A_multi_stride), + this->_lda, start_row, end_row, k0, std::min(kmax, _Ksize), row_sum_multiplier()); + } + } + + // Perform the kernel and merge step, either separately or together as required. + kernel_and_merge<MergeStep, OutputStage>::run( + #ifdef CYCLE_PROFILING + prof, + #endif + // Strategy and panel pointers + strat, a_panel, b_ptr, c_panel, + // Result buffer pointers + this->_Cptr + (batch * this->_C_batch_stride) + (multi * this->_C_multi_stride), this->_ldc, + // K size, and M/N ranges + kern_k, start_row, end_row, start_x, end_x, + // Only do bias on the first pass + ((first_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) : nullptr), + // Only do activation on the last pass, and accumulation on any non-first pass. + (last_pass ? _act : Activation()), !first_pass, + // Pass in quantization parameters for requantizing kernels (others will ignore) + _os, col_bias + (multi * _Nsize), + // Accumulation buffer (not yet implemented on this path) + static_cast<Tab *>(nullptr)); + + /* Increment to the next block */ + start_row += strategy::out_height(); + if (start_row >= _Msize) { + start_row = 0; + batch++; + } + } + } } + } else { + blockwalker current(*this); + + /* Compute the M values to operate on */ + unsigned int m_0 = (start - (batch_0 * window_per_batch)) * strategy::out_height(); + unsigned int m_max = (end - (batch_end * window_per_batch)) * strategy::out_height(); + + // Private buffers. Treat working_space as an array of C buffers + // (one per thread) first, followed by the (window-divided) A + // buffer. + // Set a_panel to the base of the A buffers - compute offsets into it based on M/batches later. + Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size())); + Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size())); - int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width()); + const Toi *b_panel; + b_panel = _B_transposed; - /* Do the actual work. */ - for (unsigned int batch = batch_0; batch <= batch_end; batch++) { - unsigned int first_m = (batch == batch_0) ? m_0 : 0;
m_max : _Msize; + // newkblock() is always true on the first iteration, so these will be set properly on the first loop. - const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block; + // kern_k tracks the accumulation depth for the CURRENT K block a_panel_stride similarly tracks the total + // stride of the A panel (i.e. with 4 added for cases with embedded row sums) - if (first_m >= last_m) - continue; + // These are distinct from k_block and get_total_k_depth() which are based on the target K block size, and + // used for addressing inside a_panel. - for (unsigned int y=first_m; y class: this extracts either 'transforms' or + // 'transforms_quantized' as appropriate. + typename transform_type::value>::type transforms; + + for (unsigned int batch = batch_0; batch <= batch_end; batch++) { + unsigned int first_m = (batch == batch_0) ? m_0 : 0; + unsigned int last_m = (batch == batch_end) ? m_max : _Msize; + + if (first_m >= last_m) + continue; + + if (_indirect_buf != nullptr) { + transforms.PrepareA_indirect(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()), + _indirect_buf + (current.multi() * _nbatches * _Ksections) + (batch * _Ksections), _Ksize, + _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier()); + } else if (_convolver) { + transforms.PrepareA_convolution(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()), + this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride), + this->_lda, *_convolver, _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier()); + } else { + transforms.PrepareA(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()), + this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride), + this->_lda, first_m, last_m, current.k0(), std::min(_Ksize, current.kmax()), row_sum_multiplier()); + } + } + + // Figure out how many "K" the kernel will actually process. + kern_k = roundup(current.kmax() - current.k0(), strategy::k_unroll()); - strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k); + // Requantizing GEMMs have the row sums built in to the + // transposed data, so the stride between rows is 4 bytes + // larger than the (rounded) K value. - a_ptr += (strategy::out_height() * kern_k); + if(std::is_same::value) { + a_panel_stride = kern_k + (sizeof(int32_t) / sizeof(Toi)); + } else { + a_panel_stride = kern_k; } + } - { -#ifdef CYCLE_PROFILING - auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr))); -#endif - /* Only activate on last pass, only add bias on first pass, ask for accumulation on any non-first pass */ - const bool first_pass = current.k0()==0; - const bool last_pass = current.kmax()==_Ksize; - - strat.transforms.Merge(this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride), - c_panel, this->_ldc, y, ymax, current.x0(), current.xmax(), - ((first_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr), - (last_pass ? _act : Activation()), !first_pass); + /* Do the actual work. */ + for (unsigned int batch = batch_0; batch <= batch_end; batch++) { + unsigned int first_m = (batch == batch_0) ? m_0 : 0; + unsigned int last_m = (batch == batch_end) ? 
+ + const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * get_total_k_depth(); + + if (first_m >= last_m) + continue; + + // For the merge case we need to do this out_height() rows + // at a time, as that is the size of our intermediate + // buffer. If we are not doing that, we can do all the + // relevant rows in one go. + unsigned int m_step = MergeStep ? strategy::out_height() : (last_m - first_m); + + // But in the case where we have an accumulation buffer, we can't do that after all, unless + // there is no N blocking. + if (_accumulation_buffer && ((current.x0() != 0) || (current.xmax() < _Nsize))) { + m_step = strategy::out_height(); + } + + for (unsigned int y=first_m; y<last_m; y+=m_step) { + unsigned int ymax = std::min(_Msize, y + m_step); + + const bool first_pass = (current.k0() == 0); + const bool last_pass = (current.kmax() == _Ktotal); + + // Pointer to appropriate part of result array. + Tr *result_ptr = this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride); + + // If we are using an accumulation buffer, we don't pass the result buffer, to ask the kernel + // to write things into the accumulation buffer instead, except on the last pass. + if (_accumulation_buffer && !last_pass) { + result_ptr = nullptr; + } + + // Perform the kernel and merge step, either separately or together as required. + kernel_and_merge<MergeStep, OutputStage>::run( + #ifdef CYCLE_PROFILING + prof, + #endif + // Strategy and panel pointers + strat, a_ptr, b_panel, c_panel, + // Result buffer pointers + result_ptr, this->_ldc, + // K size, and M/N ranges + kern_k, y, ymax, current.x0(), current.xmax(), + // Only do bias on the first pass + ((first_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr), + // Only do activation on the last pass, and accumulation on any non-first pass. + (last_pass ? _act : Activation()), !first_pass, + // Pass in quantization parameters for requantizing kernels (others will ignore) + _os, col_bias + (current.multi() * _Nsize), + // Accumulation buffer + get_accumulation_buffer(y, current.x0(), batch, current.multi()) ); + + a_ptr += (strategy::out_height() * a_panel_stride); } } - } b_panel += (bblocks * strat.out_width() * kern_k); + b_panel += (roundup(current.xmax() - current.x0(), strategy::out_width()) * kern_k); + } } } // Interface implementation - working space size_t get_working_size() const override { - // In all cases, we need one A buffer plus a C buffer per thread. - size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads); + // In all cases, we need one A buffer plus a C buffer per thread, plus an accumulation buffer. + size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads) + get_accumulation_buffer_size(); - size += 64; // Add on a cache line extra for alignment. + size += 128; // Add on two cache lines extra for alignment. return size; } @@ -362,9 +890,22 @@ class GemmInterleaved : public GemmCommon<To, Tr> { } working_space_bytes += diff; + working_space_int += diff; // Pretransposed case: just set internal pointer to parameter value.
_working_space = reinterpret_cast<void *>(working_space_bytes); + + // Set up accumulation buffer + if (get_accumulation_buffer_size() > 0) { + intptr_t acc_buff_int = working_space_int + get_a_working_size() + (get_c_working_size() * _maxthreads); + // Make sure the accumulation buffer is aligned (needed if the other blocks are not a multiple of cache line length) + if (acc_buff_int & 0x3F) { + acc_buff_int += (0x40 - (acc_buff_int & 0x3F)); + } + _accumulation_buffer = reinterpret_cast<Tab *>(acc_buff_int); + } else { + _accumulation_buffer = nullptr; + } } // Interface implementation - pretransposed @@ -376,56 +917,105 @@ class GemmInterleaved : public GemmCommon<To, Tr> { return (_B_transposed==nullptr); } - // TODO: this could almost certainly be considerably simpler. size_t get_B_pretransposed_array_size() const override { - size_t total=0; - blockwalker current(*this); + unsigned int x_size = roundup(_Nsize, strategy::out_width()); - do { - /* Figure out the size of each block. */ - unsigned int x_size = (current.xmax() - current.x0()); - unsigned int k_size = (current.kmax() - current.k0()); + return (x_size * _Ktotal * _nmulti * sizeof(Toi)) + get_col_sum_size(); + } - /* Round sizes up as needed. */ - x_size = iceildiv(x_size, strategy::out_width()); - x_size *= strategy::out_width(); + void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override { + if (std::is_same<OutputStage, Requantize32>::value) { + col_bias = reinterpret_cast<int32_t *>(in_buffer); - k_size = iceildiv(k_size, strategy::k_unroll()); - k_size *= strategy::k_unroll(); + Requantize32 *qp_ptr = reinterpret_cast<Requantize32 *>(&_os); - total += x_size * k_size * sizeof(Toi); - } while (current.advance()); + for (unsigned int i=0; i<_nmulti; i++) { + // The input is assumed not to have any padding between sections, so straightforward Ksize * Ksections computation gets the total size. + compute_col_sums(*qp_ptr, _Nsize, _Ksize * _Ksections, B + (i * B_multi_stride), ldb, col_bias + (i * _Nsize), _Ksize * _Ksections, i, 0); + } + } - return total; - } + // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0 + uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer); + Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size()); + _B_transposed = buffer; - void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override { blockwalker current(*this); - Toi *buffer = reinterpret_cast<Toi *>(in_buffer); - _B_transposed = buffer; strategy strat(_ci); do { /* Figure out the size of each block. */ - unsigned int x_size = (current.xmax() - current.x0()); unsigned int k_size = (current.kmax() - current.k0()); - /* Round sizes up as needed. */ - x_size = iceildiv(x_size, strategy::out_width()); - x_size *= strategy::out_width(); + // We need to insert padding at the end of each K section. + // The computation needed is a little delicate - the coordinates from the block walker are expressed in + // terms of the full, padded, _Ktotal. + // But we need to transform each section with reference to the original, unpadded, input, letting the + // transform pad each section as needed. + + // This is needed for computations below. + const unsigned int rounded_section_size = roundup(_Ksize, strategy::k_unroll()); + + // The expected output format is also an entire <out_width> columns interleaved, then the next set of + // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at + // a time.
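A worked example of the section arithmetic that follows, with hypothetical sizes: _Ksize=6, k_unroll()=4 and _Ksections=3 give rounded_section_size=8 and _Ktotal=24. For a block starting at kpos=8 with kleft=8, the loop computes k_section_base=1 and k_offset=0, copies k_length=min(6-0, 8)=6 unpadded input rows starting at row 1*6+0=6, and then advances by padded_length=roundup(6,4)=8, landing exactly at the start of section 2. The same arithmetic as a standalone check (illustrative only):

    #include <algorithm>
    // kpos=8, Ksize=6, rounded section size rss=8:
    unsigned int kpos = 8, Ksize = 6, rss = 8, kleft = 8;
    unsigned int k_section_base = kpos / rss;                     // == 1
    unsigned int k_offset      = kpos - (k_section_base * rss);   // == 0
    unsigned int k_length      = std::min(Ksize - k_offset, kleft); // == 6
    // input rows read: [k_section_base*Ksize + k_offset, + k_length) == [6, 12)
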
+ for (unsigned int x0=current.x0(); x0 < current.xmax(); x0 += strategy::out_width() ) { + unsigned int xmax = std::min(x0 + strategy::out_width(), current.xmax()); + + // Track where we are and how much work is left. + unsigned int kpos = current.k0(); + unsigned int kleft = k_size; + + while (kleft) { + // Which section are we in? Based on the rounded-up section size. + unsigned int k_section_base = kpos / rounded_section_size; + // How far into the section are we? + unsigned int k_offset = kpos - (k_section_base * rounded_section_size); + + // We will either copy the rest of this section, or to the end of the requested length. + unsigned int k_length = std::min(_Ksize - k_offset, kleft); + + strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb, + x0, xmax, + (k_section_base * _Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length. + (k_section_base * _Ksize) + k_offset + k_length); // K end point - starting point plus length computed above. - k_size = iceildiv(k_size, strategy::k_unroll()); - k_size *= strategy::k_unroll(); + // We need to modify our position based on the ROUNDED version of what we just did. + unsigned int padded_length = roundup(k_length, strategy::k_unroll()); - strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb, - current.x0(), current.xmax(), current.k0(), current.kmax()); + buffer += strategy::out_width() * padded_length; - buffer += (x_size * k_size); + kpos += padded_length; + kleft -= padded_length; + } + } } while (current.advance()); } void set_pretransposed_B_data(void *in_buffer) override { - _B_transposed = reinterpret_cast<Toi *>(in_buffer); + // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0 + uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer); + _B_transposed = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size()); + col_bias = reinterpret_cast<int32_t *>(in_buffer); + } + + void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override { + if (std::is_same<OutputStage, Requantize32>::value) { + Requantize32 *qp = reinterpret_cast<Requantize32 *>(&_os); + + qp->bias = bias; + qp->bias_multi_stride = bias_multi_stride; + } + } + + void set_indirect_parameters(size_t string_len, const To * const * const *ptr) override { + assert(string_len == _Ksize); + _indirect_buf = ptr; + } + + void set_convolution_parameters(ConvolutionParameters parms) override { + assert(parms.input_channels == _Ksize); + _convolver = std::unique_ptr<convolver<To>>(new convolver<To>(parms)); } // Estimate cycles for given problem given provided parameters @@ -454,4 +1044,14 @@ class GemmInterleaved : public GemmCommon<To, Tr> { } }; +// Aliases for the variations +template<typename strategy, typename To, typename Tr> +using GemmInterleavedNoMerge = GemmInterleaved<strategy, To, Tr, Nothing, false>; + +template<typename strategy, typename To, typename Tr> +using GemmInterleavedPretransposedNoMergeQuantizedInline = GemmInterleaved<strategy, To, Tr, Requantize32, false>; + +template<typename strategy, typename To, typename Tr> +using GemmInterleavedQuantized = GemmInterleaved<strategy, To, Tr, Requantize32>; + } // namespace arm_gemm
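The three aliases just defined simply pin the trailing template flags for the common configurations; illustratively ('cls_example' is a placeholder strategy name, not a real kernel):

    // GemmInterleaved<cls_example, To, Tr>                    - separate merge step, no output stage (the defaults)
    // GemmInterleavedNoMerge<cls_example, To, Tr>             - kernel writes C directly; may use the accumulation buffer
    // GemmInterleavedQuantized<cls_example, int8_t, int8_t>   - fused requantization, as used by gemm_qint8.cpp below
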
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp index bdccd05326..b71f390ab9 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp @@ -250,7 +250,8 @@ class GemmInterleavedPretransposed2d : public GemmCommon<To, Tr> { first_m, last_m, current.k0(), - current.kmax()); + current.kmax(), + 0); } } diff --git a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp index 04cac6095c..05c5116bf3 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp @@ -25,68 +25,151 @@ #include "arm_gemm.hpp" -#include "kernels/a64_hybrid_s8s32_dot_16x4.hpp" -#include "kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp" -#include "kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp" -#include "kernels/sve_hybrid_s8s32_dot_4VLx4.hpp" -#include "kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp" +#include "kernels/a64_gemm_s16_8x12.hpp" +#include "kernels/a64_gemm_s8_4x4.hpp" +#include "kernels/a64_gemm_s8_8x12.hpp" +#include "kernels/a64_hybrid_s8qa_dot_4x16.hpp" +#include "kernels/a64_hybrid_s8qs_dot_6x16.hpp" +#include "kernels/a64_hybrid_s8s32_dot_6x16.hpp" +#include "kernels/a64_interleaved_s8s32_mmla_8x12.hpp" +#include "kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp" +#include "kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp" +#include "kernels/sve_hybrid_s8s32_dot_6x4VL.hpp" +#include "kernels/sve_hybrid_s8qa_dot_4x4VL.hpp" +#include "kernels/sve_hybrid_s8qs_dot_6x4VL.hpp" +#include "kernels/sve_interleaved_s8s32_dot_8x3VL.hpp" +#include "kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp" +#include "kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp" + +#include "gemm_hybrid_indirect.hpp" #include "gemm_hybrid_quantized.hpp" +#include "gemm_hybrid_quantized_inline.hpp" +#include "gemm_interleaved.hpp" #include "quantize_wrapper.hpp" +#include "utils.hpp" namespace arm_gemm { static const GemmImplementation<int8_t, int8_t, Requantize32> gemm_qint8_methods[] = { #ifdef __ARM_FEATURE_SVE #ifdef MMLA_INT8 { - GemmMethod::GEMM_HYBRID_QUANTIZED, - "smallK_hybrid_s8s32_dot_1VLx8", - [](const GemmArgs &args, const Requantize32 &) { return args._Ksize<=64; }, + GemmMethod::GEMM_INTERLEAVED, + "sve_interleaved_s8s32_mmla_8x3VL", + [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>8); }, nullptr, - [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<smallK_hybrid_s8s32_dot_1VLx8, int8_t, int8_t>(args, qp); } + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int8_t>(args, qp); } }, +#endif { GemmMethod::GEMM_HYBRID_QUANTIZED, - "hybrid_s8s32_dot_4VLx4", - [](const GemmArgs &args, const Requantize32 &) { return args._Ksize>=16; }, - [](const GemmArgs &args, const Requantize32 &) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, - [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<hybrid_s8s32_dot_4VLx4, int8_t, int8_t>(args, qp); } + "sve_smallK_hybrid_s8s32_dot_8x1VL", + [](const GemmArgs &args, const Requantize32 &) { return args._Ksize<=64 && !args._indirect_input; }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_sve_smallK_hybrid_s8s32_dot_8x1VL, int8_t, int8_t>(args, qp); } +}, +#ifdef SVE2 +{ + GemmMethod::GEMM_HYBRID, + "sve_hybrid_s8qs_dot_6x4VL", + [](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_symmetric(qp); }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8qs_dot_6x4VL, int8_t, int8_t, Requantize32>(args, qp); } +}, +{ + GemmMethod::GEMM_HYBRID, + "sve_hybrid_s8qa_dot_4x4VL", + [](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_asymmetric(qp); }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8qa_dot_4x4VL, int8_t, int8_t, Requantize32>(args, qp); } }, #endif { - GemmMethod::GEMM_HYBRID_QUANTIZED, - "smallK_hybrid_s8s32_dot_4x8", - [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32); }, + GemmMethod::GEMM_HYBRID, + "sve_hybrid_s8s32_dot_6x4VL", + nullptr,
nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8s32_dot_6x4VL, int8_t, int8_t, Requantize32, true>(args, qp); } +}, +{ + GemmMethod::GEMM_INTERLEAVED, + "sve_interleaved_s8s32_dot_8x3VL", + [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>4); }, nullptr, - [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<smallK_hybrid_s8s32_dot_4x8, int8_t, int8_t>(args, qp); } + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int8_t>(args, qp); } }, +#endif // SVE +#ifdef MMLA_INT8 +{ + GemmMethod::GEMM_INTERLEAVED, + "a64_interleaved_s8s32_mmla_8x12", + [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>8); }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, int8_t>(args, qp); } +}, +#endif { GemmMethod::GEMM_HYBRID_QUANTIZED, - "smallK_hybrid_s8s32_dot_4x6", - [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64); }, + "a64_smallK_hybrid_s8s32_dot_8x4", + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._indirect_input; }, nullptr, - [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<smallK_hybrid_s8s32_dot_4x6, int8_t, int8_t>(args, qp); } + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_a64_smallK_hybrid_s8s32_dot_8x4, int8_t, int8_t>(args, qp); } }, { GemmMethod::GEMM_HYBRID_QUANTIZED, - "hybrid_s8s32_dot_16x4", - [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && args._Ksize>=16; }, - [](const GemmArgs &args, const Requantize32 &) { return args._Nsize<=256 && args._Ksize>128; }, - [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<hybrid_s8s32_dot_16x4, int8_t, int8_t>(args, qp); } + "a64_smallK_hybrid_s8s32_dot_6x4", + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._indirect_input; }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_a64_smallK_hybrid_s8s32_dot_6x4, int8_t, int8_t>(args, qp); } }, -/** QUANTIZE_WRAPPER_2D enables 2D parallelisation hint for IScheduler in NEGEMMAssemblyDispatch */ { - GemmMethod::QUANTIZE_WRAPPER_2D, - "quantized_wrapper_2d", + GemmMethod::GEMM_INTERLEAVED, + "a64_gemm_s16_8x12", nullptr, - [](const GemmArgs &args, const Requantize32 &) { return (args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8);}, - [](const GemmArgs &args, const Requantize32 &qp) { return new QuantizeWrapper<int8_t, int8_t, int32_t>(args, qp); } + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() == CPUModel::A53; }, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_s16_8x12, int8_t, int8_t>(args, qp); } +}, +{ + GemmMethod::GEMM_HYBRID, + "a64_hybrid_s8qs_dot_6x16", + [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_dotprod() && quant_hybrid_symmetric(qp); }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8qs_dot_6x16, int8_t, int8_t, Requantize32>(args, qp); } +}, +{ + GemmMethod::GEMM_HYBRID, + "a64_hybrid_s8qa_dot_4x16", + [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_dotprod() && quant_hybrid_asymmetric(qp); }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8qa_dot_4x16, int8_t, int8_t, Requantize32>(args, qp); } +}, +{ + GemmMethod::GEMM_HYBRID, + "a64_hybrid_s8s32_dot_6x16", + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); }, + nullptr,
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8s32_dot_6x16, int8_t, int8_t, Requantize32, true>(args, qp); } +}, +{ + GemmMethod::GEMM_INTERLEAVED, + "a64_gemm_s8_8x12", + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_s8_8x12, int8_t, int8_t>(args, qp); } +}, +{ + GemmMethod::GEMM_INTERLEAVED, + "a64_gemm_s8_4x4", + nullptr, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_s8_4x4, int8_t, int8_t>(args, qp); } }, { GemmMethod::QUANTIZE_WRAPPER, "quantized_wrapper", - nullptr, + [](const GemmArgs &args, const Requantize32 &) { return !args._indirect_input; }, nullptr, [](const GemmArgs &args, const Requantize32 &qp) { return new QuantizeWrapper<int8_t, int8_t, int32_t>(args, qp); } },
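The symmetric/asymmetric split in these quantized tables follows from the standard requantization identity: for inputs with zero points a_off and b_off, sum_k (a[i,k]-a_off)(b[k,j]-b_off) = sum_k a[i,k]b[k,j] - b_off*rowsum_i(A) - a_off*colsum_j(B) + K*a_off*b_off. When b_off is zero the row-sum term vanishes, which is why row_sum_multiplier() earlier returns -b_offset and why column sums are always computed at pretranspose time. A sketch of the correction for one output element (illustrative only; plain int32 arithmetic, not a function from this library):

    // Correct a raw s8/u8 -> s32 dot product for zero points:
    int32_t requant_correct(int32_t raw_dot, int32_t row_sum, int32_t col_sum,
                            int32_t a_off, int32_t b_off, int32_t K) {
        return raw_dot - b_off * row_sum - a_off * col_sum + K * a_off * b_off;
    }
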
only instructions (SQRDMULH, SRSHL) +{ + GemmMethod::GEMM_HYBRID, + "sve_hybrid_u8qa_dot_4x4VL", + [](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_asymmetric(qp); }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8qa_dot_4x4VL, uint8_t, uint8_t, Requantize32>(args, qp); } }, #endif { - GemmMethod::GEMM_HYBRID_QUANTIZED, - "smallK_hybrid_u8u32_dot_4x8", - [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32); }, + GemmMethod::GEMM_HYBRID, + "sve_hybrid_u8u32_dot_6x4VL", nullptr, - [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<smallK_hybrid_u8u32_dot_4x8, uint8_t, uint8_t>(args, qp); } + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } +}, +{ + GemmMethod::GEMM_INTERLEAVED, + "sve_interleaved_u8u32_dot_8x3VL", + [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>4); }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_u8u32_dot_8x3VL, uint8_t, uint8_t>(args, qp); } }, +#endif +#ifdef MMLA_INT8 +{ + GemmMethod::GEMM_INTERLEAVED, + "a64_interleaved_u8u32_mmla_8x12", + [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>8); }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_interleaved_u8u32_mmla_8x12, uint8_t, uint8_t>(args, qp); } +}, +#endif { GemmMethod::GEMM_HYBRID_QUANTIZED, - "smallK_hybrid_u8u32_dot_4x6", - [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64); }, + "a64_smallK_hybrid_u8u32_dot_8x4", + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._indirect_input; }, nullptr, - [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<smallK_hybrid_u8u32_dot_4x6, uint8_t, uint8_t>(args, qp); } + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_a64_smallK_hybrid_u8u32_dot_8x4, uint8_t, uint8_t>(args, qp); } }, { GemmMethod::GEMM_HYBRID_QUANTIZED, - "hybrid_u8u32_dot_16x4", - [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && args._Ksize>=16; }, - [](const GemmArgs &args, const Requantize32 &) { return ((args._Nsize<=256) && (args._Ksize>128)) || (args._maxthreads >= 8); }, - [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<hybrid_u8u32_dot_16x4, uint8_t, uint8_t>(args, qp); } + "a64_smallK_hybrid_u8u32_dot_6x4", + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._indirect_input; }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_a64_smallK_hybrid_u8u32_dot_6x4, uint8_t, uint8_t>(args, qp); } }, -/** QUANTIZE_WRAPPER_2D enables 2D parallelisation hint for IScheduler in NEGEMMAssemblyDispatch */ { - GemmMethod::QUANTIZE_WRAPPER_2D, - "quantized_wrapper_2d", + GemmMethod::GEMM_INTERLEAVED, + "a64_gemm_u16_8x12", nullptr, - [](const GemmArgs &args, const Requantize32 &) { return (args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8);}, - [](const GemmArgs &args, const Requantize32 &qp) { return new QuantizeWrapper<uint8_t, uint8_t, uint32_t>(args, qp); } + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() == CPUModel::A53; }, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_u16_8x12, uint8_t, uint8_t>(args, qp); }, +}, +{ + GemmMethod::GEMM_HYBRID, + "a64_hybrid_u8qa_dot_4x16", + [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_dotprod() && quant_hybrid_asymmetric(qp); }, + [](const 
GemmArgs &args, const Requantize32 &) { return args._Nsize<=256 && args._Ksize>128; }, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8qa_dot_4x16, uint8_t, uint8_t, Requantize32>(args, qp); } +}, +{ + GemmMethod::GEMM_HYBRID, + "a64_hybrid_u8u32_dot_6x16", + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); }, + [](const GemmArgs &args, const Requantize32 &) { return args._Nsize<=256 && args._Ksize>128; }, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } +}, +{ + GemmMethod::GEMM_INTERLEAVED, + "a64_gemm_u8_8x12", + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); }, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_u8_8x12, uint8_t, uint8_t>(args, qp); } +}, +{ + GemmMethod::GEMM_INTERLEAVED, + "a64_gemm_u8_4x4", + nullptr, + nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_u8_4x4, uint8_t, uint8_t>(args, qp); } }, { GemmMethod::QUANTIZE_WRAPPER, "quantized_wrapper", - nullptr, + [](const GemmArgs &args, const Requantize32 &) { return !args._indirect_input; }, nullptr, [](const GemmArgs &args, const Requantize32 &qp) { return new QuantizeWrapper<uint8_t, uint8_t, uint32_t>(args, qp); } }, diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp index 5e06443e19..10a35e7a11 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp @@ -28,17 +28,17 @@ #include "gemm_implementation.hpp" #include "gemm_interleaved.hpp" -#include "kernels/a64_gemm_u16_12x8.hpp" +#include "kernels/a64_gemm_u16_8x12.hpp" namespace arm_gemm { static const GemmImplementation<uint16_t, uint32_t> gemm_u16_methods[] = { { GemmMethod::GEMM_INTERLEAVED, - "gemm_u16_12x8", + "a64_gemm_u16_8x12", nullptr, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved<gemm_u16_12x8, uint16_t, uint32_t>(args); } + [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u16_8x12, uint16_t, uint32_t>(args); } }, { GemmMethod::DEFAULT, diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp index 88726b1448..c300b8cdf9 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp @@ -29,18 +29,20 @@ #include "gemm_interleaved.hpp" #include "gemm_interleaved_pretransposed_2d.hpp" #include "gemm_hybrid.hpp" +#include "gemm_hybrid_indirect.hpp" -#include "kernels/a64_gemm_u16_12x8.hpp" -#include "kernels/a64_gemm_u8_12x8.hpp" +#include "kernels/a64_gemm_u16_8x12.hpp" #include "kernels/a64_gemm_u8_4x4.hpp" -#include "kernels/a64_hybrid_u8u32_dot_16x4.hpp" -#include "kernels/a64_interleaved_u8u32_mmla_12x8.hpp" -#include "kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp" -#include "kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp" -#include "kernels/sve_hybrid_u8u32_dot_4VLx4.hpp" -#include "kernels/sve_interleaved_u8u32_dot_3VLx8.hpp" -#include "kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp" -#include "kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp" +#include "kernels/a64_gemm_u8_8x12.hpp" +#include "kernels/a64_hybrid_u8u32_dot_6x16.hpp" +#include "kernels/a64_interleaved_u8u32_mmla_8x12.hpp" +#include "kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp" +#include "kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp" + +#include "kernels/sve_hybrid_u8u32_dot_6x4VL.hpp" +#include "kernels/sve_interleaved_u8u32_dot_8x3VL.hpp" +#include "kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp" +#include "kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp" namespace arm_gemm { @@ -49,98 +51,84 @@ static const GemmImplementation<uint8_t, uint32_t> 
gemm_u8_methods[] = { #ifdef MMLA_INT8 { GemmMethod::GEMM_INTERLEAVED, - "interleaved_u8u32_mmla_3VLx8", + "sve_interleaved_u8u32_mmla_8x3VL", [](const GemmArgs &args) { return (args._Ksize>8); }, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved<interleaved_u8u32_mmla_3VLx8, uint8_t, uint32_t>(args); } + [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_u8u32_mmla_8x3VL, uint8_t, uint32_t>(args); } }, #endif { GemmMethod::GEMM_HYBRID, - "smallK_hybrid_u8u32_dot_1VLx8", - [](const GemmArgs &args) { return args._Ksize<=64; }, + "smallK_hybrid_u8u32_dot_8x1VL", + [](const GemmArgs &args) { return args._Ksize<=64 && !args._indirect_input; }, nullptr, - [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_u8u32_dot_1VLx8, uint8_t, uint32_t>(args); } + [](const GemmArgs &args) { return new GemmHybrid<cls_sve_smallK_hybrid_u8u32_dot_8x1VL, uint8_t, uint32_t>(args); } }, { GemmMethod::GEMM_HYBRID, - "hybrid_u8u32_dot_4VLx4", - [](const GemmArgs &args) { return args._Ksize>=16; }, + "sve_hybrid_u8u32_dot_6x4VL", + nullptr, [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, - [](const GemmArgs &args) { return new GemmHybrid<hybrid_u8u32_dot_4VLx4, uint8_t, uint32_t>(args); } + [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_u8u32_dot_6x4VL, uint8_t, uint32_t>(args); } }, { GemmMethod::GEMM_INTERLEAVED, - "interleaved_u8u32_dot_3VLx8", + "sve_interleaved_u8u32_dot_8x3VL", [](const GemmArgs &args) { return (args._Ksize>4); }, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved<interleaved_u8u32_dot_3VLx8, uint8_t, uint32_t>(args); } + [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_u8u32_dot_8x3VL, uint8_t, uint32_t>(args); } }, #endif #ifdef MMLA_INT8 { GemmMethod::GEMM_INTERLEAVED, - "interleaved_u8u32_mmla_12x8", + "a64_interleaved_u8u32_mmla_8x12", [](const GemmArgs &args) { return (args._Ksize>8); }, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved<interleaved_u8u32_mmla_12x8, uint8_t, uint32_t>(args); } + [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_u8u32_mmla_8x12, uint8_t, uint32_t>(args); } }, #endif { GemmMethod::GEMM_HYBRID, - "smallK_hybrid_u8u32_dot_4x8", - [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32); }, + "a64_smallK_hybrid_u8u32_dot_8x4", + [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._indirect_input; }, nullptr, - [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_u8u32_dot_4x8, uint8_t, uint32_t>(args); } + [](const GemmArgs &args) { return new GemmHybrid<cls_a64_smallK_hybrid_u8u32_dot_8x4, uint8_t, uint32_t>(args); } }, { GemmMethod::GEMM_HYBRID, - "smallK_hybrid_u8u32_dot_4x6", - [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64); }, + "a64_smallK_hybrid_u8u32_dot_6x4", + [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._indirect_input; }, nullptr, - [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_u8u32_dot_4x6, uint8_t, uint32_t>(args); } + [](const GemmArgs &args) { return new GemmHybrid<cls_a64_smallK_hybrid_u8u32_dot_6x4, uint8_t, uint32_t>(args); } }, { GemmMethod::GEMM_INTERLEAVED, - "gemm_u16_12x8", + "a64_gemm_u16_8x12", nullptr, [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53; }, - [](const GemmArgs &args) { return new GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t>(args); }, + [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u16_8x12, uint8_t, uint32_t>(args); }, }, { GemmMethod::GEMM_HYBRID, - "hybrid_u8u32_dot_16x4", - [](const GemmArgs &args) { return args._ci->has_dotprod() && args._Ksize>=16; }, - [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; }, - [](const GemmArgs &args) { return new GemmHybrid<hybrid_u8u32_dot_16x4, uint8_t, uint32_t>(args); } -}, -{ - GemmMethod::GEMM_INTERLEAVED_2D, - "gemm_u8_12x8_2d", + "a64_hybrid_u8u32_dot_6x16", [](const GemmArgs &args) { return args._ci->has_dotprod(); }, - [](const GemmArgs &args) { return (args._maxthreads >= 8) 
&& (args._Msize >= 8) && (args._Nsize >= 8) ; }, - [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<gemm_u8_12x8, uint8_t, uint32_t>(args); } + [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; }, + [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_u8u32_dot_6x16, uint8_t, uint32_t>(args); } }, { GemmMethod::GEMM_INTERLEAVED, - "gemm_u8_12x8_1d", + "a64_gemm_u8_8x12", [](const GemmArgs &args) { return args._ci->has_dotprod(); }, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved<gemm_u8_12x8, uint8_t, uint32_t>(args); } -}, -{ - GemmMethod::GEMM_INTERLEAVED_2D, - "gemm_u8_4x4_2d", - nullptr, - [](const GemmArgs &args) { return (args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8); }, - [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<gemm_u8_4x4, uint8_t, uint32_t>(args); } + [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u8_8x12, uint8_t, uint32_t>(args); } }, { GemmMethod::GEMM_INTERLEAVED, - "gemm_u8_4x4_1d", + "a64_gemm_u8_4x4", nullptr, nullptr, - [](const GemmArgs &args) { return new GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t>(args); } + [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u8_4x4, uint8_t, uint32_t>(args); } }, { GemmMethod::DEFAULT, diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp index 47909cdaeb..9de44fcb73 100644 --- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp +++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp @@ -46,46 +46,39 @@ class GemvPretransposed : public GemmCommon<To, Tr> { typedef typename strategy::operand_type Toi; typedef typename strategy::result_type Tri; - const unsigned int _Nsize; - const unsigned int _Ksize; - - const unsigned int _nmultis; - - const Activation _act; - - const CPUInfo * const _ci; + const GemmArgs _args; const unsigned int _buffer_per_multi; - unsigned int m_block=0; + unsigned int k_block=0; unsigned int n_block=0; - const Toi *_A_pretransposed = nullptr; + const Toi *_B_pretransposed = nullptr; public: GemvPretransposed(GemvPretransposed &) = delete; GemvPretransposed & operator= (GemvPretransposed &) = delete; GemvPretransposed(const GemmArgs &args) - : _Nsize(args._Nsize), _Ksize(args._Ksize), _nmultis(args._nmulti), _act(args._act), _ci(args._ci), - _buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave()) * strategy::A_interleave()) { + : _args(args), + _buffer_per_multi(args._Ksize * roundup(args._Nsize, strategy::out_width())) { /* For now don't do any blocking. TODO: figure out if we should. */ - if (args._cfg && args._cfg->inner_block_size) { - m_block = args._cfg->inner_block_size; + if (strategy::supports_accumulate() && args._cfg && args._cfg->inner_block_size) { + k_block = args._cfg->inner_block_size; } else { - m_block = _Ksize; + k_block = args._Ksize; } if (args._cfg && args._cfg->outer_block_size) { n_block = args._cfg->outer_block_size; } else { - n_block = _Nsize; + n_block = args._Nsize; } } // Window is number of out_width blocks, times number of multis. ndrange_t get_window_size() const override { - return { iceildiv(_Nsize, strategy::out_width()) * _nmultis }; + return { iceildiv(_args._Nsize, strategy::out_width()) * _args._nmulti }; } // Actually execute the GEMV. @@ -93,13 +86,13 @@ class GemvPretransposed : public GemmCommon<To, Tr> { #ifdef CYCLE_PROFILING profiler prof; #endif - strategy strat(_ci); + strategy strat(_args._ci); const auto start = work_range.get_position(0); const auto end = work_range.get_position_end(0); /* Break the window values down into multis of interest... 
*/ - const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width()); + const unsigned int window_per_multi = iceildiv(_args._Nsize, strategy::out_width()); const unsigned int multi_0 = start / window_per_multi; const unsigned int multi_end = end / window_per_multi; @@ -111,36 +104,25 @@ class GemvPretransposed : public GemmCommon<To, Tr> { for (unsigned int multi=multi_0; multi<=multi_end; multi++) { const unsigned int n_start = (multi==multi_0) ? n_0 : 0; - const unsigned int n_end = (multi==multi_end) ? n_max : _Nsize; + const unsigned int n_end = (multi==multi_end) ? n_max : _args._Nsize; if (n_end <= n_start) continue; - for (unsigned int m0=0; m0<_Ksize; m0+=m_block) { - unsigned int mmax = std::min(m0 + m_block, _Ksize); + for (unsigned int k0=0; k0<_args._Ksize; k0+=k_block) { + unsigned int kmax = std::min(k0 + k_block, _args._Ksize); for (unsigned int n=n_start; n_Bptr below instead */ - strat.kernel(_A_pretransposed + (multi * _buffer_per_multi) + (n * _Ksize) + (m0 * strategy::A_interleave()), - (_Ksize * strategy::A_interleave()), - this->_Aptr + (multi * this->_A_multi_stride) + m0, + strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + k0, + _B_pretransposed + (multi * _buffer_per_multi) + (n * roundup(_args._Ksize, strategy::k_unroll())) + (k0 * strategy::out_width()), this->_Cptr + (multi * this->_C_multi_stride) + n, - static_cast<Tr>(0), (mmax-m0), (nmax-n)); - - // Handle activation separately for now - if (this->_bias) { - activator(this->_Cptr + (multi * this->_C_multi_stride) + n, 0, - this->_bias + (multi * this->_bias_multi_stride) + n, - _act, 1, (nmax-n)); - } else { - activator(this->_Cptr + (multi * this->_C_multi_stride) + n, 0, - static_cast<const Tr *>(nullptr), - _act, 1, (nmax-n)); - } + (nmax - n), (kmax-k0), + this->_bias ? this->_bias + (multi * this->_bias_multi_stride) + n : nullptr, + _args._act, (k0 != 0)); } } } @@ -152,33 +134,27 @@ class GemvPretransposed : public GemmCommon<To, Tr> { } bool B_pretranspose_required() const override { - /* Transpose is required if _A_pretransposed is still nullptr */ - return (_A_pretransposed == nullptr); + /* Transpose is required if _B_pretransposed is still nullptr */ + return (_B_pretransposed == nullptr); } size_t get_B_pretransposed_array_size() const override { - return _buffer_per_multi * _nmultis * sizeof(To); + return _buffer_per_multi * _args._nmulti * sizeof(To); } void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override { - Toi *A_buffer = reinterpret_cast<Toi *>(buffer); - - for (unsigned int multi=0; multi<_nmultis; multi++) { - /* Reverse sense here as we are dealing with B rather than A. So if - * strategy::A_transpose is false and _trB is false, we still - * transpose. 
*/ - if (strategy::A_transpose()) { - Transform(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize); - } else { - Transform(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize); - } + Toi *B_buffer = reinterpret_cast<Toi *>(buffer); + strategy strat(_args._ci); + + for (unsigned int multi=0; multi<_args._nmulti; multi++) { + strat.transforms.PrepareB(B_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _args._Nsize, 0, _args._Ksize); } - _A_pretransposed = A_buffer; + _B_pretransposed = B_buffer; } void set_pretransposed_B_data(void *buffer) override { - _A_pretransposed = reinterpret_cast<Toi *>(buffer); + _B_pretransposed = reinterpret_cast<Toi *>(buffer); } }; diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp new file mode 100644 index 0000000000..074299997d --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2017-2018 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once +#if (defined(__GNUC__) && (__GNUC__ >= 7)) +#pragma GCC diagnostic ignored "-Wimplicit-fallthrough" +#endif + +#ifdef __arm__ + +#include <arm_neon.h> + +#include "../asmlib.hpp" + +template<> +void interleave_block<6, 1, VLType::None, false>( + float * &outptr, const float * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + const float *inptr0 = in[0] + row_offset; + const float *inptr1 = in[1] + row_offset; + const float *inptr2 = in[2] + row_offset; + const float *inptr3 = in[3] + row_offset; + const float *inptr4 = in[4] + row_offset; + const float *inptr5 = in[5] + row_offset; + + // Cope with ragged cases by aliasing the first row (which is always valid). + // The nonsense output produced will be suppressed later anyway. 
+ switch (height) { + case 1: + inptr1 = inptr0; + // fall through + case 2: + inptr2 = inptr0; + // fall through + case 3: + inptr3 = inptr0; + // fall through + case 4: + inptr4 = inptr0; + // fall through + case 5: + inptr5 = inptr0; + // fall through + default: + case 6: + break; + } + + //prefetch_2x(inptr0); + //prefetch_2x(inptr1); + //prefetch_2x(inptr2); + //prefetch_2x(inptr3); + //prefetch_2x(inptr4); + //prefetch_2x(inptr5); + + for (;width>7;width-=8) { + __asm __volatile ( + // Load up 8 elements (2 vectors) from each of 8 sources. + "VLD1.32 {d0-d3}, [%[inptr0]]!\n" // q0=A0A1A2A3 + "VLD1.32 {d4-d7}, [%[inptr1]]!\n" // q2=B0B1B2B3 + "VLD1.32 {d8-d11}, [%[inptr2]]!\n" // q4=C0C1C2C3 + "VZIP.32 q0, q4\n" // q0=A0C0A1C1, q4 = A2C2A3C3 + "VLD1.32 {d12-d15}, [%[inptr3]]!\n" // q6=D0D1D2D3 + "VZIP.32 q2, q6\n" // q2=B0D0B1D1, q6 = B2D2B3D3 + "VLD1.32 {d16-d19}, [%[inptr4]]!\n" + "VLD1.32 {d20-d23}, [%[inptr5]]!\n" + "VZIP.32 q8, q10\n" // q8=E0F0E1F1, q10 = E2F2E3F3 + ASM_PREFETCH("[%[inptr0], #128]") + "VZIP.32 q0, q2\n" // q0 = A0B0C0D0, q2 = A1B1C1D1 + + // Store first elements + "VST1.32 {d0-d1}, [%[outptr]]!\n" + "VST1.32 {d16}, [%[outptr]]!\n" + + "VZIP.32 q4, q6\n" // q4 = A2B2C2D2, q6 = A3B3C3D3 + + // Store second elements + "VST1.32 {d4-d5}, [%[outptr]]!\n" + "VZIP.32 q1, q5\n" + ASM_PREFETCH("[%[inptr1], #128]") + "VST1.32 {d17}, [%[outptr]]!\n" + "VZIP.32 q3, q7\n" + + // Store third elements + "VZIP.32 q9, q11\n" + "VST1.32 {d8-d9}, [%[outptr]]!\n" + "VZIP.32 q1, q3\n" + ASM_PREFETCH("[%[inptr2], #128]") + "VST1.32 {d20}, [%[outptr]]!\n" + + // Store fourth elements + "VZIP.32 q5, q7\n" + "VST1.32 {d12-d13}, [%[outptr]]!\n" + ASM_PREFETCH("[%[inptr3], #128]") + "VST1.32 {d21}, [%[outptr]]!\n" + + // Fifth + "VST1.32 {d2-d3}, [%[outptr]]!\n" + ASM_PREFETCH("[%[inptr4], #128]") + "VST1.32 {d18}, [%[outptr]]!\n" + + // Sixth + "VST1.32 {d6-d7}, [%[outptr]]!\n" + ASM_PREFETCH("[%[inptr5], #128]") + "VST1.32 {d19}, [%[outptr]]!\n" + + // Seventh + "VST1.32 {d10-d11}, [%[outptr]]!\n" + "VST1.32 {d22}, [%[outptr]]!\n" + + // Eighth + "VST1.32 {d14-d15}, [%[outptr]]!\n" + "VST1.32 {d23}, [%[outptr]]!\n" + + : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), + [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [outptr] "+r" (outptr) + : + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "memory" + ); + } + + for (;width>0;width--) { + *outptr++ = *inptr0++; + *outptr++ = *inptr1++; + *outptr++ = *inptr2++; + *outptr++ = *inptr3++; + *outptr++ = *inptr4++; + *outptr++ = *inptr5++; + } +} + +#endif // __arm__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp new file mode 100644 index 0000000000..8054c2b96b --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifdef __aarch64__ + +template<> +void interleave_block<4, 16, VLType::None, false>( + int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + __asm__ __volatile__( + "ldr x22, [%x[in], #0x0]\n" + "cmp %x[height], #0x4\n" + "ldr x21, [%x[in], #0x8]\n" + "add x22, x22, %x[row_offset]\n" + "ldr x20, [%x[in], #0x10]\n" + "ldr x19, [%x[in], #0x18]\n" + "add x21, x21, %x[row_offset]\n" + "add x20, x20, %x[row_offset]\n" + "add x19, x19, %x[row_offset]\n" + "beq 1f\n" + "mov x19, x22\n" + "cmp %x[height], #0x2\n" + "csel x21, x21, x22, GE\n" + "csel x20, x20, x22, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x22, #0x0]\n" + "cmp %x[width], #0x10\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x19, #0x0]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "prfm pldl1keep, [x19, #0x40]\n" + "blt 3f\n" + "2:" // Main loop head + "ldr q19, [x22], #0x10\n" + "prfm pldl1keep, [x22, #0x70]\n" + "ldr q18, [x21], #0x10\n" + "ldr q17, [x20], #0x10\n" + "prfm pldl1keep, [x21, #0x70]\n" + "ldr q16, [x19], #0x10\n" + "prfm pldl1keep, [x20, #0x70]\n" + "str q19, [%x[out_ptr], #0x0]\n" + "str q18, [%x[out_ptr], #0x10]\n" + "prfm pldl1keep, [x19, #0x70]\n" + "str q17, [%x[out_ptr], #0x20]\n" + "str q16, [%x[out_ptr], #0x30]\n" + "subs %x[width], %x[width], #0x10\n" + "cmp %x[width], #0x10\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "bge 2b\n" + "3:" // Main loop skip + "cbz %x[width], 12f\n" + "tbz %x[width], #3, 7f\n" + "ldr d19, [x22], #0x8\n" + "ldr d18, [x21], #0x8\n" + "ldr d17, [x20], #0x8\n" + "ldr d16, [x19], #0x8\n" + "tbz %x[width], #2, 5f\n" + "ld1 { v19.s }[2], [x22], #0x4\n" + "ld1 { v18.s }[2], [x21], #0x4\n" + "ld1 { v17.s }[2], [x20], #0x4\n" + "ld1 { v16.s }[2], [x19], #0x4\n" + "tbz %x[width], #1, 4f\n" + "ld1 { v19.h }[6], [x22], #0x2\n" + "ld1 { v18.h }[6], [x21], #0x2\n" + "ld1 { v17.h }[6], [x20], #0x2\n" + "ld1 { v16.h }[6], [x19], #0x2\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v19.b }[14], [x22]\n" + "ld1 { v18.b }[14], [x21]\n" + "ld1 { v17.b }[14], [x20]\n" + "ld1 { v16.b }[14], [x19]\n" + "b 11f\n" + "4:" // odd_loads_1_12 + "tbz %x[width], #0, 11f\n" + "ld1 { v19.b }[12], [x22]\n" + "ld1 { v18.b }[12], [x21]\n" + "ld1 { v17.b }[12], [x20]\n" + "ld1 { v16.b }[12], [x19]\n" + "b 11f\n" + "5:" // odd_loads_2_8 + 
"tbz %x[width], #1, 6f\n" + "ld1 { v19.h }[4], [x22], #0x2\n" + "ld1 { v18.h }[4], [x21], #0x2\n" + "ld1 { v17.h }[4], [x20], #0x2\n" + "ld1 { v16.h }[4], [x19], #0x2\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v19.b }[10], [x22]\n" + "ld1 { v18.b }[10], [x21]\n" + "ld1 { v17.b }[10], [x20]\n" + "ld1 { v16.b }[10], [x19]\n" + "b 11f\n" + "6:" // odd_loads_1_8 + "tbz %x[width], #0, 11f\n" + "ld1 { v19.b }[8], [x22]\n" + "ld1 { v18.b }[8], [x21]\n" + "ld1 { v17.b }[8], [x20]\n" + "ld1 { v16.b }[8], [x19]\n" + "b 11f\n" + "7:" // odd_loads_4_0 + "tbz %x[width], #2, 9f\n" + "ldr s19, [x22], #0x4\n" + "ldr s18, [x21], #0x4\n" + "ldr s17, [x20], #0x4\n" + "ldr s16, [x19], #0x4\n" + "tbz %x[width], #1, 8f\n" + "ld1 { v19.h }[2], [x22], #0x2\n" + "ld1 { v18.h }[2], [x21], #0x2\n" + "ld1 { v17.h }[2], [x20], #0x2\n" + "ld1 { v16.h }[2], [x19], #0x2\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v19.b }[6], [x22]\n" + "ld1 { v18.b }[6], [x21]\n" + "ld1 { v17.b }[6], [x20]\n" + "ld1 { v16.b }[6], [x19]\n" + "b 11f\n" + "8:" // odd_loads_1_4 + "tbz %x[width], #0, 11f\n" + "ld1 { v19.b }[4], [x22]\n" + "ld1 { v18.b }[4], [x21]\n" + "ld1 { v17.b }[4], [x20]\n" + "ld1 { v16.b }[4], [x19]\n" + "b 11f\n" + "9:" // odd_loads_2_0 + "tbz %x[width], #1, 10f\n" + "ldr h19, [x22], #0x2\n" + "ldr h18, [x21], #0x2\n" + "ldr h17, [x20], #0x2\n" + "ldr h16, [x19], #0x2\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v19.b }[2], [x22]\n" + "ld1 { v18.b }[2], [x21]\n" + "ld1 { v17.b }[2], [x20]\n" + "ld1 { v16.b }[2], [x19]\n" + "b 11f\n" + "10:" // odd_loads_1_0 + "ldr b19, [x22, #0x0]\n" + "ldr b18, [x21, #0x0]\n" + "ldr b17, [x20, #0x0]\n" + "ldr b16, [x19, #0x0]\n" + "11:" // Odd load end + "str q19, [%x[out_ptr], #0x0]\n" + "str q18, [%x[out_ptr], #0x10]\n" + "str q17, [%x[out_ptr], #0x20]\n" + "str q16, [%x[out_ptr], #0x30]\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "12:" // Odds skip + + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "x19", "x20", "x21", "x22" + ); +} + +template<> +void interleave_block<4, 16, VLType::None, false>( + uint8_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + int8_t * &out_cast = reinterpret_cast(out_ptr); + const int8_t * const * in_cast = reinterpret_cast(in); + + interleave_block<4, 16, VLType::None, false>(out_cast, in_cast, width, height, row_offset, false); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp new file mode 100644 index 0000000000..1650916f9f --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifdef __aarch64__ + +template<> +void interleave_block<4, 16, VLType::None, true>( + int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, + size_t row_offset, bool first +) +{ + __asm__ __volatile__( + "movi v28.8h, #0x0\n" + "ldr x23, [%x[in], #0x0]\n" + "mov x22, #0x0\n" + "movi v27.8h, #0x0\n" + "ldr x21, [%x[in], #0x8]\n" + "cmp %x[height], #0x4\n" + "movi v26.8h, #0x0\n" + "ldr x20, [%x[in], #0x10]\n" + "add x23, x23, %x[row_offset]\n" + "movi v25.8h, #0x0\n" + "ldr x19, [%x[in], #0x18]\n" + "movi v24.4s, #0x0\n" + "add x21, x21, %x[row_offset]\n" + "movi v23.4s, #0x0\n" + "add x20, x20, %x[row_offset]\n" + "movi v22.4s, #0x0\n" + "add x19, x19, %x[row_offset]\n" + "movi v21.4s, #0x0\n" + "beq 1f\n" + "mov x19, x23\n" + "cmp %x[height], #0x2\n" + "csel x21, x21, x23, GE\n" + "csel x20, x20, x23, GT\n" + "1:" // no_pointer_adj + "movi v20.4s, #0x0\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x19, #0x0]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "prfm pldl1keep, [x19, #0x40]\n" + "cbnz %w[first], 2f\n" + "sub %x[out_ptr], %x[out_ptr], #0x10\n" + "ld1 { v20.4s }, [%x[out_ptr]]\n" + "2:" // first_pass + "cmp %x[width], #0x10\n" + "blt 5f\n" + "3:" // Main loop head + "cmp x22, #0x7e\n" + "ble 4f\n" + "sadalp v24.4s, v28.8h\n" + "movi v28.8h, #0x0\n" + "sadalp v23.4s, v27.8h\n" + "movi v27.8h, #0x0\n" + "sadalp v22.4s, v26.8h\n" + "movi v26.8h, #0x0\n" + "sadalp v21.4s, v25.8h\n" + "movi v25.8h, #0x0\n" + "mov x22, #0x0\n" + "4:" // no_accumulate_16 + "ldr q19, [x23], #0x10\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q18, [x21], #0x10\n" + "ldr q17, [x20], #0x10\n" + "prfm pldl1keep, [x21, #0x70]\n" + "ldr q16, [x19], #0x10\n" + "prfm pldl1keep, [x20, #0x70]\n" + "str q19, [%x[out_ptr], #0x0]\n" + "sadalp v28.8h, v19.16b\n" + "prfm pldl1keep, [x19, #0x70]\n" + "str q18, [%x[out_ptr], #0x10]\n" + "sadalp v27.8h, v18.16b\n" + "str q17, [%x[out_ptr], #0x20]\n" + "sadalp v26.8h, v17.16b\n" + "str q16, [%x[out_ptr], #0x30]\n" + "sadalp v25.8h, v16.16b\n" + "add x22, x22, #0x1\n" + "subs %x[width], %x[width], #0x10\n" + "cmp %x[width], #0x10\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "bge 3b\n" + "5:" // Main loop skip + "cbz %x[width], 14f\n" + "tbz %x[width], #3, 9f\n" + "ldr d19, [x23], #0x8\n" + "ldr 
d18, [x21], #0x8\n" + "ldr d17, [x20], #0x8\n" + "ldr d16, [x19], #0x8\n" + "tbz %x[width], #2, 7f\n" + "ld1 { v19.s }[2], [x23], #0x4\n" + "ld1 { v18.s }[2], [x21], #0x4\n" + "ld1 { v17.s }[2], [x20], #0x4\n" + "ld1 { v16.s }[2], [x19], #0x4\n" + "tbz %x[width], #1, 6f\n" + "ld1 { v19.h }[6], [x23], #0x2\n" + "ld1 { v18.h }[6], [x21], #0x2\n" + "ld1 { v17.h }[6], [x20], #0x2\n" + "ld1 { v16.h }[6], [x19], #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[14], [x23]\n" + "ld1 { v18.b }[14], [x21]\n" + "ld1 { v17.b }[14], [x20]\n" + "ld1 { v16.b }[14], [x19]\n" + "b 13f\n" + "6:" // odd_loads_1_12 + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[12], [x23]\n" + "ld1 { v18.b }[12], [x21]\n" + "ld1 { v17.b }[12], [x20]\n" + "ld1 { v16.b }[12], [x19]\n" + "b 13f\n" + "7:" // odd_loads_2_8 + "tbz %x[width], #1, 8f\n" + "ld1 { v19.h }[4], [x23], #0x2\n" + "ld1 { v18.h }[4], [x21], #0x2\n" + "ld1 { v17.h }[4], [x20], #0x2\n" + "ld1 { v16.h }[4], [x19], #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[10], [x23]\n" + "ld1 { v18.b }[10], [x21]\n" + "ld1 { v17.b }[10], [x20]\n" + "ld1 { v16.b }[10], [x19]\n" + "b 13f\n" + "8:" // odd_loads_1_8 + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[8], [x23]\n" + "ld1 { v18.b }[8], [x21]\n" + "ld1 { v17.b }[8], [x20]\n" + "ld1 { v16.b }[8], [x19]\n" + "b 13f\n" + "9:" // odd_loads_4_0 + "tbz %x[width], #2, 11f\n" + "ldr s19, [x23], #0x4\n" + "ldr s18, [x21], #0x4\n" + "ldr s17, [x20], #0x4\n" + "ldr s16, [x19], #0x4\n" + "tbz %x[width], #1, 10f\n" + "ld1 { v19.h }[2], [x23], #0x2\n" + "ld1 { v18.h }[2], [x21], #0x2\n" + "ld1 { v17.h }[2], [x20], #0x2\n" + "ld1 { v16.h }[2], [x19], #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[6], [x23]\n" + "ld1 { v18.b }[6], [x21]\n" + "ld1 { v17.b }[6], [x20]\n" + "ld1 { v16.b }[6], [x19]\n" + "b 13f\n" + "10:" // odd_loads_1_4 + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[4], [x23]\n" + "ld1 { v18.b }[4], [x21]\n" + "ld1 { v17.b }[4], [x20]\n" + "ld1 { v16.b }[4], [x19]\n" + "b 13f\n" + "11:" // odd_loads_2_0 + "tbz %x[width], #1, 12f\n" + "ldr h19, [x23], #0x2\n" + "ldr h18, [x21], #0x2\n" + "ldr h17, [x20], #0x2\n" + "ldr h16, [x19], #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[2], [x23]\n" + "ld1 { v18.b }[2], [x21]\n" + "ld1 { v17.b }[2], [x20]\n" + "ld1 { v16.b }[2], [x19]\n" + "b 13f\n" + "12:" // odd_loads_1_0 + "ldr b19, [x23, #0x0]\n" + "ldr b18, [x21, #0x0]\n" + "ldr b17, [x20, #0x0]\n" + "ldr b16, [x19, #0x0]\n" + "13:" // Odd load end + "str q19, [%x[out_ptr], #0x0]\n" + "sadalp v28.8h, v19.16b\n" + "str q18, [%x[out_ptr], #0x10]\n" + "sadalp v27.8h, v18.16b\n" + "str q17, [%x[out_ptr], #0x20]\n" + "sadalp v26.8h, v17.16b\n" + "str q16, [%x[out_ptr], #0x30]\n" + "sadalp v25.8h, v16.16b\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "14:" // Odds skip + "sadalp v24.4s, v28.8h\n" + "sadalp v23.4s, v27.8h\n" + "addp v24.4s, v24.4s, v23.4s\n" + "sadalp v22.4s, v26.8h\n" + "sadalp v21.4s, v25.8h\n" + "addp v23.4s, v22.4s, v21.4s\n" + "addp v24.4s, v24.4s, v23.4s\n" + "add v24.4s, v24.4s, v20.4s\n" + "str q24, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23" + ); +} + + +#endif // __aarch64__ diff --git 
a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp new file mode 100644 index 0000000000..af3efb25b2 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifdef __aarch64__ + +template<> +void interleave_block<4, 16, VLType::None, true>( + uint8_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height, + size_t row_offset, bool first +) +{ + __asm__ __volatile__( + "movi v28.8h, #0x0\n" + "ldr x23, [%x[in], #0x0]\n" + "mov x22, #0x0\n" + "movi v27.8h, #0x0\n" + "ldr x21, [%x[in], #0x8]\n" + "cmp %x[height], #0x4\n" + "movi v26.8h, #0x0\n" + "ldr x20, [%x[in], #0x10]\n" + "add x23, x23, %x[row_offset]\n" + "movi v25.8h, #0x0\n" + "ldr x19, [%x[in], #0x18]\n" + "movi v24.4s, #0x0\n" + "add x21, x21, %x[row_offset]\n" + "movi v23.4s, #0x0\n" + "add x20, x20, %x[row_offset]\n" + "movi v22.4s, #0x0\n" + "add x19, x19, %x[row_offset]\n" + "movi v21.4s, #0x0\n" + "beq 1f\n" + "mov x19, x23\n" + "cmp %x[height], #0x2\n" + "csel x21, x21, x23, GE\n" + "csel x20, x20, x23, GT\n" + "1:" // no_pointer_adj + "movi v20.4s, #0x0\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x19, #0x0]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "prfm pldl1keep, [x19, #0x40]\n" + "cbnz %w[first], 2f\n" + "sub %x[out_ptr], %x[out_ptr], #0x10\n" + "ld1 { v20.4s }, [%x[out_ptr]]\n" + "2:" // first_pass + "cmp %x[width], #0x10\n" + "blt 5f\n" + "3:" // Main loop head + "cmp x22, #0x7e\n" + "ble 4f\n" + "uadalp v24.4s, v28.8h\n" + "movi v28.8h, #0x0\n" + "uadalp v23.4s, v27.8h\n" + "movi v27.8h, #0x0\n" + "uadalp v22.4s, v26.8h\n" + "movi v26.8h, #0x0\n" + "uadalp v21.4s, v25.8h\n" + "movi v25.8h, #0x0\n" + "mov x22, #0x0\n" + "4:" // no_accumulate_16 + "ldr q19, [x23], #0x10\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q18, [x21], #0x10\n" + "ldr q17, [x20], #0x10\n" + "prfm pldl1keep, [x21, #0x70]\n" + "ldr q16, [x19], #0x10\n" + "prfm pldl1keep, [x20, #0x70]\n" + "str q19, [%x[out_ptr], #0x0]\n" + "uadalp v28.8h, v19.16b\n" + "prfm pldl1keep, [x19, #0x70]\n" + "str q18, [%x[out_ptr], #0x10]\n" 
+ "uadalp v27.8h, v18.16b\n" + "str q17, [%x[out_ptr], #0x20]\n" + "uadalp v26.8h, v17.16b\n" + "str q16, [%x[out_ptr], #0x30]\n" + "uadalp v25.8h, v16.16b\n" + "add x22, x22, #0x1\n" + "subs %x[width], %x[width], #0x10\n" + "cmp %x[width], #0x10\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "bge 3b\n" + "5:" // Main loop skip + "cbz %x[width], 14f\n" + "tbz %x[width], #3, 9f\n" + "ldr d19, [x23], #0x8\n" + "ldr d18, [x21], #0x8\n" + "ldr d17, [x20], #0x8\n" + "ldr d16, [x19], #0x8\n" + "tbz %x[width], #2, 7f\n" + "ld1 { v19.s }[2], [x23], #0x4\n" + "ld1 { v18.s }[2], [x21], #0x4\n" + "ld1 { v17.s }[2], [x20], #0x4\n" + "ld1 { v16.s }[2], [x19], #0x4\n" + "tbz %x[width], #1, 6f\n" + "ld1 { v19.h }[6], [x23], #0x2\n" + "ld1 { v18.h }[6], [x21], #0x2\n" + "ld1 { v17.h }[6], [x20], #0x2\n" + "ld1 { v16.h }[6], [x19], #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[14], [x23]\n" + "ld1 { v18.b }[14], [x21]\n" + "ld1 { v17.b }[14], [x20]\n" + "ld1 { v16.b }[14], [x19]\n" + "b 13f\n" + "6:" // odd_loads_1_12 + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[12], [x23]\n" + "ld1 { v18.b }[12], [x21]\n" + "ld1 { v17.b }[12], [x20]\n" + "ld1 { v16.b }[12], [x19]\n" + "b 13f\n" + "7:" // odd_loads_2_8 + "tbz %x[width], #1, 8f\n" + "ld1 { v19.h }[4], [x23], #0x2\n" + "ld1 { v18.h }[4], [x21], #0x2\n" + "ld1 { v17.h }[4], [x20], #0x2\n" + "ld1 { v16.h }[4], [x19], #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[10], [x23]\n" + "ld1 { v18.b }[10], [x21]\n" + "ld1 { v17.b }[10], [x20]\n" + "ld1 { v16.b }[10], [x19]\n" + "b 13f\n" + "8:" // odd_loads_1_8 + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[8], [x23]\n" + "ld1 { v18.b }[8], [x21]\n" + "ld1 { v17.b }[8], [x20]\n" + "ld1 { v16.b }[8], [x19]\n" + "b 13f\n" + "9:" // odd_loads_4_0 + "tbz %x[width], #2, 11f\n" + "ldr s19, [x23], #0x4\n" + "ldr s18, [x21], #0x4\n" + "ldr s17, [x20], #0x4\n" + "ldr s16, [x19], #0x4\n" + "tbz %x[width], #1, 10f\n" + "ld1 { v19.h }[2], [x23], #0x2\n" + "ld1 { v18.h }[2], [x21], #0x2\n" + "ld1 { v17.h }[2], [x20], #0x2\n" + "ld1 { v16.h }[2], [x19], #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[6], [x23]\n" + "ld1 { v18.b }[6], [x21]\n" + "ld1 { v17.b }[6], [x20]\n" + "ld1 { v16.b }[6], [x19]\n" + "b 13f\n" + "10:" // odd_loads_1_4 + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[4], [x23]\n" + "ld1 { v18.b }[4], [x21]\n" + "ld1 { v17.b }[4], [x20]\n" + "ld1 { v16.b }[4], [x19]\n" + "b 13f\n" + "11:" // odd_loads_2_0 + "tbz %x[width], #1, 12f\n" + "ldr h19, [x23], #0x2\n" + "ldr h18, [x21], #0x2\n" + "ldr h17, [x20], #0x2\n" + "ldr h16, [x19], #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v19.b }[2], [x23]\n" + "ld1 { v18.b }[2], [x21]\n" + "ld1 { v17.b }[2], [x20]\n" + "ld1 { v16.b }[2], [x19]\n" + "b 13f\n" + "12:" // odd_loads_1_0 + "ldr b19, [x23, #0x0]\n" + "ldr b18, [x21, #0x0]\n" + "ldr b17, [x20, #0x0]\n" + "ldr b16, [x19, #0x0]\n" + "13:" // Odd load end + "str q19, [%x[out_ptr], #0x0]\n" + "uadalp v28.8h, v19.16b\n" + "str q18, [%x[out_ptr], #0x10]\n" + "uadalp v27.8h, v18.16b\n" + "str q17, [%x[out_ptr], #0x20]\n" + "uadalp v26.8h, v17.16b\n" + "str q16, [%x[out_ptr], #0x30]\n" + "uadalp v25.8h, v16.16b\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "14:" // Odds skip + "uadalp v24.4s, v28.8h\n" + "uadalp v23.4s, v27.8h\n" + "addp v24.4s, v24.4s, v23.4s\n" + "uadalp v22.4s, v26.8h\n" + "uadalp v21.4s, v25.8h\n" + "addp v23.4s, v22.4s, v21.4s\n" + "addp v24.4s, v24.4s, v23.4s\n" + "add v24.4s, v24.4s, v20.4s\n" + "str q24, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + : 
[out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp new file mode 100644 index 0000000000..34d25f27b8 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifdef __aarch64__ + +template<> +void interleave_block<8, 1, VLType::None, false>( + float * &out_ptr, const bfloat16 * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + __asm__ __volatile__( + "movi v29.8h, #0x0\n" + "ldr x27, [%x[in], #0x0]\n" + "cmp %x[height], #0x8\n" + "ldr x26, [%x[in], #0x8]\n" + "add x27, x27, %x[row_offset], LSL #1\n" + "ldr x25, [%x[in], #0x10]\n" + "ldr x24, [%x[in], #0x18]\n" + "add x26, x26, %x[row_offset], LSL #1\n" + "ldr x23, [%x[in], #0x20]\n" + "add x25, x25, %x[row_offset], LSL #1\n" + "ldr x22, [%x[in], #0x28]\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset], LSL #1\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset], LSL #1\n" + "add x22, x22, %x[row_offset], LSL #1\n" + "add x21, x21, %x[row_offset], LSL #1\n" + "add x20, x20, %x[row_offset], LSL #1\n" + "beq 1f\n" + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x27, #0x0]\n" + "cmp %x[width], #0x4\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "blt 3f\n" + "2:" // Main loop head + "ldr d28, [x27], #0x8\n" + "zip1 v28.8h, v29.8h, v28.8h\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr d27, [x26], #0x8\n" + "zip1 v27.8h, v29.8h, v27.8h\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr d26, [x25], #0x8\n" + "zip1 v26.8h, v29.8h, v26.8h\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr d25, [x24], #0x8\n" + "zip1 v20.4s, v28.4s, v26.4s\n" + "prfm pldl1keep, [x24, #0x70]\n" + "zip1 v25.8h, v29.8h, v25.8h\n" + "ldr d24, [x23], #0x8\n" + "zip1 v19.4s, v27.4s, v25.4s\n" + "prfm pldl1keep, [x23, #0x70]\n" + "zip1 v24.8h, v29.8h, v24.8h\n" + "ldr d23, [x22], #0x8\n" + "zip1 v16.4s, v20.4s, v19.4s\n" + "prfm pldl1keep, [x22, #0x70]\n" + "zip1 v23.8h, v29.8h, v23.8h\n" + "ldr d22, [x21], #0x8\n" + "zip2 v19.4s, v20.4s, v19.4s\n" + "prfm pldl1keep, [x21, #0x70]\n" + "zip1 v22.8h, v29.8h, v22.8h\n" + "ldr d21, [x20], #0x8\n" + "zip1 v18.4s, v24.4s, v22.4s\n" + "prfm pldl1keep, [x20, #0x70]\n" + "zip1 v21.8h, v29.8h, v21.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v17.4s, v23.4s, v21.4s\n" + "subs %x[width], %x[width], #0x4\n" + "zip2 v20.4s, v28.4s, v26.4s\n" + "cmp %x[width], #0x4\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v16.4s, v18.4s, v17.4s\n" + "str q19, [%x[out_ptr], #0x20]\n" + "zip2 v19.4s, v27.4s, v25.4s\n" + "str q16, [%x[out_ptr], #0x30]\n" + "zip1 v16.4s, v20.4s, v19.4s\n" + "str q16, [%x[out_ptr], #0x40]\n" + "zip2 v18.4s, v24.4s, v22.4s\n" + "zip2 v17.4s, v23.4s, v21.4s\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v16.4s, v20.4s, v19.4s\n" + "str q16, [%x[out_ptr], #0x60]\n" + "zip2 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x70]\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "bge 2b\n" + "3:" // Main loop skip + "cbz 
%x[width], 6f\n" + "tbz %x[width], #1, 4f\n" + "ldr s28, [x27], #0x4\n" + "ldr s27, [x26], #0x4\n" + "ldr s26, [x25], #0x4\n" + "ldr s25, [x24], #0x4\n" + "ldr s24, [x23], #0x4\n" + "ldr s23, [x22], #0x4\n" + "ldr s22, [x21], #0x4\n" + "ldr s21, [x20], #0x4\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 5f\n" + "ld1 { v28.h }[2], [x27]\n" + "ld1 { v27.h }[2], [x26]\n" + "ld1 { v26.h }[2], [x25]\n" + "ld1 { v25.h }[2], [x24]\n" + "ld1 { v24.h }[2], [x23]\n" + "ld1 { v23.h }[2], [x22]\n" + "ld1 { v22.h }[2], [x21]\n" + "ld1 { v21.h }[2], [x20]\n" + "mov x19, #0x3\n" + "b 5f\n" + "4:" // odd_loads_1_0 + "ldr h28, [x27, #0x0]\n" + "ldr h27, [x26, #0x0]\n" + "ldr h26, [x25, #0x0]\n" + "ldr h25, [x24, #0x0]\n" + "ldr h24, [x23, #0x0]\n" + "ldr h23, [x22, #0x0]\n" + "ldr h22, [x21, #0x0]\n" + "ldr h21, [x20, #0x0]\n" + "mov x19, #0x1\n" + "5:" // Odd load end + "zip1 v28.8h, v29.8h, v28.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v27.8h, v29.8h, v27.8h\n" + "zip1 v26.8h, v29.8h, v26.8h\n" + "zip1 v25.8h, v29.8h, v25.8h\n" + "zip1 v24.8h, v29.8h, v24.8h\n" + "zip1 v23.8h, v29.8h, v23.8h\n" + "zip1 v22.8h, v29.8h, v22.8h\n" + "zip1 v21.8h, v29.8h, v21.8h\n" + "zip1 v20.4s, v28.4s, v26.4s\n" + "zip1 v19.4s, v27.4s, v25.4s\n" + "zip1 v16.4s, v20.4s, v19.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v18.4s, v24.4s, v22.4s\n" + "zip1 v17.4s, v23.4s, v21.4s\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 6f\n" + "zip2 v19.4s, v20.4s, v19.4s\n" + "zip2 v16.4s, v18.4s, v17.4s\n" + "str q19, [%x[out_ptr], #0x0]\n" + "str q16, [%x[out_ptr], #0x10]\n" + "subs x19, x19, #0x1\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 6f\n" + "zip2 v20.4s, v28.4s, v26.4s\n" + "zip2 v19.4s, v27.4s, v25.4s\n" + "zip1 v16.4s, v20.4s, v19.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v18.4s, v24.4s, v22.4s\n" + "zip2 v17.4s, v23.4s, v21.4s\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "6:" // Odds skip + + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp new file mode 100644 index 0000000000..d547957129 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifdef __aarch64__ + +template<> +void interleave_block<8, 1, VLType::None, false>( + __fp16 * &out_ptr, const __fp16 * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + __asm__ __volatile__( + "ldr x27, [%x[in], #0x0]\n" + "cmp %x[height], #0x8\n" + "ldr x26, [%x[in], #0x8]\n" + "add x27, x27, %x[row_offset], LSL #1\n" + "ldr x25, [%x[in], #0x10]\n" + "ldr x24, [%x[in], #0x18]\n" + "add x26, x26, %x[row_offset], LSL #1\n" + "ldr x23, [%x[in], #0x20]\n" + "add x25, x25, %x[row_offset], LSL #1\n" + "ldr x22, [%x[in], #0x28]\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset], LSL #1\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset], LSL #1\n" + "add x22, x22, %x[row_offset], LSL #1\n" + "add x21, x21, %x[row_offset], LSL #1\n" + "add x20, x20, %x[row_offset], LSL #1\n" + "beq 1f\n" + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x27, #0x0]\n" + "cmp %x[width], #0x8\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "blt 3f\n" + "2:" // Main loop head + "ldr q30, [x27], #0x10\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr q29, [x26], #0x10\n" + "ldr q28, [x25], #0x10\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr q27, [x24], #0x10\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr q24, [x23], #0x10\n" + "zip1 v26.8h, v30.8h, v24.8h\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr q25, [x22], #0x10\n" + "zip2 v24.8h, v30.8h, v24.8h\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q23, [x21], #0x10\n" + "zip1 v21.8h, v29.8h, v25.8h\n" + "prfm pldl1keep, [x22, #0x70]\n" + "ldr q22, [x20], #0x10\n" + "zip1 v18.8h, v28.8h, v23.8h\n" + "prfm pldl1keep, [x21, #0x70]\n" + "subs %x[width], %x[width], #0x8\n" + "zip1 v20.8h, v26.8h, v18.8h\n" + "prfm pldl1keep, [x20, #0x70]\n" + "zip1 v19.8h, v27.8h, v22.8h\n" + "cmp %x[width], #0x8\n" + "zip1 v17.8h, v21.8h, v19.8h\n" + "zip2 v18.8h, v26.8h, v18.8h\n" + "zip1 v16.8h, v20.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v16.8h, v20.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x20]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x30]\n" + "zip2 v21.8h, v28.8h, v23.8h\n" + "zip1 v18.8h, v24.8h, v21.8h\n" + "zip2 v20.8h, v29.8h, v25.8h\n" + "zip2 v19.8h, v27.8h, v22.8h\n" + 
"zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x40]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v18.8h, v24.8h, v21.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x60]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x70]\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "bge 2b\n" + "3:" // Main loop skip + "cbz %x[width], 8f\n" + "tbz %x[width], #2, 5f\n" + "ldr d30, [x27], #0x8\n" + "ldr d29, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "ldr d27, [x24], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d25, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "ldr d22, [x20], #0x8\n" + "tbz %x[width], #1, 4f\n" + "ld1 { v30.s }[2], [x27], #0x4\n" + "ld1 { v29.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "ld1 { v27.s }[2], [x24], #0x4\n" + "ld1 { v24.s }[2], [x23], #0x4\n" + "ld1 { v25.s }[2], [x22], #0x4\n" + "ld1 { v23.s }[2], [x21], #0x4\n" + "ld1 { v22.s }[2], [x20], #0x4\n" + "mov x19, #0x6\n" + "tbz %x[width], #0, 7f\n" + "ld1 { v30.h }[6], [x27]\n" + "ld1 { v29.h }[6], [x26]\n" + "ld1 { v28.h }[6], [x25]\n" + "ld1 { v27.h }[6], [x24]\n" + "ld1 { v24.h }[6], [x23]\n" + "ld1 { v25.h }[6], [x22]\n" + "ld1 { v23.h }[6], [x21]\n" + "ld1 { v22.h }[6], [x20]\n" + "mov x19, #0x7\n" + "b 7f\n" + "4:" // odd_loads_1_4 + "mov x19, #0x4\n" + "tbz %x[width], #0, 7f\n" + "ld1 { v30.h }[4], [x27]\n" + "ld1 { v29.h }[4], [x26]\n" + "ld1 { v28.h }[4], [x25]\n" + "ld1 { v27.h }[4], [x24]\n" + "ld1 { v24.h }[4], [x23]\n" + "ld1 { v25.h }[4], [x22]\n" + "ld1 { v23.h }[4], [x21]\n" + "ld1 { v22.h }[4], [x20]\n" + "mov x19, #0x5\n" + "b 7f\n" + "5:" // odd_loads_2_0 + "tbz %x[width], #1, 6f\n" + "ldr s30, [x27], #0x4\n" + "ldr s29, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s27, [x24], #0x4\n" + "ldr s24, [x23], #0x4\n" + "ldr s25, [x22], #0x4\n" + "ldr s23, [x21], #0x4\n" + "ldr s22, [x20], #0x4\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 7f\n" + "ld1 { v30.h }[2], [x27]\n" + "ld1 { v29.h }[2], [x26]\n" + "ld1 { v28.h }[2], [x25]\n" + "ld1 { v27.h }[2], [x24]\n" + "ld1 { v24.h }[2], [x23]\n" + "ld1 { v25.h }[2], [x22]\n" + "ld1 { v23.h }[2], [x21]\n" + "ld1 { v22.h }[2], [x20]\n" + "mov x19, #0x3\n" + "b 7f\n" + "6:" // odd_loads_1_0 + "ldr h30, [x27, #0x0]\n" + "ldr h29, [x26, #0x0]\n" + "ldr h28, [x25, #0x0]\n" + "ldr h27, [x24, #0x0]\n" + "ldr h24, [x23, #0x0]\n" + "ldr h25, [x22, #0x0]\n" + "ldr h23, [x21, #0x0]\n" + "ldr h22, [x20, #0x0]\n" + "mov x19, #0x1\n" + "7:" // Odd load end + "zip1 v26.8h, v30.8h, v24.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v18.8h, v28.8h, v23.8h\n" + "zip1 v20.8h, v26.8h, v18.8h\n" + "zip1 v21.8h, v29.8h, v25.8h\n" + "zip1 v19.8h, v27.8h, v22.8h\n" + "zip1 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v20.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v16.8h, v20.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v18.8h, v26.8h, v18.8h\n" + "zip2 v17.8h, v21.8h, v19.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v24.8h, v30.8h, v24.8h\n" + "zip2 v21.8h, v28.8h, v23.8h\n" + "subs x19, x19, #0x1\n" + "zip1 
v18.8h, v24.8h, v21.8h\n" + "zip2 v20.8h, v29.8h, v25.8h\n" + "zip2 v19.8h, v27.8h, v22.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v18.8h, v24.8h, v21.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "8:" // Odds skip + + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp new file mode 100644 index 0000000000..b45e622a47 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifdef __aarch64__ + +template<> +void interleave_block<8, 1, VLType::None, false>( + float * &out_ptr, const __fp16 * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + __asm__ __volatile__( + "ldr x27, [%x[in], #0x0]\n" + "cmp %x[height], #0x8\n" + "ldr x26, [%x[in], #0x8]\n" + "add x27, x27, %x[row_offset], LSL #1\n" + "ldr x25, [%x[in], #0x10]\n" + "ldr x24, [%x[in], #0x18]\n" + "add x26, x26, %x[row_offset], LSL #1\n" + "ldr x23, [%x[in], #0x20]\n" + "add x25, x25, %x[row_offset], LSL #1\n" + "ldr x22, [%x[in], #0x28]\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset], LSL #1\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset], LSL #1\n" + "add x22, x22, %x[row_offset], LSL #1\n" + "add x21, x21, %x[row_offset], LSL #1\n" + "add x20, x20, %x[row_offset], LSL #1\n" + "beq 1f\n" + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x27, #0x0]\n" + "cmp %x[width], #0x4\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "blt 3f\n" + "2:" // Main loop head + "ldr d29, [x27], #0x8\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr d28, [x26], #0x8\n" + "ldr d27, [x25], #0x8\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr d26, [x24], #0x8\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr d25, [x23], #0x8\n" + "ldr d24, [x22], #0x8\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr d23, [x21], #0x8\n" + "ldr d22, [x20], #0x8\n" + "prfm pldl1keep, [x23, #0x70]\n" + "prfm pldl1keep, [x22, #0x70]\n" + "fcvtl v29.4s, v29.4h\n" + "fcvtl v28.4s, v28.4h\n" + "prfm pldl1keep, [x21, #0x70]\n" + "fcvtl v27.4s, v27.4h\n" + "zip1 v20.4s, v29.4s, v27.4s\n" + "prfm pldl1keep, [x20, #0x70]\n" + "fcvtl v26.4s, v26.4h\n" + "zip2 v18.4s, v29.4s, v27.4s\n" + "fcvtl v25.4s, v25.4h\n" + "fcvtl v24.4s, v24.4h\n" + "zip1 v19.4s, v28.4s, v26.4s\n" + "fcvtl v23.4s, v23.4h\n" + "zip2 v17.4s, v28.4s, v26.4s\n" + "fcvtl v22.4s, v22.4h\n" + "zip1 v16.4s, v20.4s, v19.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v21.4s, v20.4s, v19.4s\n" + "subs %x[width], %x[width], #0x4\n" + "zip1 v20.4s, v18.4s, v17.4s\n" + "cmp %x[width], #0x4\n" + "zip2 v19.4s, v18.4s, v17.4s\n" + "zip1 v18.4s, v25.4s, v23.4s\n" + "zip1 v17.4s, v24.4s, v22.4s\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v16.4s, v18.4s, v17.4s\n" + "str q21, [%x[out_ptr], #0x20]\n" + "zip2 v18.4s, v25.4s, v23.4s\n" + "str q16, [%x[out_ptr], #0x30]\n" + "zip2 v17.4s, v24.4s, v22.4s\n" + "str q20, [%x[out_ptr], #0x40]\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v16.4s, v18.4s, v17.4s\n" + "str q19, [%x[out_ptr], #0x60]\n" + "str q16, [%x[out_ptr], #0x70]\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "bge 2b\n" + "3:" // Main loop skip + "cbz %x[width], 6f\n" + "tbz %x[width], #1, 4f\n" + "ldr s29, [x27], #0x4\n" + "ldr s28, [x26], 
#0x4\n" + "ldr s27, [x25], #0x4\n" + "ldr s26, [x24], #0x4\n" + "ldr s25, [x23], #0x4\n" + "ldr s24, [x22], #0x4\n" + "ldr s23, [x21], #0x4\n" + "ldr s22, [x20], #0x4\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 5f\n" + "ld1 { v29.h }[2], [x27]\n" + "ld1 { v28.h }[2], [x26]\n" + "ld1 { v27.h }[2], [x25]\n" + "ld1 { v26.h }[2], [x24]\n" + "ld1 { v25.h }[2], [x23]\n" + "ld1 { v24.h }[2], [x22]\n" + "ld1 { v23.h }[2], [x21]\n" + "ld1 { v22.h }[2], [x20]\n" + "mov x19, #0x3\n" + "b 5f\n" + "4:" // odd_loads_1_0 + "ldr h29, [x27, #0x0]\n" + "ldr h28, [x26, #0x0]\n" + "ldr h27, [x25, #0x0]\n" + "ldr h26, [x24, #0x0]\n" + "ldr h25, [x23, #0x0]\n" + "ldr h24, [x22, #0x0]\n" + "ldr h23, [x21, #0x0]\n" + "ldr h22, [x20, #0x0]\n" + "mov x19, #0x1\n" + "5:" // Odd load end + "fcvtl v29.4s, v29.4h\n" + "fcvtl v28.4s, v28.4h\n" + "fcvtl v27.4s, v27.4h\n" + "zip1 v20.4s, v29.4s, v27.4s\n" + "fcvtl v26.4s, v26.4h\n" + "fcvtl v25.4s, v25.4h\n" + "zip1 v19.4s, v28.4s, v26.4s\n" + "fcvtl v24.4s, v24.4h\n" + "fcvtl v23.4s, v23.4h\n" + "zip1 v16.4s, v20.4s, v19.4s\n" + "fcvtl v22.4s, v22.4h\n" + "zip1 v18.4s, v25.4s, v23.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "subs x19, x19, #0x1\n" + "zip1 v17.4s, v24.4s, v22.4s\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 6f\n" + "zip2 v21.4s, v20.4s, v19.4s\n" + "zip2 v16.4s, v18.4s, v17.4s\n" + "str q21, [%x[out_ptr], #0x0]\n" + "str q16, [%x[out_ptr], #0x10]\n" + "subs x19, x19, #0x1\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 6f\n" + "zip2 v18.4s, v29.4s, v27.4s\n" + "zip2 v17.4s, v28.4s, v26.4s\n" + "zip1 v20.4s, v18.4s, v17.4s\n" + "str q20, [%x[out_ptr], #0x0]\n" + "zip2 v18.4s, v25.4s, v23.4s\n" + "zip2 v17.4s, v24.4s, v22.4s\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "6:" // Odds skip + + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp new file mode 100644 index 0000000000..3f38859c1c --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifdef __aarch64__ + +template<> +void interleave_block<8, 1, VLType::None, false>( + float * &out_ptr, const float * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + __asm__ __volatile__( + "ldr x27, [%x[in], #0x0]\n" + "cmp %x[height], #0x8\n" + "ldr x26, [%x[in], #0x8]\n" + "add x27, x27, %x[row_offset], LSL #2\n" + "ldr x25, [%x[in], #0x10]\n" + "ldr x24, [%x[in], #0x18]\n" + "add x26, x26, %x[row_offset], LSL #2\n" + "ldr x23, [%x[in], #0x20]\n" + "add x25, x25, %x[row_offset], LSL #2\n" + "ldr x22, [%x[in], #0x28]\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset], LSL #2\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset], LSL #2\n" + "add x22, x22, %x[row_offset], LSL #2\n" + "add x21, x21, %x[row_offset], LSL #2\n" + "add x20, x20, %x[row_offset], LSL #2\n" + "beq 1f\n" + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x27, #0x0]\n" + "cmp %x[width], #0x4\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "blt 3f\n" + "2:" // Main loop head + "ldr q28, [x27], #0x10\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr q27, [x26], #0x10\n" + "ldr q26, [x25], #0x10\n" + "zip1 v23.4s, v28.4s, v26.4s\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr q22, [x24], #0x10\n" + "zip2 v26.4s, v28.4s, v26.4s\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr q25, [x23], #0x10\n" + "zip1 v20.4s, v27.4s, v22.4s\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr q24, [x22], #0x10\n" + "zip1 v16.4s, v23.4s, v20.4s\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q19, [x21], #0x10\n" + "zip2 v23.4s, v23.4s, v20.4s\n" + "prfm pldl1keep, [x22, #0x70]\n" + "zip2 v22.4s, v27.4s, v22.4s\n" + "ldr q21, [x20], #0x10\n" + "zip1 v18.4s, v25.4s, v19.4s\n" + "prfm pldl1keep, [x21, #0x70]\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v20.4s, v26.4s, v22.4s\n" + "prfm pldl1keep, [x20, #0x70]\n" + "zip1 v16.4s, v24.4s, v21.4s\n" + "subs %x[width], %x[width], #0x4\n" + "zip1 v17.4s, v18.4s, v16.4s\n" + "cmp %x[width], #0x4\n" + "zip2 v16.4s, v18.4s, v16.4s\n" + "str q17, [%x[out_ptr], #0x10]\n" + "zip2 v19.4s, v25.4s, v19.4s\n" + "str q23, [%x[out_ptr], #0x20]\n" + "zip2 v18.4s, v24.4s, v21.4s\n" + "str q16, [%x[out_ptr], #0x30]\n" + "zip1 v16.4s, v19.4s, v18.4s\n" + "str q20, [%x[out_ptr], #0x40]\n" + "zip2 v17.4s, v26.4s, v22.4s\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v16.4s, v19.4s, v18.4s\n" + "str q17, [%x[out_ptr], #0x60]\n" + "str q16, [%x[out_ptr], #0x70]\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "bge 2b\n" + "3:" // Main loop skip + "cbz %x[width], 6f\n" + "tbz 
%x[width], #1, 4f\n" + "ldr d28, [x27], #0x8\n" + "ldr d27, [x26], #0x8\n" + "ldr d26, [x25], #0x8\n" + "ldr d22, [x24], #0x8\n" + "ldr d25, [x23], #0x8\n" + "ldr d24, [x22], #0x8\n" + "ldr d19, [x21], #0x8\n" + "ldr d21, [x20], #0x8\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 5f\n" + "ld1 { v28.s }[2], [x27]\n" + "ld1 { v27.s }[2], [x26]\n" + "ld1 { v26.s }[2], [x25]\n" + "ld1 { v22.s }[2], [x24]\n" + "ld1 { v25.s }[2], [x23]\n" + "ld1 { v24.s }[2], [x22]\n" + "ld1 { v19.s }[2], [x21]\n" + "ld1 { v21.s }[2], [x20]\n" + "mov x19, #0x3\n" + "b 5f\n" + "4:" // odd_loads_1_0 + "ldr s28, [x27, #0x0]\n" + "ldr s27, [x26, #0x0]\n" + "ldr s26, [x25, #0x0]\n" + "ldr s22, [x24, #0x0]\n" + "ldr s25, [x23, #0x0]\n" + "ldr s24, [x22, #0x0]\n" + "ldr s19, [x21, #0x0]\n" + "ldr s21, [x20, #0x0]\n" + "mov x19, #0x1\n" + "5:" // Odd load end + "zip1 v23.4s, v28.4s, v26.4s\n" + "subs x19, x19, #0x1\n" + "zip1 v20.4s, v27.4s, v22.4s\n" + "zip1 v16.4s, v23.4s, v20.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v18.4s, v25.4s, v19.4s\n" + "zip1 v16.4s, v24.4s, v21.4s\n" + "zip1 v17.4s, v18.4s, v16.4s\n" + "str q17, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 6f\n" + "zip2 v23.4s, v23.4s, v20.4s\n" + "zip2 v16.4s, v18.4s, v16.4s\n" + "str q23, [%x[out_ptr], #0x0]\n" + "str q16, [%x[out_ptr], #0x10]\n" + "subs x19, x19, #0x1\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 6f\n" + "zip2 v26.4s, v28.4s, v26.4s\n" + "zip2 v22.4s, v27.4s, v22.4s\n" + "zip1 v20.4s, v26.4s, v22.4s\n" + "str q20, [%x[out_ptr], #0x0]\n" + "zip2 v19.4s, v25.4s, v19.4s\n" + "zip2 v18.4s, v24.4s, v21.4s\n" + "zip1 v16.4s, v19.4s, v18.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "6:" // Odds skip + + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp new file mode 100644 index 0000000000..03f552a575 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifdef __aarch64__ + +template<> +void interleave_block<8, 1, VLType::None, false>( + int16_t * &out_ptr, const int16_t * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + __asm__ __volatile__( + "ldr x27, [%x[in], #0x0]\n" + "cmp %x[height], #0x8\n" + "ldr x26, [%x[in], #0x8]\n" + "add x27, x27, %x[row_offset], LSL #1\n" + "ldr x25, [%x[in], #0x10]\n" + "ldr x24, [%x[in], #0x18]\n" + "add x26, x26, %x[row_offset], LSL #1\n" + "ldr x23, [%x[in], #0x20]\n" + "add x25, x25, %x[row_offset], LSL #1\n" + "ldr x22, [%x[in], #0x28]\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset], LSL #1\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset], LSL #1\n" + "add x22, x22, %x[row_offset], LSL #1\n" + "add x21, x21, %x[row_offset], LSL #1\n" + "add x20, x20, %x[row_offset], LSL #1\n" + "beq 1f\n" + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x27, #0x0]\n" + "cmp %x[width], #0x8\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "blt 3f\n" + "2:" // Main loop head + "ldr q30, [x27], #0x10\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr q29, [x26], #0x10\n" + "ldr q28, [x25], #0x10\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr q27, [x24], #0x10\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr q24, [x23], #0x10\n" + "zip1 v26.8h, v30.8h, v24.8h\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr q25, [x22], #0x10\n" + "zip2 v24.8h, v30.8h, v24.8h\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q23, [x21], #0x10\n" + "zip1 v21.8h, v29.8h, v25.8h\n" + "prfm pldl1keep, [x22, #0x70]\n" + "ldr q22, [x20], #0x10\n" + "zip1 v18.8h, v28.8h, v23.8h\n" + "prfm pldl1keep, [x21, #0x70]\n" + "subs %x[width], %x[width], #0x8\n" + "zip1 v20.8h, v26.8h, v18.8h\n" + "prfm pldl1keep, [x20, #0x70]\n" + "zip1 v19.8h, v27.8h, v22.8h\n" + "cmp %x[width], #0x8\n" + "zip1 v17.8h, v21.8h, v19.8h\n" + "zip2 v18.8h, v26.8h, v18.8h\n" + "zip1 v16.8h, v20.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v16.8h, v20.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x20]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x30]\n" + "zip2 v21.8h, v28.8h, v23.8h\n" + "zip1 v18.8h, v24.8h, v21.8h\n" + "zip2 v20.8h, v29.8h, v25.8h\n" + "zip2 v19.8h, v27.8h, v22.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x40]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v18.8h, v24.8h, v21.8h\n" + "zip2 
v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x60]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x70]\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "bge 2b\n" + "3:" // Main loop skip + "cbz %x[width], 8f\n" + "tbz %x[width], #2, 5f\n" + "ldr d30, [x27], #0x8\n" + "ldr d29, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "ldr d27, [x24], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d25, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "ldr d22, [x20], #0x8\n" + "tbz %x[width], #1, 4f\n" + "ld1 { v30.s }[2], [x27], #0x4\n" + "ld1 { v29.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "ld1 { v27.s }[2], [x24], #0x4\n" + "ld1 { v24.s }[2], [x23], #0x4\n" + "ld1 { v25.s }[2], [x22], #0x4\n" + "ld1 { v23.s }[2], [x21], #0x4\n" + "ld1 { v22.s }[2], [x20], #0x4\n" + "mov x19, #0x6\n" + "tbz %x[width], #0, 7f\n" + "ld1 { v30.h }[6], [x27]\n" + "ld1 { v29.h }[6], [x26]\n" + "ld1 { v28.h }[6], [x25]\n" + "ld1 { v27.h }[6], [x24]\n" + "ld1 { v24.h }[6], [x23]\n" + "ld1 { v25.h }[6], [x22]\n" + "ld1 { v23.h }[6], [x21]\n" + "ld1 { v22.h }[6], [x20]\n" + "mov x19, #0x7\n" + "b 7f\n" + "4:" // odd_loads_1_4 + "mov x19, #0x4\n" + "tbz %x[width], #0, 7f\n" + "ld1 { v30.h }[4], [x27]\n" + "ld1 { v29.h }[4], [x26]\n" + "ld1 { v28.h }[4], [x25]\n" + "ld1 { v27.h }[4], [x24]\n" + "ld1 { v24.h }[4], [x23]\n" + "ld1 { v25.h }[4], [x22]\n" + "ld1 { v23.h }[4], [x21]\n" + "ld1 { v22.h }[4], [x20]\n" + "mov x19, #0x5\n" + "b 7f\n" + "5:" // odd_loads_2_0 + "tbz %x[width], #1, 6f\n" + "ldr s30, [x27], #0x4\n" + "ldr s29, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s27, [x24], #0x4\n" + "ldr s24, [x23], #0x4\n" + "ldr s25, [x22], #0x4\n" + "ldr s23, [x21], #0x4\n" + "ldr s22, [x20], #0x4\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 7f\n" + "ld1 { v30.h }[2], [x27]\n" + "ld1 { v29.h }[2], [x26]\n" + "ld1 { v28.h }[2], [x25]\n" + "ld1 { v27.h }[2], [x24]\n" + "ld1 { v24.h }[2], [x23]\n" + "ld1 { v25.h }[2], [x22]\n" + "ld1 { v23.h }[2], [x21]\n" + "ld1 { v22.h }[2], [x20]\n" + "mov x19, #0x3\n" + "b 7f\n" + "6:" // odd_loads_1_0 + "ldr h30, [x27, #0x0]\n" + "ldr h29, [x26, #0x0]\n" + "ldr h28, [x25, #0x0]\n" + "ldr h27, [x24, #0x0]\n" + "ldr h24, [x23, #0x0]\n" + "ldr h25, [x22, #0x0]\n" + "ldr h23, [x21, #0x0]\n" + "ldr h22, [x20, #0x0]\n" + "mov x19, #0x1\n" + "7:" // Odd load end + "zip1 v26.8h, v30.8h, v24.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v18.8h, v28.8h, v23.8h\n" + "zip1 v20.8h, v26.8h, v18.8h\n" + "zip1 v21.8h, v29.8h, v25.8h\n" + "zip1 v19.8h, v27.8h, v22.8h\n" + "zip1 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v20.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v16.8h, v20.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v18.8h, v26.8h, v18.8h\n" + "zip2 v17.8h, v21.8h, v19.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v24.8h, v30.8h, v24.8h\n" + "zip2 v21.8h, v28.8h, v23.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v18.8h, v24.8h, v21.8h\n" + "zip2 v20.8h, v29.8h, v25.8h\n" + "zip2 v19.8h, v27.8h, v22.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], 
%x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v18.8h, v24.8h, v21.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "8:" // Odds skip + + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + +template<> +void interleave_block<8, 1, VLType::None, false>( + uint16_t * &out_ptr, const uint16_t * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + int16_t * &out_cast = reinterpret_cast<int16_t * &>(out_ptr); + const int16_t * const * in_cast = reinterpret_cast<const int16_t * const *>(in); + + interleave_block<8, 1, VLType::None, false>(out_cast, in_cast, width, height, row_offset, false); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp new file mode 100644 index 0000000000..35c7719de7 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE.
+ */ + +#ifdef __aarch64__ + +template<> +void interleave_block<8, 1, VLType::None, true>( + int16_t * &out_ptr, const int16_t * const * in, size_t width, size_t height, + size_t row_offset, bool first +) +{ + __asm__ __volatile__( + "movi v1.8h, #0x0\n" + "ldr x27, [%x[in], #0x0]\n" + "mov x19, #0x0\n" + "movi v0.4s, #0x0\n" + "ldr x26, [%x[in], #0x8]\n" + "cmp %x[height], #0x8\n" + "movi v31.4s, #0x0\n" + "ldr x25, [%x[in], #0x10]\n" + "add x27, x27, %x[row_offset], LSL #1\n" + "ldr x24, [%x[in], #0x18]\n" + "ldr x23, [%x[in], #0x20]\n" + "add x26, x26, %x[row_offset], LSL #1\n" + "ldr x22, [%x[in], #0x28]\n" + "add x25, x25, %x[row_offset], LSL #1\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset], LSL #1\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset], LSL #1\n" + "add x22, x22, %x[row_offset], LSL #1\n" + "add x21, x21, %x[row_offset], LSL #1\n" + "add x20, x20, %x[row_offset], LSL #1\n" + "beq 1f\n" + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x27, #0x0]\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "cbnz %w[first], 2f\n" + "sub %x[out_ptr], %x[out_ptr], #0x20\n" + "ld1 { v0.4s }, [%x[out_ptr]]\n" + "ldr q31, [%x[out_ptr], #0x10]\n" + "2:" // first_pass + "cmp %x[width], #0x8\n" + "blt 5f\n" + "3:" // Main loop head + "cmp x19, #0xe\n" + "ble 4f\n" + "saddw v0.4s, v0.4s, v1.4h\n" + "saddw2 v31.4s, v31.4s, v1.8h\n" + "mov x19, #0x0\n" + "movi v1.8h, #0x0\n" + "4:" // no_accumulate_16 + "ldr q30, [x27], #0x10\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr q29, [x26], #0x10\n" + "ldr q28, [x25], #0x10\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr q27, [x24], #0x10\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr q24, [x23], #0x10\n" + "zip1 v26.8h, v30.8h, v24.8h\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr q25, [x22], #0x10\n" + "zip2 v24.8h, v30.8h, v24.8h\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q23, [x21], #0x10\n" + "zip1 v21.8h, v29.8h, v25.8h\n" + "prfm pldl1keep, [x22, #0x70]\n" + "ldr q22, [x20], #0x10\n" + "zip1 v18.8h, v28.8h, v23.8h\n" + "prfm pldl1keep, [x21, #0x70]\n" + "add x19, x19, #0x1\n" + "zip1 v20.8h, v26.8h, v18.8h\n" + "prfm pldl1keep, [x20, #0x70]\n" + "zip1 v19.8h, v27.8h, v22.8h\n" + "subs %x[width], %x[width], #0x8\n" + "zip1 v17.8h, v21.8h, v19.8h\n" + "cmp %x[width], #0x8\n" + "zip2 v18.8h, v26.8h, v18.8h\n" + "zip1 v16.8h, v20.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v17.8h, v20.8h, v17.8h\n" + "str q17, [%x[out_ptr], #0x10]\n" + "zip2 v16.8h, v21.8h, v19.8h\n" + "add v1.8h, v1.8h, v17.8h\n" + "zip1 v17.8h, v18.8h, v16.8h\n" + "str q17, [%x[out_ptr], #0x20]\n" + "zip2 v16.8h, v18.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x30]\n" + "add v1.8h, v1.8h, v17.8h\n" + "zip2 v21.8h, v28.8h, v23.8h\n" + "zip1 v18.8h, v24.8h, v21.8h\n" + "add v1.8h, 
v1.8h, v16.8h\n" + "zip2 v20.8h, v29.8h, v25.8h\n" + "zip2 v19.8h, v27.8h, v22.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x40]\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v18.8h, v24.8h, v21.8h\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x60]\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x70]\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "add v1.8h, v1.8h, v16.8h\n" + "bge 3b\n" + "5:" // Main loop skip + "cbz %x[width], 10f\n" + "tbz %x[width], #2, 7f\n" + "ldr d30, [x27], #0x8\n" + "ldr d29, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "ldr d27, [x24], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d25, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "ldr d22, [x20], #0x8\n" + "tbz %x[width], #1, 6f\n" + "ld1 { v30.s }[2], [x27], #0x4\n" + "ld1 { v29.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "ld1 { v27.s }[2], [x24], #0x4\n" + "ld1 { v24.s }[2], [x23], #0x4\n" + "ld1 { v25.s }[2], [x22], #0x4\n" + "ld1 { v23.s }[2], [x21], #0x4\n" + "ld1 { v22.s }[2], [x20], #0x4\n" + "mov x19, #0x6\n" + "tbz %x[width], #0, 9f\n" + "ld1 { v30.h }[6], [x27]\n" + "ld1 { v29.h }[6], [x26]\n" + "ld1 { v28.h }[6], [x25]\n" + "ld1 { v27.h }[6], [x24]\n" + "ld1 { v24.h }[6], [x23]\n" + "ld1 { v25.h }[6], [x22]\n" + "ld1 { v23.h }[6], [x21]\n" + "ld1 { v22.h }[6], [x20]\n" + "mov x19, #0x7\n" + "b 9f\n" + "6:" // odd_loads_1_4 + "mov x19, #0x4\n" + "tbz %x[width], #0, 9f\n" + "ld1 { v30.h }[4], [x27]\n" + "ld1 { v29.h }[4], [x26]\n" + "ld1 { v28.h }[4], [x25]\n" + "ld1 { v27.h }[4], [x24]\n" + "ld1 { v24.h }[4], [x23]\n" + "ld1 { v25.h }[4], [x22]\n" + "ld1 { v23.h }[4], [x21]\n" + "ld1 { v22.h }[4], [x20]\n" + "mov x19, #0x5\n" + "b 9f\n" + "7:" // odd_loads_2_0 + "tbz %x[width], #1, 8f\n" + "ldr s30, [x27], #0x4\n" + "ldr s29, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s27, [x24], #0x4\n" + "ldr s24, [x23], #0x4\n" + "ldr s25, [x22], #0x4\n" + "ldr s23, [x21], #0x4\n" + "ldr s22, [x20], #0x4\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 9f\n" + "ld1 { v30.h }[2], [x27]\n" + "ld1 { v29.h }[2], [x26]\n" + "ld1 { v28.h }[2], [x25]\n" + "ld1 { v27.h }[2], [x24]\n" + "ld1 { v24.h }[2], [x23]\n" + "ld1 { v25.h }[2], [x22]\n" + "ld1 { v23.h }[2], [x21]\n" + "ld1 { v22.h }[2], [x20]\n" + "mov x19, #0x3\n" + "b 9f\n" + "8:" // odd_loads_1_0 + "ldr h30, [x27, #0x0]\n" + "ldr h29, [x26, #0x0]\n" + "ldr h28, [x25, #0x0]\n" + "ldr h27, [x24, #0x0]\n" + "ldr h24, [x23, #0x0]\n" + "ldr h25, [x22, #0x0]\n" + "ldr h23, [x21, #0x0]\n" + "ldr h22, [x20, #0x0]\n" + "mov x19, #0x1\n" + "9:" // Odd load end + "zip1 v26.8h, v30.8h, v24.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v18.8h, v28.8h, v23.8h\n" + "zip1 v20.8h, v26.8h, v18.8h\n" + "zip1 v21.8h, v29.8h, v25.8h\n" + "zip1 v19.8h, v27.8h, v22.8h\n" + "zip1 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v20.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v16.8h\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v17.8h, v20.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "add v1.8h, v1.8h, v17.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v18.8h, v26.8h, v18.8h\n" + "zip2 v16.8h, v21.8h, v19.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v17.8h, v18.8h, v16.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v17.8h\n" + 
"add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v16.8h, v18.8h, v16.8h\n" + "subs x19, x19, #0x1\n" + "add v1.8h, v1.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v24.8h, v30.8h, v24.8h\n" + "zip2 v21.8h, v28.8h, v23.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v18.8h, v24.8h, v21.8h\n" + "zip2 v20.8h, v29.8h, v25.8h\n" + "zip2 v19.8h, v27.8h, v22.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v16.8h\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "add v1.8h, v1.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v18.8h, v24.8h, v21.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v16.8h\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "10:" // Odds skip + "saddw v0.4s, v0.4s, v1.4h\n" + "str q0, [%x[out_ptr], #0x0]\n" + "saddw2 v31.4s, v31.4s, v1.8h\n" + "str q31, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp new file mode 100644 index 0000000000..582836fe67 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifdef __aarch64__ + +template<> +void interleave_block<8, 1, VLType::None, false>( + int16_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + __asm__ __volatile__( + "ldr x27, [%x[in], #0x0]\n" + "cmp %x[height], #0x8\n" + "ldr x26, [%x[in], #0x8]\n" + "add x27, x27, %x[row_offset]\n" + "ldr x25, [%x[in], #0x10]\n" + "ldr x24, [%x[in], #0x18]\n" + "add x26, x26, %x[row_offset]\n" + "ldr x23, [%x[in], #0x20]\n" + "add x25, x25, %x[row_offset]\n" + "ldr x22, [%x[in], #0x28]\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset]\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset]\n" + "add x22, x22, %x[row_offset]\n" + "add x21, x21, %x[row_offset]\n" + "add x20, x20, %x[row_offset]\n" + "beq 1f\n" + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x27, #0x0]\n" + "cmp %x[width], #0x8\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "blt 3f\n" + "2:" // Main loop head + "ldr d30, [x27], #0x8\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr d29, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr d27, [x24], #0x8\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr d23, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr d26, [x21], #0x8\n" + "ldr d25, [x20], #0x8\n" + "prfm pldl1keep, [x23, #0x70]\n" + "prfm pldl1keep, [x22, #0x70]\n" + "sshll v30.8h, v30.8b, #0x0\n" + "sshll v29.8h, v29.8b, #0x0\n" + "prfm pldl1keep, [x21, #0x70]\n" + "sshll v28.8h, v28.8b, #0x0\n" + "prfm pldl1keep, [x20, #0x70]\n" + "sshll v27.8h, v27.8b, #0x0\n" + "sshll v23.8h, v23.8b, #0x0\n" + "zip1 v24.8h, v30.8h, v23.8h\n" + "sshll v21.8h, v21.8b, #0x0\n" + "zip2 v23.8h, v30.8h, v23.8h\n" + "sshll v26.8h, v26.8b, #0x0\n" + "sshll v25.8h, v25.8b, #0x0\n" + "zip1 v22.8h, v29.8h, v21.8h\n" + "subs %x[width], %x[width], #0x8\n" + "zip2 v21.8h, v29.8h, v21.8h\n" + "cmp %x[width], #0x8\n" + "zip1 v20.8h, v28.8h, v26.8h\n" + "zip1 v18.8h, v24.8h, v20.8h\n" + "zip1 v19.8h, v27.8h, v25.8h\n" + "zip1 v17.8h, v22.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v18.8h, v24.8h, v20.8h\n" + "zip2 v17.8h, v22.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x20]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x30]\n" + "zip2 v20.8h, v28.8h, v26.8h\n" + "zip1 v18.8h, v23.8h, v20.8h\n" + "zip2 v19.8h, v27.8h, v25.8h\n" + "zip1 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x40]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v18.8h, v23.8h, v20.8h\n" + "zip2 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], 
#0x60]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x70]\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "bge 2b\n" + "3:" // Main loop skip + "cbz %x[width], 8f\n" + "tbz %x[width], #2, 5f\n" + "ldr s30, [x27], #0x4\n" + "ldr s29, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s27, [x24], #0x4\n" + "ldr s23, [x23], #0x4\n" + "ldr s21, [x22], #0x4\n" + "ldr s26, [x21], #0x4\n" + "ldr s25, [x20], #0x4\n" + "tbz %x[width], #1, 4f\n" + "ld1 { v30.h }[2], [x27], #0x2\n" + "ld1 { v29.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v27.h }[2], [x24], #0x2\n" + "ld1 { v23.h }[2], [x23], #0x2\n" + "ld1 { v21.h }[2], [x22], #0x2\n" + "ld1 { v26.h }[2], [x21], #0x2\n" + "ld1 { v25.h }[2], [x20], #0x2\n" + "mov x19, #0x6\n" + "tbz %x[width], #0, 7f\n" + "ld1 { v30.b }[6], [x27]\n" + "ld1 { v29.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v27.b }[6], [x24]\n" + "ld1 { v23.b }[6], [x23]\n" + "ld1 { v21.b }[6], [x22]\n" + "ld1 { v26.b }[6], [x21]\n" + "ld1 { v25.b }[6], [x20]\n" + "mov x19, #0x7\n" + "b 7f\n" + "4:" // odd_loads_1_4 + "mov x19, #0x4\n" + "tbz %x[width], #0, 7f\n" + "ld1 { v30.b }[4], [x27]\n" + "ld1 { v29.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "ld1 { v27.b }[4], [x24]\n" + "ld1 { v23.b }[4], [x23]\n" + "ld1 { v21.b }[4], [x22]\n" + "ld1 { v26.b }[4], [x21]\n" + "ld1 { v25.b }[4], [x20]\n" + "mov x19, #0x5\n" + "b 7f\n" + "5:" // odd_loads_2_0 + "tbz %x[width], #1, 6f\n" + "ldr h30, [x27], #0x2\n" + "ldr h29, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "ldr h27, [x24], #0x2\n" + "ldr h23, [x23], #0x2\n" + "ldr h21, [x22], #0x2\n" + "ldr h26, [x21], #0x2\n" + "ldr h25, [x20], #0x2\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 7f\n" + "ld1 { v30.b }[2], [x27]\n" + "ld1 { v29.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v27.b }[2], [x24]\n" + "ld1 { v23.b }[2], [x23]\n" + "ld1 { v21.b }[2], [x22]\n" + "ld1 { v26.b }[2], [x21]\n" + "ld1 { v25.b }[2], [x20]\n" + "mov x19, #0x3\n" + "b 7f\n" + "6:" // odd_loads_1_0 + "ldr b30, [x27, #0x0]\n" + "ldr b29, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "ldr b27, [x24, #0x0]\n" + "ldr b23, [x23, #0x0]\n" + "ldr b21, [x22, #0x0]\n" + "ldr b26, [x21, #0x0]\n" + "ldr b25, [x20, #0x0]\n" + "mov x19, #0x1\n" + "7:" // Odd load end + "sshll v30.8h, v30.8b, #0x0\n" + "sshll v29.8h, v29.8b, #0x0\n" + "sshll v28.8h, v28.8b, #0x0\n" + "sshll v27.8h, v27.8b, #0x0\n" + "sshll v23.8h, v23.8b, #0x0\n" + "zip1 v24.8h, v30.8h, v23.8h\n" + "sshll v21.8h, v21.8b, #0x0\n" + "sshll v26.8h, v26.8b, #0x0\n" + "zip1 v20.8h, v28.8h, v26.8h\n" + "sshll v25.8h, v25.8b, #0x0\n" + "zip1 v22.8h, v29.8h, v21.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v18.8h, v24.8h, v20.8h\n" + "zip1 v19.8h, v27.8h, v25.8h\n" + "zip1 v17.8h, v22.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v18.8h, v24.8h, v20.8h\n" + "zip2 v17.8h, v22.8h, v19.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v23.8h, v30.8h, v23.8h\n" + "zip2 v20.8h, v28.8h, v26.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v18.8h, v23.8h, v20.8h\n" + "zip2 
v21.8h, v29.8h, v21.8h\n" + "zip2 v19.8h, v27.8h, v25.8h\n" + "zip1 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v18.8h, v23.8h, v20.8h\n" + "zip2 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "8:" // Odds skip + + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp new file mode 100644 index 0000000000..35dc3dc0d4 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifdef __aarch64__ + +template<> +void interleave_block<8, 1, VLType::None, true>( + int16_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, + size_t row_offset, bool first +) +{ + __asm__ __volatile__( + "movi v1.8h, #0x0\n" + "ldr x27, [%x[in], #0x0]\n" + "mov x19, #0x0\n" + "movi v0.4s, #0x0\n" + "ldr x26, [%x[in], #0x8]\n" + "cmp %x[height], #0x8\n" + "movi v31.4s, #0x0\n" + "ldr x25, [%x[in], #0x10]\n" + "add x27, x27, %x[row_offset]\n" + "ldr x24, [%x[in], #0x18]\n" + "ldr x23, [%x[in], #0x20]\n" + "add x26, x26, %x[row_offset]\n" + "ldr x22, [%x[in], #0x28]\n" + "add x25, x25, %x[row_offset]\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset]\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset]\n" + "add x22, x22, %x[row_offset]\n" + "add x21, x21, %x[row_offset]\n" + "add x20, x20, %x[row_offset]\n" + "beq 1f\n" + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x27, #0x0]\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "cbnz %w[first], 2f\n" + "sub %x[out_ptr], %x[out_ptr], #0x20\n" + "ld1 { v0.4s }, [%x[out_ptr]]\n" + "ldr q31, [%x[out_ptr], #0x10]\n" + "2:" // first_pass + "cmp %x[width], #0x8\n" + "blt 5f\n" + "3:" // Main loop head + "cmp x19, #0xe\n" + "ble 4f\n" + "saddw v0.4s, v0.4s, v1.4h\n" + "saddw2 v31.4s, v31.4s, v1.8h\n" + "mov x19, #0x0\n" + "movi v1.8h, #0x0\n" + "4:" // no_accumulate_16 + "ldr d30, [x27], #0x8\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr d29, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr d27, [x24], #0x8\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr d23, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr d26, [x21], #0x8\n" + "ldr d25, [x20], #0x8\n" + "prfm pldl1keep, [x23, #0x70]\n" + "prfm pldl1keep, [x22, #0x70]\n" + "sshll v30.8h, v30.8b, #0x0\n" + "sshll v29.8h, v29.8b, #0x0\n" + "prfm pldl1keep, [x21, #0x70]\n" + "sshll v28.8h, v28.8b, #0x0\n" + "prfm pldl1keep, [x20, #0x70]\n" + "sshll v27.8h, v27.8b, #0x0\n" + "sshll v23.8h, v23.8b, #0x0\n" + "zip1 v24.8h, v30.8h, v23.8h\n" + "sshll v21.8h, v21.8b, #0x0\n" + "zip2 v23.8h, v30.8h, v23.8h\n" + "sshll v26.8h, v26.8b, #0x0\n" + "sshll v25.8h, v25.8b, #0x0\n" + "zip1 v22.8h, v29.8h, v21.8h\n" + "add x19, x19, #0x1\n" + "zip2 v21.8h, v29.8h, v21.8h\n" + "subs %x[width], %x[width], #0x8\n" + "zip1 v20.8h, v28.8h, v26.8h\n" + "cmp %x[width], #0x8\n" + "zip1 v18.8h, v24.8h, v20.8h\n" + "zip1 v19.8h, v27.8h, v25.8h\n" + "zip1 v17.8h, v22.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v18.8h, v24.8h, v20.8h\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v17.8h, v22.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" 
+ "str q16, [%x[out_ptr], #0x20]\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x30]\n" + "zip2 v20.8h, v28.8h, v26.8h\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip1 v18.8h, v23.8h, v20.8h\n" + "zip2 v19.8h, v27.8h, v25.8h\n" + "zip1 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x40]\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v18.8h, v23.8h, v20.8h\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x60]\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x70]\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "add v1.8h, v1.8h, v16.8h\n" + "bge 3b\n" + "5:" // Main loop skip + "cbz %x[width], 10f\n" + "tbz %x[width], #2, 7f\n" + "ldr s30, [x27], #0x4\n" + "ldr s29, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s27, [x24], #0x4\n" + "ldr s23, [x23], #0x4\n" + "ldr s21, [x22], #0x4\n" + "ldr s26, [x21], #0x4\n" + "ldr s25, [x20], #0x4\n" + "tbz %x[width], #1, 6f\n" + "ld1 { v30.h }[2], [x27], #0x2\n" + "ld1 { v29.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v27.h }[2], [x24], #0x2\n" + "ld1 { v23.h }[2], [x23], #0x2\n" + "ld1 { v21.h }[2], [x22], #0x2\n" + "ld1 { v26.h }[2], [x21], #0x2\n" + "ld1 { v25.h }[2], [x20], #0x2\n" + "mov x19, #0x6\n" + "tbz %x[width], #0, 9f\n" + "ld1 { v30.b }[6], [x27]\n" + "ld1 { v29.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v27.b }[6], [x24]\n" + "ld1 { v23.b }[6], [x23]\n" + "ld1 { v21.b }[6], [x22]\n" + "ld1 { v26.b }[6], [x21]\n" + "ld1 { v25.b }[6], [x20]\n" + "mov x19, #0x7\n" + "b 9f\n" + "6:" // odd_loads_1_4 + "mov x19, #0x4\n" + "tbz %x[width], #0, 9f\n" + "ld1 { v30.b }[4], [x27]\n" + "ld1 { v29.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "ld1 { v27.b }[4], [x24]\n" + "ld1 { v23.b }[4], [x23]\n" + "ld1 { v21.b }[4], [x22]\n" + "ld1 { v26.b }[4], [x21]\n" + "ld1 { v25.b }[4], [x20]\n" + "mov x19, #0x5\n" + "b 9f\n" + "7:" // odd_loads_2_0 + "tbz %x[width], #1, 8f\n" + "ldr h30, [x27], #0x2\n" + "ldr h29, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "ldr h27, [x24], #0x2\n" + "ldr h23, [x23], #0x2\n" + "ldr h21, [x22], #0x2\n" + "ldr h26, [x21], #0x2\n" + "ldr h25, [x20], #0x2\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 9f\n" + "ld1 { v30.b }[2], [x27]\n" + "ld1 { v29.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v27.b }[2], [x24]\n" + "ld1 { v23.b }[2], [x23]\n" + "ld1 { v21.b }[2], [x22]\n" + "ld1 { v26.b }[2], [x21]\n" + "ld1 { v25.b }[2], [x20]\n" + "mov x19, #0x3\n" + "b 9f\n" + "8:" // odd_loads_1_0 + "ldr b30, [x27, #0x0]\n" + "ldr b29, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "ldr b27, [x24, #0x0]\n" + "ldr b23, [x23, #0x0]\n" + "ldr b21, [x22, #0x0]\n" + "ldr b26, [x21, #0x0]\n" + "ldr b25, [x20, #0x0]\n" + "mov x19, #0x1\n" + "9:" // Odd load end + "sshll v30.8h, v30.8b, #0x0\n" + "sshll v29.8h, v29.8b, #0x0\n" + "sshll v28.8h, v28.8b, #0x0\n" + "sshll v27.8h, v27.8b, #0x0\n" + "sshll v23.8h, v23.8b, #0x0\n" + "zip1 v24.8h, v30.8h, v23.8h\n" + "sshll v21.8h, v21.8b, #0x0\n" + "sshll v26.8h, v26.8b, #0x0\n" + "zip1 v20.8h, v28.8h, v26.8h\n" + "sshll v25.8h, v25.8b, #0x0\n" + "zip1 v22.8h, v29.8h, v21.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v18.8h, v24.8h, v20.8h\n" + "zip1 v19.8h, v27.8h, v25.8h\n" + "zip1 v17.8h, v22.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, 
v1.8h, v16.8h\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "add v1.8h, v1.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v18.8h, v24.8h, v20.8h\n" + "zip2 v17.8h, v22.8h, v19.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v16.8h\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "add v1.8h, v1.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v23.8h, v30.8h, v23.8h\n" + "zip2 v20.8h, v28.8h, v26.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v18.8h, v23.8h, v20.8h\n" + "zip2 v21.8h, v29.8h, v21.8h\n" + "zip2 v19.8h, v27.8h, v25.8h\n" + "zip1 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v16.8h\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "add v1.8h, v1.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v18.8h, v23.8h, v20.8h\n" + "zip2 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v16.8h\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "10:" // Odds skip + "saddw v0.4s, v0.4s, v1.4h\n" + "str q0, [%x[out_ptr], #0x0]\n" + "saddw2 v31.4s, v31.4s, v1.8h\n" + "str q31, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp new file mode 100644 index 0000000000..bfa8989a4d --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifdef __aarch64__ + +template<> +void interleave_block<8, 1, VLType::None, true>( + uint16_t * &out_ptr, const uint16_t * const * in, size_t width, size_t height, + size_t row_offset, bool first +) +{ + __asm__ __volatile__( + "movi v1.8h, #0x0\n" + "ldr x27, [%x[in], #0x0]\n" + "mov x19, #0x0\n" + "movi v0.4s, #0x0\n" + "ldr x26, [%x[in], #0x8]\n" + "cmp %x[height], #0x8\n" + "movi v31.4s, #0x0\n" + "ldr x25, [%x[in], #0x10]\n" + "add x27, x27, %x[row_offset], LSL #1\n" + "ldr x24, [%x[in], #0x18]\n" + "ldr x23, [%x[in], #0x20]\n" + "add x26, x26, %x[row_offset], LSL #1\n" + "ldr x22, [%x[in], #0x28]\n" + "add x25, x25, %x[row_offset], LSL #1\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset], LSL #1\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset], LSL #1\n" + "add x22, x22, %x[row_offset], LSL #1\n" + "add x21, x21, %x[row_offset], LSL #1\n" + "add x20, x20, %x[row_offset], LSL #1\n" + "beq 1f\n" + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x27, #0x0]\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "cbnz %w[first], 2f\n" + "sub %x[out_ptr], %x[out_ptr], #0x20\n" + "ld1 { v0.4s }, [%x[out_ptr]]\n" + "ldr q31, [%x[out_ptr], #0x10]\n" + "2:" // first_pass + "cmp %x[width], #0x8\n" + "blt 5f\n" + "3:" // Main loop head + "cmp x19, #0xe\n" + "ble 4f\n" + "uaddw v0.4s, v0.4s, v1.4h\n" + "uaddw2 v31.4s, v31.4s, v1.8h\n" + "mov x19, #0x0\n" + "movi v1.8h, #0x0\n" + "4:" // no_accumulate_16 + "ldr q30, [x27], #0x10\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr q29, [x26], #0x10\n" + "ldr q28, [x25], #0x10\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr q27, [x24], #0x10\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr q24, [x23], #0x10\n" + "zip1 v26.8h, v30.8h, v24.8h\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr q25, [x22], #0x10\n" + "zip2 v24.8h, v30.8h, v24.8h\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q23, [x21], #0x10\n" + "zip1 v21.8h, v29.8h, v25.8h\n" + "prfm pldl1keep, [x22, #0x70]\n" + "ldr q22, [x20], #0x10\n" + "zip1 v18.8h, v28.8h, v23.8h\n" + "prfm pldl1keep, [x21, #0x70]\n" + "add x19, x19, #0x1\n" + "zip1 v20.8h, v26.8h, v18.8h\n" + "prfm pldl1keep, [x20, #0x70]\n" + "zip1 v19.8h, v27.8h, v22.8h\n" + "subs %x[width], %x[width], #0x8\n" + "zip1 v17.8h, v21.8h, v19.8h\n" + "cmp %x[width], #0x8\n" + "zip2 v18.8h, v26.8h, v18.8h\n" + "zip1 v16.8h, v20.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v17.8h, v20.8h, v17.8h\n" + "str q17, [%x[out_ptr], #0x10]\n" + "zip2 v16.8h, v21.8h, v19.8h\n" + "add 
v1.8h, v1.8h, v17.8h\n" + "zip1 v17.8h, v18.8h, v16.8h\n" + "str q17, [%x[out_ptr], #0x20]\n" + "zip2 v16.8h, v18.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x30]\n" + "add v1.8h, v1.8h, v17.8h\n" + "zip2 v21.8h, v28.8h, v23.8h\n" + "zip1 v18.8h, v24.8h, v21.8h\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v20.8h, v29.8h, v25.8h\n" + "zip2 v19.8h, v27.8h, v22.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x40]\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v18.8h, v24.8h, v21.8h\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x60]\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x70]\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "add v1.8h, v1.8h, v16.8h\n" + "bge 3b\n" + "5:" // Main loop skip + "cbz %x[width], 10f\n" + "tbz %x[width], #2, 7f\n" + "ldr d30, [x27], #0x8\n" + "ldr d29, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "ldr d27, [x24], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d25, [x22], #0x8\n" + "ldr d23, [x21], #0x8\n" + "ldr d22, [x20], #0x8\n" + "tbz %x[width], #1, 6f\n" + "ld1 { v30.s }[2], [x27], #0x4\n" + "ld1 { v29.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "ld1 { v27.s }[2], [x24], #0x4\n" + "ld1 { v24.s }[2], [x23], #0x4\n" + "ld1 { v25.s }[2], [x22], #0x4\n" + "ld1 { v23.s }[2], [x21], #0x4\n" + "ld1 { v22.s }[2], [x20], #0x4\n" + "mov x19, #0x6\n" + "tbz %x[width], #0, 9f\n" + "ld1 { v30.h }[6], [x27]\n" + "ld1 { v29.h }[6], [x26]\n" + "ld1 { v28.h }[6], [x25]\n" + "ld1 { v27.h }[6], [x24]\n" + "ld1 { v24.h }[6], [x23]\n" + "ld1 { v25.h }[6], [x22]\n" + "ld1 { v23.h }[6], [x21]\n" + "ld1 { v22.h }[6], [x20]\n" + "mov x19, #0x7\n" + "b 9f\n" + "6:" // odd_loads_1_4 + "mov x19, #0x4\n" + "tbz %x[width], #0, 9f\n" + "ld1 { v30.h }[4], [x27]\n" + "ld1 { v29.h }[4], [x26]\n" + "ld1 { v28.h }[4], [x25]\n" + "ld1 { v27.h }[4], [x24]\n" + "ld1 { v24.h }[4], [x23]\n" + "ld1 { v25.h }[4], [x22]\n" + "ld1 { v23.h }[4], [x21]\n" + "ld1 { v22.h }[4], [x20]\n" + "mov x19, #0x5\n" + "b 9f\n" + "7:" // odd_loads_2_0 + "tbz %x[width], #1, 8f\n" + "ldr s30, [x27], #0x4\n" + "ldr s29, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s27, [x24], #0x4\n" + "ldr s24, [x23], #0x4\n" + "ldr s25, [x22], #0x4\n" + "ldr s23, [x21], #0x4\n" + "ldr s22, [x20], #0x4\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 9f\n" + "ld1 { v30.h }[2], [x27]\n" + "ld1 { v29.h }[2], [x26]\n" + "ld1 { v28.h }[2], [x25]\n" + "ld1 { v27.h }[2], [x24]\n" + "ld1 { v24.h }[2], [x23]\n" + "ld1 { v25.h }[2], [x22]\n" + "ld1 { v23.h }[2], [x21]\n" + "ld1 { v22.h }[2], [x20]\n" + "mov x19, #0x3\n" + "b 9f\n" + "8:" // odd_loads_1_0 + "ldr h30, [x27, #0x0]\n" + "ldr h29, [x26, #0x0]\n" + "ldr h28, [x25, #0x0]\n" + "ldr h27, [x24, #0x0]\n" + "ldr h24, [x23, #0x0]\n" + "ldr h25, [x22, #0x0]\n" + "ldr h23, [x21, #0x0]\n" + "ldr h22, [x20, #0x0]\n" + "mov x19, #0x1\n" + "9:" // Odd load end + "zip1 v26.8h, v30.8h, v24.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v18.8h, v28.8h, v23.8h\n" + "zip1 v20.8h, v26.8h, v18.8h\n" + "zip1 v21.8h, v29.8h, v25.8h\n" + "zip1 v19.8h, v27.8h, v22.8h\n" + "zip1 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v20.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v16.8h\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v17.8h, v20.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "add v1.8h, v1.8h, v17.8h\n" + "str q17, 
[%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v18.8h, v26.8h, v18.8h\n" + "zip2 v16.8h, v21.8h, v19.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v17.8h, v18.8h, v16.8h\n" + "str q17, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v17.8h\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v16.8h, v18.8h, v16.8h\n" + "subs x19, x19, #0x1\n" + "add v1.8h, v1.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v24.8h, v30.8h, v24.8h\n" + "zip2 v21.8h, v28.8h, v23.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v18.8h, v24.8h, v21.8h\n" + "zip2 v20.8h, v29.8h, v25.8h\n" + "zip2 v19.8h, v27.8h, v22.8h\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v16.8h\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "add v1.8h, v1.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v18.8h, v24.8h, v21.8h\n" + "zip2 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v16.8h\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "10:" // Odds skip + "uaddw v0.4s, v0.4s, v1.4h\n" + "str q0, [%x[out_ptr], #0x0]\n" + "uaddw2 v31.4s, v31.4s, v1.8h\n" + "str q31, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp new file mode 100644 index 0000000000..86b90f1898 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifdef __aarch64__ + +template<> +void interleave_block<8, 1, VLType::None, false>( + uint16_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + __asm__ __volatile__( + "ldr x27, [%x[in], #0x0]\n" + "cmp %x[height], #0x8\n" + "ldr x26, [%x[in], #0x8]\n" + "add x27, x27, %x[row_offset]\n" + "ldr x25, [%x[in], #0x10]\n" + "ldr x24, [%x[in], #0x18]\n" + "add x26, x26, %x[row_offset]\n" + "ldr x23, [%x[in], #0x20]\n" + "add x25, x25, %x[row_offset]\n" + "ldr x22, [%x[in], #0x28]\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset]\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset]\n" + "add x22, x22, %x[row_offset]\n" + "add x21, x21, %x[row_offset]\n" + "add x20, x20, %x[row_offset]\n" + "beq 1f\n" + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x27, #0x0]\n" + "cmp %x[width], #0x8\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "blt 3f\n" + "2:" // Main loop head + "ldr d30, [x27], #0x8\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr d29, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr d27, [x24], #0x8\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr d23, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr d26, [x21], #0x8\n" + "ldr d25, [x20], #0x8\n" + "prfm pldl1keep, [x23, #0x70]\n" + "prfm pldl1keep, [x22, #0x70]\n" + "ushll v30.8h, v30.8b, #0x0\n" + "ushll v29.8h, v29.8b, #0x0\n" + "prfm pldl1keep, [x21, #0x70]\n" + "ushll v28.8h, v28.8b, #0x0\n" + "prfm pldl1keep, [x20, #0x70]\n" + "ushll v27.8h, v27.8b, #0x0\n" + "ushll v23.8h, v23.8b, #0x0\n" + "zip1 v24.8h, v30.8h, v23.8h\n" + "ushll v21.8h, v21.8b, #0x0\n" + "zip2 v23.8h, v30.8h, v23.8h\n" + "ushll v26.8h, v26.8b, #0x0\n" + "ushll v25.8h, v25.8b, #0x0\n" + "zip1 v22.8h, v29.8h, v21.8h\n" + "subs %x[width], %x[width], #0x8\n" + "zip2 v21.8h, v29.8h, v21.8h\n" + "cmp %x[width], #0x8\n" + "zip1 v20.8h, v28.8h, v26.8h\n" + "zip1 v18.8h, v24.8h, v20.8h\n" + "zip1 v19.8h, v27.8h, v25.8h\n" + "zip1 v17.8h, v22.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v18.8h, v24.8h, v20.8h\n" + "zip2 v17.8h, v22.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x20]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x30]\n" + "zip2 v20.8h, v28.8h, v26.8h\n" + "zip1 v18.8h, v23.8h, v20.8h\n" + "zip2 v19.8h, v27.8h, v25.8h\n" + "zip1 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x40]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v18.8h, v23.8h, v20.8h\n" + "zip2 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, 
[%x[out_ptr], #0x60]\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x70]\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "bge 2b\n" + "3:" // Main loop skip + "cbz %x[width], 8f\n" + "tbz %x[width], #2, 5f\n" + "ldr s30, [x27], #0x4\n" + "ldr s29, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s27, [x24], #0x4\n" + "ldr s23, [x23], #0x4\n" + "ldr s21, [x22], #0x4\n" + "ldr s26, [x21], #0x4\n" + "ldr s25, [x20], #0x4\n" + "tbz %x[width], #1, 4f\n" + "ld1 { v30.h }[2], [x27], #0x2\n" + "ld1 { v29.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v27.h }[2], [x24], #0x2\n" + "ld1 { v23.h }[2], [x23], #0x2\n" + "ld1 { v21.h }[2], [x22], #0x2\n" + "ld1 { v26.h }[2], [x21], #0x2\n" + "ld1 { v25.h }[2], [x20], #0x2\n" + "mov x19, #0x6\n" + "tbz %x[width], #0, 7f\n" + "ld1 { v30.b }[6], [x27]\n" + "ld1 { v29.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v27.b }[6], [x24]\n" + "ld1 { v23.b }[6], [x23]\n" + "ld1 { v21.b }[6], [x22]\n" + "ld1 { v26.b }[6], [x21]\n" + "ld1 { v25.b }[6], [x20]\n" + "mov x19, #0x7\n" + "b 7f\n" + "4:" // odd_loads_1_4 + "mov x19, #0x4\n" + "tbz %x[width], #0, 7f\n" + "ld1 { v30.b }[4], [x27]\n" + "ld1 { v29.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "ld1 { v27.b }[4], [x24]\n" + "ld1 { v23.b }[4], [x23]\n" + "ld1 { v21.b }[4], [x22]\n" + "ld1 { v26.b }[4], [x21]\n" + "ld1 { v25.b }[4], [x20]\n" + "mov x19, #0x5\n" + "b 7f\n" + "5:" // odd_loads_2_0 + "tbz %x[width], #1, 6f\n" + "ldr h30, [x27], #0x2\n" + "ldr h29, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "ldr h27, [x24], #0x2\n" + "ldr h23, [x23], #0x2\n" + "ldr h21, [x22], #0x2\n" + "ldr h26, [x21], #0x2\n" + "ldr h25, [x20], #0x2\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 7f\n" + "ld1 { v30.b }[2], [x27]\n" + "ld1 { v29.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v27.b }[2], [x24]\n" + "ld1 { v23.b }[2], [x23]\n" + "ld1 { v21.b }[2], [x22]\n" + "ld1 { v26.b }[2], [x21]\n" + "ld1 { v25.b }[2], [x20]\n" + "mov x19, #0x3\n" + "b 7f\n" + "6:" // odd_loads_1_0 + "ldr b30, [x27, #0x0]\n" + "ldr b29, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "ldr b27, [x24, #0x0]\n" + "ldr b23, [x23, #0x0]\n" + "ldr b21, [x22, #0x0]\n" + "ldr b26, [x21, #0x0]\n" + "ldr b25, [x20, #0x0]\n" + "mov x19, #0x1\n" + "7:" // Odd load end + "ushll v30.8h, v30.8b, #0x0\n" + "ushll v29.8h, v29.8b, #0x0\n" + "ushll v28.8h, v28.8b, #0x0\n" + "ushll v27.8h, v27.8b, #0x0\n" + "ushll v23.8h, v23.8b, #0x0\n" + "zip1 v24.8h, v30.8h, v23.8h\n" + "ushll v21.8h, v21.8b, #0x0\n" + "ushll v26.8h, v26.8b, #0x0\n" + "zip1 v20.8h, v28.8h, v26.8h\n" + "ushll v25.8h, v25.8b, #0x0\n" + "zip1 v22.8h, v29.8h, v21.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v18.8h, v24.8h, v20.8h\n" + "zip1 v19.8h, v27.8h, v25.8h\n" + "zip1 v17.8h, v22.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v18.8h, v24.8h, v20.8h\n" + "zip2 v17.8h, v22.8h, v19.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v23.8h, v30.8h, v23.8h\n" + "zip2 v20.8h, v28.8h, v26.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v18.8h, v23.8h, v20.8h\n" 
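+ // Odd-width tail of the u8 -> u16 interleave: x19 holds the number of
+ // leftover columns (1-7), and each zip/str step in this tail emits one row
+ // of eight u16 lanes (one widened element per source row), bumping out_ptr
+ // by 0x10 until "subs x19 / beq 8f" runs the count down to zero.
+ // A rough scalar model of the tail (illustrative sketch only; "rows" is a
+ // hypothetical array of the eight adjusted row pointers, and short heights
+ // are handled earlier by aliasing the spare pointers to row 0 via csel):
+ //   for (size_t col = 0; col < leftover; col++)
+ //     for (size_t r = 0; r < 8; r++)
+ //       *out_ptr++ = static_cast<uint16_t>(rows[r][col]);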
+ "zip2 v21.8h, v29.8h, v21.8h\n" + "zip2 v19.8h, v27.8h, v25.8h\n" + "zip1 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 8f\n" + "zip2 v18.8h, v23.8h, v20.8h\n" + "zip2 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "8:" // Odds skip + + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp new file mode 100644 index 0000000000..cefb70c57b --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifdef __aarch64__ + +template<> +void interleave_block<8, 1, VLType::None, true>( + uint16_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height, + size_t row_offset, bool first +) +{ + __asm__ __volatile__( + "movi v1.8h, #0x0\n" + "ldr x27, [%x[in], #0x0]\n" + "mov x19, #0x0\n" + "movi v0.4s, #0x0\n" + "ldr x26, [%x[in], #0x8]\n" + "cmp %x[height], #0x8\n" + "movi v31.4s, #0x0\n" + "ldr x25, [%x[in], #0x10]\n" + "add x27, x27, %x[row_offset]\n" + "ldr x24, [%x[in], #0x18]\n" + "ldr x23, [%x[in], #0x20]\n" + "add x26, x26, %x[row_offset]\n" + "ldr x22, [%x[in], #0x28]\n" + "add x25, x25, %x[row_offset]\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset]\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset]\n" + "add x22, x22, %x[row_offset]\n" + "add x21, x21, %x[row_offset]\n" + "add x20, x20, %x[row_offset]\n" + "beq 1f\n" + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x27, #0x0]\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "cbnz %w[first], 2f\n" + "sub %x[out_ptr], %x[out_ptr], #0x20\n" + "ld1 { v0.4s }, [%x[out_ptr]]\n" + "ldr q31, [%x[out_ptr], #0x10]\n" + "2:" // first_pass + "cmp %x[width], #0x8\n" + "blt 5f\n" + "3:" // Main loop head + "cmp x19, #0xe\n" + "ble 4f\n" + "uaddw v0.4s, v0.4s, v1.4h\n" + "uaddw2 v31.4s, v31.4s, v1.8h\n" + "mov x19, #0x0\n" + "movi v1.8h, #0x0\n" + "4:" // no_accumulate_16 + "ldr d30, [x27], #0x8\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr d29, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr d27, [x24], #0x8\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr d23, [x23], #0x8\n" + "ldr d21, [x22], #0x8\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr d26, [x21], #0x8\n" + "ldr d25, [x20], #0x8\n" + "prfm pldl1keep, [x23, #0x70]\n" + "prfm pldl1keep, [x22, #0x70]\n" + "ushll v30.8h, v30.8b, #0x0\n" + "ushll v29.8h, v29.8b, #0x0\n" + "prfm pldl1keep, [x21, #0x70]\n" + "ushll v28.8h, v28.8b, #0x0\n" + "prfm pldl1keep, [x20, #0x70]\n" + "ushll v27.8h, v27.8b, #0x0\n" + "ushll v23.8h, v23.8b, #0x0\n" + "zip1 v24.8h, v30.8h, v23.8h\n" + "ushll v21.8h, v21.8b, #0x0\n" + "zip2 v23.8h, v30.8h, v23.8h\n" + "ushll v26.8h, v26.8b, #0x0\n" + "ushll v25.8h, v25.8b, #0x0\n" + "zip1 v22.8h, v29.8h, v21.8h\n" + "add x19, x19, #0x1\n" + "zip2 v21.8h, v29.8h, v21.8h\n" + "subs %x[width], %x[width], #0x8\n" + "zip1 v20.8h, v28.8h, v26.8h\n" + "cmp %x[width], #0x8\n" + "zip1 v18.8h, v24.8h, v20.8h\n" + "zip1 v19.8h, v27.8h, v25.8h\n" + "zip1 v17.8h, v22.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v18.8h, v24.8h, v20.8h\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v17.8h, v22.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, 
v17.8h\n" + "str q16, [%x[out_ptr], #0x20]\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x30]\n" + "zip2 v20.8h, v28.8h, v26.8h\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip1 v18.8h, v23.8h, v20.8h\n" + "zip2 v19.8h, v27.8h, v25.8h\n" + "zip1 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x40]\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v18.8h, v23.8h, v20.8h\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x60]\n" + "add v1.8h, v1.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x70]\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "add v1.8h, v1.8h, v16.8h\n" + "bge 3b\n" + "5:" // Main loop skip + "cbz %x[width], 10f\n" + "tbz %x[width], #2, 7f\n" + "ldr s30, [x27], #0x4\n" + "ldr s29, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s27, [x24], #0x4\n" + "ldr s23, [x23], #0x4\n" + "ldr s21, [x22], #0x4\n" + "ldr s26, [x21], #0x4\n" + "ldr s25, [x20], #0x4\n" + "tbz %x[width], #1, 6f\n" + "ld1 { v30.h }[2], [x27], #0x2\n" + "ld1 { v29.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v27.h }[2], [x24], #0x2\n" + "ld1 { v23.h }[2], [x23], #0x2\n" + "ld1 { v21.h }[2], [x22], #0x2\n" + "ld1 { v26.h }[2], [x21], #0x2\n" + "ld1 { v25.h }[2], [x20], #0x2\n" + "mov x19, #0x6\n" + "tbz %x[width], #0, 9f\n" + "ld1 { v30.b }[6], [x27]\n" + "ld1 { v29.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v27.b }[6], [x24]\n" + "ld1 { v23.b }[6], [x23]\n" + "ld1 { v21.b }[6], [x22]\n" + "ld1 { v26.b }[6], [x21]\n" + "ld1 { v25.b }[6], [x20]\n" + "mov x19, #0x7\n" + "b 9f\n" + "6:" // odd_loads_1_4 + "mov x19, #0x4\n" + "tbz %x[width], #0, 9f\n" + "ld1 { v30.b }[4], [x27]\n" + "ld1 { v29.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "ld1 { v27.b }[4], [x24]\n" + "ld1 { v23.b }[4], [x23]\n" + "ld1 { v21.b }[4], [x22]\n" + "ld1 { v26.b }[4], [x21]\n" + "ld1 { v25.b }[4], [x20]\n" + "mov x19, #0x5\n" + "b 9f\n" + "7:" // odd_loads_2_0 + "tbz %x[width], #1, 8f\n" + "ldr h30, [x27], #0x2\n" + "ldr h29, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "ldr h27, [x24], #0x2\n" + "ldr h23, [x23], #0x2\n" + "ldr h21, [x22], #0x2\n" + "ldr h26, [x21], #0x2\n" + "ldr h25, [x20], #0x2\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 9f\n" + "ld1 { v30.b }[2], [x27]\n" + "ld1 { v29.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v27.b }[2], [x24]\n" + "ld1 { v23.b }[2], [x23]\n" + "ld1 { v21.b }[2], [x22]\n" + "ld1 { v26.b }[2], [x21]\n" + "ld1 { v25.b }[2], [x20]\n" + "mov x19, #0x3\n" + "b 9f\n" + "8:" // odd_loads_1_0 + "ldr b30, [x27, #0x0]\n" + "ldr b29, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "ldr b27, [x24, #0x0]\n" + "ldr b23, [x23, #0x0]\n" + "ldr b21, [x22, #0x0]\n" + "ldr b26, [x21, #0x0]\n" + "ldr b25, [x20, #0x0]\n" + "mov x19, #0x1\n" + "9:" // Odd load end + "ushll v30.8h, v30.8b, #0x0\n" + "ushll v29.8h, v29.8b, #0x0\n" + "ushll v28.8h, v28.8b, #0x0\n" + "ushll v27.8h, v27.8b, #0x0\n" + "ushll v23.8h, v23.8b, #0x0\n" + "zip1 v24.8h, v30.8h, v23.8h\n" + "ushll v21.8h, v21.8b, #0x0\n" + "ushll v26.8h, v26.8b, #0x0\n" + "zip1 v20.8h, v28.8h, v26.8h\n" + "ushll v25.8h, v25.8b, #0x0\n" + "zip1 v22.8h, v29.8h, v21.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v18.8h, v24.8h, v20.8h\n" + "zip1 v19.8h, v27.8h, v25.8h\n" + "zip1 v17.8h, v22.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add 
v1.8h, v1.8h, v16.8h\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "add v1.8h, v1.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v18.8h, v24.8h, v20.8h\n" + "zip2 v17.8h, v22.8h, v19.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v16.8h\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "add v1.8h, v1.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v23.8h, v30.8h, v23.8h\n" + "zip2 v20.8h, v28.8h, v26.8h\n" + "subs x19, x19, #0x1\n" + "zip1 v18.8h, v23.8h, v20.8h\n" + "zip2 v21.8h, v29.8h, v21.8h\n" + "zip2 v19.8h, v27.8h, v25.8h\n" + "zip1 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v16.8h\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v16.8h, v18.8h, v17.8h\n" + "subs x19, x19, #0x1\n" + "add v1.8h, v1.8h, v16.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "beq 10f\n" + "zip2 v18.8h, v23.8h, v20.8h\n" + "zip2 v17.8h, v21.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v17.8h\n" + "str q16, [%x[out_ptr], #0x0]\n" + "add v1.8h, v1.8h, v16.8h\n" + "add %x[out_ptr], %x[out_ptr], #0x10\n" + "10:" // Odds skip + "uaddw v0.4s, v0.4s, v1.4h\n" + "str q0, [%x[out_ptr], #0x0]\n" + "uaddw2 v31.4s, v31.4s, v1.8h\n" + "str q31, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp new file mode 100644 index 0000000000..5377edc1e1 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifdef __aarch64__ + +template<> +void interleave_block<8, 2, VLType::None, false>( + bfloat16 * &out_ptr, const bfloat16 * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + __asm__ __volatile__( + "ldr x27, [%x[in], #0x0]\n" + "cmp %x[height], #0x8\n" + "ldr x26, [%x[in], #0x8]\n" + "add x27, x27, %x[row_offset], LSL #1\n" + "ldr x25, [%x[in], #0x10]\n" + "ldr x24, [%x[in], #0x18]\n" + "add x26, x26, %x[row_offset], LSL #1\n" + "ldr x23, [%x[in], #0x20]\n" + "add x25, x25, %x[row_offset], LSL #1\n" + "ldr x22, [%x[in], #0x28]\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset], LSL #1\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset], LSL #1\n" + "add x22, x22, %x[row_offset], LSL #1\n" + "add x21, x21, %x[row_offset], LSL #1\n" + "add x20, x20, %x[row_offset], LSL #1\n" + "beq 1f\n" + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x27, #0x0]\n" + "cmp %x[width], #0x8\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "blt 3f\n" + "2:" // Main loop head + "ldr q28, [x27], #0x10\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr q27, [x26], #0x10\n" + "ldr q26, [x25], #0x10\n" + "zip1 v23.4s, v28.4s, v26.4s\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr q22, [x24], #0x10\n" + "zip2 v26.4s, v28.4s, v26.4s\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr q25, [x23], #0x10\n" + "zip1 v20.4s, v27.4s, v22.4s\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr q24, [x22], #0x10\n" + "zip1 v16.4s, v23.4s, v20.4s\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q19, [x21], #0x10\n" + "zip2 v23.4s, v23.4s, v20.4s\n" + "prfm pldl1keep, [x22, #0x70]\n" + "zip2 v22.4s, v27.4s, v22.4s\n" + "ldr q21, [x20], #0x10\n" + "zip1 v18.4s, v25.4s, v19.4s\n" + "prfm pldl1keep, [x21, #0x70]\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v20.4s, v26.4s, v22.4s\n" + "prfm pldl1keep, [x20, #0x70]\n" + "zip1 v16.4s, v24.4s, v21.4s\n" + "subs %x[width], %x[width], #0x8\n" + "zip1 v17.4s, v18.4s, v16.4s\n" + "cmp %x[width], #0x8\n" + "zip2 v16.4s, v18.4s, v16.4s\n" + "str q17, [%x[out_ptr], #0x10]\n" + "zip2 v19.4s, v25.4s, v19.4s\n" + "str q23, [%x[out_ptr], #0x20]\n" + "zip2 v18.4s, v24.4s, v21.4s\n" + "str q16, [%x[out_ptr], #0x30]\n" + "zip1 v16.4s, v19.4s, v18.4s\n" + "str q20, [%x[out_ptr], #0x40]\n" + "zip2 v17.4s, v26.4s, v22.4s\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v16.4s, v19.4s, v18.4s\n" + "str q17, [%x[out_ptr], #0x60]\n" + "str q16, [%x[out_ptr], #0x70]\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "bge 2b\n" + "3:" // Main loop skip + "cbz %x[width], 8f\n" + 
"tbz %x[width], #2, 5f\n" + "ldr d28, [x27], #0x8\n" + "ldr d27, [x26], #0x8\n" + "ldr d26, [x25], #0x8\n" + "ldr d22, [x24], #0x8\n" + "ldr d25, [x23], #0x8\n" + "ldr d24, [x22], #0x8\n" + "ldr d19, [x21], #0x8\n" + "ldr d21, [x20], #0x8\n" + "tbz %x[width], #1, 4f\n" + "ld1 { v28.s }[2], [x27], #0x4\n" + "ld1 { v27.s }[2], [x26], #0x4\n" + "ld1 { v26.s }[2], [x25], #0x4\n" + "ld1 { v22.s }[2], [x24], #0x4\n" + "ld1 { v25.s }[2], [x23], #0x4\n" + "ld1 { v24.s }[2], [x22], #0x4\n" + "ld1 { v19.s }[2], [x21], #0x4\n" + "ld1 { v21.s }[2], [x20], #0x4\n" + "mov x19, #0x3\n" + "tbz %x[width], #0, 7f\n" + "ld1 { v28.h }[6], [x27]\n" + "ld1 { v27.h }[6], [x26]\n" + "ld1 { v26.h }[6], [x25]\n" + "ld1 { v22.h }[6], [x24]\n" + "ld1 { v25.h }[6], [x23]\n" + "ld1 { v24.h }[6], [x22]\n" + "ld1 { v19.h }[6], [x21]\n" + "ld1 { v21.h }[6], [x20]\n" + "mov x19, #0x4\n" + "b 7f\n" + "4:" // odd_loads_1_4 + "mov x19, #0x2\n" + "tbz %x[width], #0, 7f\n" + "ld1 { v28.h }[4], [x27]\n" + "ld1 { v27.h }[4], [x26]\n" + "ld1 { v26.h }[4], [x25]\n" + "ld1 { v22.h }[4], [x24]\n" + "ld1 { v25.h }[4], [x23]\n" + "ld1 { v24.h }[4], [x22]\n" + "ld1 { v19.h }[4], [x21]\n" + "ld1 { v21.h }[4], [x20]\n" + "mov x19, #0x3\n" + "b 7f\n" + "5:" // odd_loads_2_0 + "tbz %x[width], #1, 6f\n" + "ldr s28, [x27], #0x4\n" + "ldr s27, [x26], #0x4\n" + "ldr s26, [x25], #0x4\n" + "ldr s22, [x24], #0x4\n" + "ldr s25, [x23], #0x4\n" + "ldr s24, [x22], #0x4\n" + "ldr s19, [x21], #0x4\n" + "ldr s21, [x20], #0x4\n" + "mov x19, #0x1\n" + "tbz %x[width], #0, 7f\n" + "ld1 { v28.h }[2], [x27]\n" + "ld1 { v27.h }[2], [x26]\n" + "ld1 { v26.h }[2], [x25]\n" + "ld1 { v22.h }[2], [x24]\n" + "ld1 { v25.h }[2], [x23]\n" + "ld1 { v24.h }[2], [x22]\n" + "ld1 { v19.h }[2], [x21]\n" + "ld1 { v21.h }[2], [x20]\n" + "mov x19, #0x2\n" + "b 7f\n" + "6:" // odd_loads_1_0 + "ldr h28, [x27, #0x0]\n" + "ldr h27, [x26, #0x0]\n" + "ldr h26, [x25, #0x0]\n" + "ldr h22, [x24, #0x0]\n" + "ldr h25, [x23, #0x0]\n" + "ldr h24, [x22, #0x0]\n" + "ldr h19, [x21, #0x0]\n" + "ldr h21, [x20, #0x0]\n" + "mov x19, #0x1\n" + "7:" // Odd load end + "zip1 v23.4s, v28.4s, v26.4s\n" + "subs x19, x19, #0x1\n" + "zip1 v20.4s, v27.4s, v22.4s\n" + "zip1 v16.4s, v23.4s, v20.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v18.4s, v25.4s, v19.4s\n" + "zip1 v16.4s, v24.4s, v21.4s\n" + "zip1 v17.4s, v18.4s, v16.4s\n" + "str q17, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 8f\n" + "zip2 v23.4s, v23.4s, v20.4s\n" + "zip2 v16.4s, v18.4s, v16.4s\n" + "str q23, [%x[out_ptr], #0x0]\n" + "str q16, [%x[out_ptr], #0x10]\n" + "subs x19, x19, #0x1\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 8f\n" + "zip2 v26.4s, v28.4s, v26.4s\n" + "zip2 v22.4s, v27.4s, v22.4s\n" + "subs x19, x19, #0x1\n" + "zip1 v20.4s, v26.4s, v22.4s\n" + "str q20, [%x[out_ptr], #0x0]\n" + "zip2 v19.4s, v25.4s, v19.4s\n" + "zip2 v18.4s, v24.4s, v21.4s\n" + "zip1 v16.4s, v19.4s, v18.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 8f\n" + "zip2 v17.4s, v26.4s, v22.4s\n" + "zip2 v16.4s, v19.4s, v18.4s\n" + "str q17, [%x[out_ptr], #0x0]\n" + "str q16, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "8:" // Odds skip + + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // 
__aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp new file mode 100644 index 0000000000..3aea6a8999 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifdef __aarch64__ + +template<> +void interleave_block<8, 2, VLType::None, false>( + float * &out_ptr, const float * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + __asm__ __volatile__( + "ldr x27, [%x[in], #0x0]\n" + "cmp %x[height], #0x8\n" + "ldr x26, [%x[in], #0x8]\n" + "add x27, x27, %x[row_offset], LSL #2\n" + "ldr x25, [%x[in], #0x10]\n" + "ldr x24, [%x[in], #0x18]\n" + "add x26, x26, %x[row_offset], LSL #2\n" + "ldr x23, [%x[in], #0x20]\n" + "add x25, x25, %x[row_offset], LSL #2\n" + "ldr x22, [%x[in], #0x28]\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset], LSL #2\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset], LSL #2\n" + "add x22, x22, %x[row_offset], LSL #2\n" + "add x21, x21, %x[row_offset], LSL #2\n" + "add x20, x20, %x[row_offset], LSL #2\n" + "beq 1f\n" + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x27, #0x0]\n" + "cmp %x[width], #0x4\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "blt 3f\n" + "2:" // Main loop head + "ldr q27, [x27], #0x10\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr q24, [x26], #0x10\n" + "zip1 v26.2d, v27.2d, v24.2d\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr q25, [x25], #0x10\n" + "zip2 v24.2d, v27.2d, v24.2d\n" + "prfm 
pldl1keep, [x25, #0x70]\n" + "ldr q21, [x24], #0x10\n" + "zip1 v23.2d, v25.2d, v21.2d\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr q22, [x23], #0x10\n" + "zip2 v21.2d, v25.2d, v21.2d\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q18, [x22], #0x10\n" + "zip1 v20.2d, v22.2d, v18.2d\n" + "prfm pldl1keep, [x22, #0x70]\n" + "ldr q19, [x21], #0x10\n" + "zip2 v18.2d, v22.2d, v18.2d\n" + "prfm pldl1keep, [x21, #0x70]\n" + "ldr q16, [x20], #0x10\n" + "zip1 v17.2d, v19.2d, v16.2d\n" + "prfm pldl1keep, [x20, #0x70]\n" + "str q26, [%x[out_ptr], #0x0]\n" + "zip2 v16.2d, v19.2d, v16.2d\n" + "str q23, [%x[out_ptr], #0x10]\n" + "str q20, [%x[out_ptr], #0x20]\n" + "str q17, [%x[out_ptr], #0x30]\n" + "str q24, [%x[out_ptr], #0x40]\n" + "str q21, [%x[out_ptr], #0x50]\n" + "str q18, [%x[out_ptr], #0x60]\n" + "str q16, [%x[out_ptr], #0x70]\n" + "subs %x[width], %x[width], #0x4\n" + "cmp %x[width], #0x4\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "bge 2b\n" + "3:" // Main loop skip + "cbz %x[width], 6f\n" + "tbz %x[width], #1, 4f\n" + "ldr d27, [x27], #0x8\n" + "ldr d24, [x26], #0x8\n" + "ldr d25, [x25], #0x8\n" + "ldr d21, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "ldr d18, [x22], #0x8\n" + "ldr d19, [x21], #0x8\n" + "ldr d16, [x20], #0x8\n" + "mov x19, #0x1\n" + "tbz %x[width], #0, 5f\n" + "ld1 { v27.s }[2], [x27]\n" + "ld1 { v24.s }[2], [x26]\n" + "ld1 { v25.s }[2], [x25]\n" + "ld1 { v21.s }[2], [x24]\n" + "ld1 { v22.s }[2], [x23]\n" + "ld1 { v18.s }[2], [x22]\n" + "ld1 { v19.s }[2], [x21]\n" + "ld1 { v16.s }[2], [x20]\n" + "mov x19, #0x2\n" + "b 5f\n" + "4:" // odd_loads_1_0 + "ldr s27, [x27, #0x0]\n" + "ldr s24, [x26, #0x0]\n" + "ldr s25, [x25, #0x0]\n" + "ldr s21, [x24, #0x0]\n" + "ldr s22, [x23, #0x0]\n" + "ldr s18, [x22, #0x0]\n" + "ldr s19, [x21, #0x0]\n" + "ldr s16, [x20, #0x0]\n" + "mov x19, #0x1\n" + "5:" // Odd load end + "zip1 v26.2d, v27.2d, v24.2d\n" + "subs x19, x19, #0x1\n" + "zip1 v23.2d, v25.2d, v21.2d\n" + "str q26, [%x[out_ptr], #0x0]\n" + "zip1 v20.2d, v22.2d, v18.2d\n" + "str q23, [%x[out_ptr], #0x10]\n" + "zip1 v17.2d, v19.2d, v16.2d\n" + "str q20, [%x[out_ptr], #0x20]\n" + "str q17, [%x[out_ptr], #0x30]\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "beq 6f\n" + "zip2 v24.2d, v27.2d, v24.2d\n" + "zip2 v21.2d, v25.2d, v21.2d\n" + "str q24, [%x[out_ptr], #0x0]\n" + "zip2 v18.2d, v22.2d, v18.2d\n" + "str q21, [%x[out_ptr], #0x10]\n" + "zip2 v16.2d, v19.2d, v16.2d\n" + "str q18, [%x[out_ptr], #0x20]\n" + "str q16, [%x[out_ptr], #0x30]\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "6:" // Odds skip + + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp new file mode 100644 index 0000000000..4780b77a4a --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifdef __aarch64__ + +template<> +void interleave_block<8, 4, VLType::None, false>( + bfloat16 * &out_ptr, const bfloat16 * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + __asm__ __volatile__( + "ldr x27, [%x[in], #0x0]\n" + "cmp %x[height], #0x8\n" + "ldr x26, [%x[in], #0x8]\n" + "add x27, x27, %x[row_offset], LSL #1\n" + "ldr x25, [%x[in], #0x10]\n" + "ldr x24, [%x[in], #0x18]\n" + "add x26, x26, %x[row_offset], LSL #1\n" + "ldr x23, [%x[in], #0x20]\n" + "add x25, x25, %x[row_offset], LSL #1\n" + "ldr x22, [%x[in], #0x28]\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset], LSL #1\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset], LSL #1\n" + "add x22, x22, %x[row_offset], LSL #1\n" + "add x21, x21, %x[row_offset], LSL #1\n" + "add x20, x20, %x[row_offset], LSL #1\n" + "beq 1f\n" + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x27, #0x0]\n" + "cmp %x[width], #0x8\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "blt 3f\n" + "2:" // Main loop head + "ldr q27, [x27], #0x10\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr q24, [x26], #0x10\n" + "zip1 v26.2d, v27.2d, v24.2d\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr q25, [x25], #0x10\n" + "zip2 v24.2d, v27.2d, v24.2d\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr q21, [x24], #0x10\n" + "zip1 v23.2d, v25.2d, v21.2d\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr q22, [x23], #0x10\n" + "zip2 v21.2d, v25.2d, v21.2d\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q18, [x22], #0x10\n" + "zip1 v20.2d, v22.2d, v18.2d\n" + "prfm pldl1keep, [x22, #0x70]\n" + "ldr q19, [x21], #0x10\n" + "zip2 v18.2d, v22.2d, v18.2d\n" + "prfm pldl1keep, [x21, #0x70]\n" + 
"ldr q16, [x20], #0x10\n" + "zip1 v17.2d, v19.2d, v16.2d\n" + "prfm pldl1keep, [x20, #0x70]\n" + "str q26, [%x[out_ptr], #0x0]\n" + "zip2 v16.2d, v19.2d, v16.2d\n" + "str q23, [%x[out_ptr], #0x10]\n" + "str q20, [%x[out_ptr], #0x20]\n" + "str q17, [%x[out_ptr], #0x30]\n" + "str q24, [%x[out_ptr], #0x40]\n" + "str q21, [%x[out_ptr], #0x50]\n" + "str q18, [%x[out_ptr], #0x60]\n" + "str q16, [%x[out_ptr], #0x70]\n" + "subs %x[width], %x[width], #0x8\n" + "cmp %x[width], #0x8\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "bge 2b\n" + "3:" // Main loop skip + "cbz %x[width], 8f\n" + "tbz %x[width], #2, 5f\n" + "ldr d27, [x27], #0x8\n" + "ldr d24, [x26], #0x8\n" + "ldr d25, [x25], #0x8\n" + "ldr d21, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "ldr d18, [x22], #0x8\n" + "ldr d19, [x21], #0x8\n" + "ldr d16, [x20], #0x8\n" + "tbz %x[width], #1, 4f\n" + "ld1 { v27.s }[2], [x27], #0x4\n" + "ld1 { v24.s }[2], [x26], #0x4\n" + "ld1 { v25.s }[2], [x25], #0x4\n" + "ld1 { v21.s }[2], [x24], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" + "ld1 { v18.s }[2], [x22], #0x4\n" + "ld1 { v19.s }[2], [x21], #0x4\n" + "ld1 { v16.s }[2], [x20], #0x4\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 7f\n" + "ld1 { v27.h }[6], [x27]\n" + "ld1 { v24.h }[6], [x26]\n" + "ld1 { v25.h }[6], [x25]\n" + "ld1 { v21.h }[6], [x24]\n" + "ld1 { v22.h }[6], [x23]\n" + "ld1 { v18.h }[6], [x22]\n" + "ld1 { v19.h }[6], [x21]\n" + "ld1 { v16.h }[6], [x20]\n" + "b 7f\n" + "4:" // odd_loads_1_4 + "mov x19, #0x1\n" + "tbz %x[width], #0, 7f\n" + "ld1 { v27.h }[4], [x27]\n" + "ld1 { v24.h }[4], [x26]\n" + "ld1 { v25.h }[4], [x25]\n" + "ld1 { v21.h }[4], [x24]\n" + "ld1 { v22.h }[4], [x23]\n" + "ld1 { v18.h }[4], [x22]\n" + "ld1 { v19.h }[4], [x21]\n" + "ld1 { v16.h }[4], [x20]\n" + "mov x19, #0x2\n" + "b 7f\n" + "5:" // odd_loads_2_0 + "tbz %x[width], #1, 6f\n" + "ldr s27, [x27], #0x4\n" + "ldr s24, [x26], #0x4\n" + "ldr s25, [x25], #0x4\n" + "ldr s21, [x24], #0x4\n" + "ldr s22, [x23], #0x4\n" + "ldr s18, [x22], #0x4\n" + "ldr s19, [x21], #0x4\n" + "ldr s16, [x20], #0x4\n" + "mov x19, #0x1\n" + "tbz %x[width], #0, 7f\n" + "ld1 { v27.h }[2], [x27]\n" + "ld1 { v24.h }[2], [x26]\n" + "ld1 { v25.h }[2], [x25]\n" + "ld1 { v21.h }[2], [x24]\n" + "ld1 { v22.h }[2], [x23]\n" + "ld1 { v18.h }[2], [x22]\n" + "ld1 { v19.h }[2], [x21]\n" + "ld1 { v16.h }[2], [x20]\n" + "b 7f\n" + "6:" // odd_loads_1_0 + "ldr h27, [x27, #0x0]\n" + "ldr h24, [x26, #0x0]\n" + "ldr h25, [x25, #0x0]\n" + "ldr h21, [x24, #0x0]\n" + "ldr h22, [x23, #0x0]\n" + "ldr h18, [x22, #0x0]\n" + "ldr h19, [x21, #0x0]\n" + "ldr h16, [x20, #0x0]\n" + "mov x19, #0x1\n" + "7:" // Odd load end + "zip1 v26.2d, v27.2d, v24.2d\n" + "subs x19, x19, #0x1\n" + "zip1 v23.2d, v25.2d, v21.2d\n" + "str q26, [%x[out_ptr], #0x0]\n" + "zip1 v20.2d, v22.2d, v18.2d\n" + "str q23, [%x[out_ptr], #0x10]\n" + "zip1 v17.2d, v19.2d, v16.2d\n" + "str q20, [%x[out_ptr], #0x20]\n" + "str q17, [%x[out_ptr], #0x30]\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "beq 8f\n" + "zip2 v24.2d, v27.2d, v24.2d\n" + "zip2 v21.2d, v25.2d, v21.2d\n" + "str q24, [%x[out_ptr], #0x0]\n" + "zip2 v18.2d, v22.2d, v18.2d\n" + "str q21, [%x[out_ptr], #0x10]\n" + "zip2 v16.2d, v19.2d, v16.2d\n" + "str q18, [%x[out_ptr], #0x20]\n" + "str q16, [%x[out_ptr], #0x30]\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "8:" // Odds skip + + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", 
"v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp new file mode 100644 index 0000000000..a9034f5742 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp @@ -0,0 +1,343 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifdef __aarch64__ + +template<> +void interleave_block<8, 4, VLType::None, false>( + int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + __asm__ __volatile__( + "ldr x27, [%x[in], #0x0]\n" + "cmp %x[height], #0x8\n" + "ldr x26, [%x[in], #0x8]\n" + "add x27, x27, %x[row_offset]\n" + "ldr x25, [%x[in], #0x10]\n" + "ldr x24, [%x[in], #0x18]\n" + "add x26, x26, %x[row_offset]\n" + "ldr x23, [%x[in], #0x20]\n" + "add x25, x25, %x[row_offset]\n" + "ldr x22, [%x[in], #0x28]\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset]\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset]\n" + "add x22, x22, %x[row_offset]\n" + "add x21, x21, %x[row_offset]\n" + "add x20, x20, %x[row_offset]\n" + "beq 1f\n" + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x27, #0x0]\n" + "cmp %x[width], #0x10\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "blt 3f\n" + "2:" // Main loop head + "ldr q28, [x27], #0x10\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr q27, [x26], #0x10\n" + "ldr q26, [x25], #0x10\n" + "zip1 v23.4s, v28.4s, v26.4s\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr q22, 
[x24], #0x10\n" + "zip2 v26.4s, v28.4s, v26.4s\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr q25, [x23], #0x10\n" + "zip1 v20.4s, v27.4s, v22.4s\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr q24, [x22], #0x10\n" + "zip1 v16.4s, v23.4s, v20.4s\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q19, [x21], #0x10\n" + "zip2 v23.4s, v23.4s, v20.4s\n" + "prfm pldl1keep, [x22, #0x70]\n" + "zip2 v22.4s, v27.4s, v22.4s\n" + "ldr q21, [x20], #0x10\n" + "zip1 v18.4s, v25.4s, v19.4s\n" + "prfm pldl1keep, [x21, #0x70]\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v20.4s, v26.4s, v22.4s\n" + "prfm pldl1keep, [x20, #0x70]\n" + "zip1 v16.4s, v24.4s, v21.4s\n" + "subs %x[width], %x[width], #0x10\n" + "zip1 v17.4s, v18.4s, v16.4s\n" + "cmp %x[width], #0x10\n" + "zip2 v16.4s, v18.4s, v16.4s\n" + "str q17, [%x[out_ptr], #0x10]\n" + "zip2 v19.4s, v25.4s, v19.4s\n" + "str q23, [%x[out_ptr], #0x20]\n" + "zip2 v18.4s, v24.4s, v21.4s\n" + "str q16, [%x[out_ptr], #0x30]\n" + "zip1 v16.4s, v19.4s, v18.4s\n" + "str q20, [%x[out_ptr], #0x40]\n" + "zip2 v17.4s, v26.4s, v22.4s\n" + "str q16, [%x[out_ptr], #0x50]\n" + "zip2 v16.4s, v19.4s, v18.4s\n" + "str q17, [%x[out_ptr], #0x60]\n" + "str q16, [%x[out_ptr], #0x70]\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "bge 2b\n" + "3:" // Main loop skip + "cbz %x[width], 12f\n" + "tbz %x[width], #3, 7f\n" + "ldr d28, [x27], #0x8\n" + "ldr d27, [x26], #0x8\n" + "ldr d26, [x25], #0x8\n" + "ldr d22, [x24], #0x8\n" + "ldr d25, [x23], #0x8\n" + "ldr d24, [x22], #0x8\n" + "ldr d19, [x21], #0x8\n" + "ldr d21, [x20], #0x8\n" + "tbz %x[width], #2, 5f\n" + "ld1 { v28.s }[2], [x27], #0x4\n" + "ld1 { v27.s }[2], [x26], #0x4\n" + "ld1 { v26.s }[2], [x25], #0x4\n" + "ld1 { v22.s }[2], [x24], #0x4\n" + "ld1 { v25.s }[2], [x23], #0x4\n" + "ld1 { v24.s }[2], [x22], #0x4\n" + "ld1 { v19.s }[2], [x21], #0x4\n" + "ld1 { v21.s }[2], [x20], #0x4\n" + "tbz %x[width], #1, 4f\n" + "ld1 { v28.h }[6], [x27], #0x2\n" + "ld1 { v27.h }[6], [x26], #0x2\n" + "ld1 { v26.h }[6], [x25], #0x2\n" + "ld1 { v22.h }[6], [x24], #0x2\n" + "ld1 { v25.h }[6], [x23], #0x2\n" + "ld1 { v24.h }[6], [x22], #0x2\n" + "ld1 { v19.h }[6], [x21], #0x2\n" + "ld1 { v21.h }[6], [x20], #0x2\n" + "mov x19, #0x4\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v28.b }[14], [x27]\n" + "ld1 { v27.b }[14], [x26]\n" + "ld1 { v26.b }[14], [x25]\n" + "ld1 { v22.b }[14], [x24]\n" + "ld1 { v25.b }[14], [x23]\n" + "ld1 { v24.b }[14], [x22]\n" + "ld1 { v19.b }[14], [x21]\n" + "ld1 { v21.b }[14], [x20]\n" + "b 11f\n" + "4:" // odd_loads_1_12 + "mov x19, #0x3\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v28.b }[12], [x27]\n" + "ld1 { v27.b }[12], [x26]\n" + "ld1 { v26.b }[12], [x25]\n" + "ld1 { v22.b }[12], [x24]\n" + "ld1 { v25.b }[12], [x23]\n" + "ld1 { v24.b }[12], [x22]\n" + "ld1 { v19.b }[12], [x21]\n" + "ld1 { v21.b }[12], [x20]\n" + "mov x19, #0x4\n" + "b 11f\n" + "5:" // odd_loads_2_8 + "tbz %x[width], #1, 6f\n" + "ld1 { v28.h }[4], [x27], #0x2\n" + "ld1 { v27.h }[4], [x26], #0x2\n" + "ld1 { v26.h }[4], [x25], #0x2\n" + "ld1 { v22.h }[4], [x24], #0x2\n" + "ld1 { v25.h }[4], [x23], #0x2\n" + "ld1 { v24.h }[4], [x22], #0x2\n" + "ld1 { v19.h }[4], [x21], #0x2\n" + "ld1 { v21.h }[4], [x20], #0x2\n" + "mov x19, #0x3\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v28.b }[10], [x27]\n" + "ld1 { v27.b }[10], [x26]\n" + "ld1 { v26.b }[10], [x25]\n" + "ld1 { v22.b }[10], [x24]\n" + "ld1 { v25.b }[10], [x23]\n" + "ld1 { v24.b }[10], [x22]\n" + "ld1 { v19.b }[10], [x21]\n" + "ld1 { v21.b }[10], [x20]\n" + "b 11f\n" + "6:" // odd_loads_1_8 + "mov x19, #0x2\n" + "tbz 
%x[width], #0, 11f\n" + "ld1 { v28.b }[8], [x27]\n" + "ld1 { v27.b }[8], [x26]\n" + "ld1 { v26.b }[8], [x25]\n" + "ld1 { v22.b }[8], [x24]\n" + "ld1 { v25.b }[8], [x23]\n" + "ld1 { v24.b }[8], [x22]\n" + "ld1 { v19.b }[8], [x21]\n" + "ld1 { v21.b }[8], [x20]\n" + "mov x19, #0x3\n" + "b 11f\n" + "7:" // odd_loads_4_0 + "tbz %x[width], #2, 9f\n" + "ldr s28, [x27], #0x4\n" + "ldr s27, [x26], #0x4\n" + "ldr s26, [x25], #0x4\n" + "ldr s22, [x24], #0x4\n" + "ldr s25, [x23], #0x4\n" + "ldr s24, [x22], #0x4\n" + "ldr s19, [x21], #0x4\n" + "ldr s21, [x20], #0x4\n" + "tbz %x[width], #1, 8f\n" + "ld1 { v28.h }[2], [x27], #0x2\n" + "ld1 { v27.h }[2], [x26], #0x2\n" + "ld1 { v26.h }[2], [x25], #0x2\n" + "ld1 { v22.h }[2], [x24], #0x2\n" + "ld1 { v25.h }[2], [x23], #0x2\n" + "ld1 { v24.h }[2], [x22], #0x2\n" + "ld1 { v19.h }[2], [x21], #0x2\n" + "ld1 { v21.h }[2], [x20], #0x2\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v28.b }[6], [x27]\n" + "ld1 { v27.b }[6], [x26]\n" + "ld1 { v26.b }[6], [x25]\n" + "ld1 { v22.b }[6], [x24]\n" + "ld1 { v25.b }[6], [x23]\n" + "ld1 { v24.b }[6], [x22]\n" + "ld1 { v19.b }[6], [x21]\n" + "ld1 { v21.b }[6], [x20]\n" + "b 11f\n" + "8:" // odd_loads_1_4 + "mov x19, #0x1\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v28.b }[4], [x27]\n" + "ld1 { v27.b }[4], [x26]\n" + "ld1 { v26.b }[4], [x25]\n" + "ld1 { v22.b }[4], [x24]\n" + "ld1 { v25.b }[4], [x23]\n" + "ld1 { v24.b }[4], [x22]\n" + "ld1 { v19.b }[4], [x21]\n" + "ld1 { v21.b }[4], [x20]\n" + "mov x19, #0x2\n" + "b 11f\n" + "9:" // odd_loads_2_0 + "tbz %x[width], #1, 10f\n" + "ldr h28, [x27], #0x2\n" + "ldr h27, [x26], #0x2\n" + "ldr h26, [x25], #0x2\n" + "ldr h22, [x24], #0x2\n" + "ldr h25, [x23], #0x2\n" + "ldr h24, [x22], #0x2\n" + "ldr h19, [x21], #0x2\n" + "ldr h21, [x20], #0x2\n" + "mov x19, #0x1\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v28.b }[2], [x27]\n" + "ld1 { v27.b }[2], [x26]\n" + "ld1 { v26.b }[2], [x25]\n" + "ld1 { v22.b }[2], [x24]\n" + "ld1 { v25.b }[2], [x23]\n" + "ld1 { v24.b }[2], [x22]\n" + "ld1 { v19.b }[2], [x21]\n" + "ld1 { v21.b }[2], [x20]\n" + "b 11f\n" + "10:" // odd_loads_1_0 + "ldr b28, [x27, #0x0]\n" + "ldr b27, [x26, #0x0]\n" + "ldr b26, [x25, #0x0]\n" + "ldr b22, [x24, #0x0]\n" + "ldr b25, [x23, #0x0]\n" + "ldr b24, [x22, #0x0]\n" + "ldr b19, [x21, #0x0]\n" + "ldr b21, [x20, #0x0]\n" + "mov x19, #0x1\n" + "11:" // Odd load end + "zip1 v23.4s, v28.4s, v26.4s\n" + "subs x19, x19, #0x1\n" + "zip1 v20.4s, v27.4s, v22.4s\n" + "zip1 v16.4s, v23.4s, v20.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v18.4s, v25.4s, v19.4s\n" + "zip1 v16.4s, v24.4s, v21.4s\n" + "zip1 v17.4s, v18.4s, v16.4s\n" + "str q17, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 12f\n" + "zip2 v23.4s, v23.4s, v20.4s\n" + "zip2 v16.4s, v18.4s, v16.4s\n" + "str q23, [%x[out_ptr], #0x0]\n" + "str q16, [%x[out_ptr], #0x10]\n" + "subs x19, x19, #0x1\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 12f\n" + "zip2 v26.4s, v28.4s, v26.4s\n" + "zip2 v22.4s, v27.4s, v22.4s\n" + "subs x19, x19, #0x1\n" + "zip1 v20.4s, v26.4s, v22.4s\n" + "str q20, [%x[out_ptr], #0x0]\n" + "zip2 v19.4s, v25.4s, v19.4s\n" + "zip2 v18.4s, v24.4s, v21.4s\n" + "zip1 v16.4s, v19.4s, v18.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 12f\n" + "zip2 v17.4s, v26.4s, v22.4s\n" + "zip2 v16.4s, v19.4s, v18.4s\n" + "str q17, [%x[out_ptr], #0x0]\n" + "str q16, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "12:" // Odds skip + + : [out_ptr] "+r" (out_ptr), 
[width] "+r" (width) + : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + +template<> +void interleave_block<8, 4, VLType::None, false>( + uint8_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + int8_t * &out_cast = reinterpret_cast(out_ptr); + const int8_t * const * in_cast = reinterpret_cast(in); + + interleave_block<8, 4, VLType::None, false>(out_cast, in_cast, width, height, row_offset, false); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp new file mode 100644 index 0000000000..2831cb79a6 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifdef __aarch64__ + +template<> +void interleave_block<8, 4, VLType::None, true>( + int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, + size_t row_offset, bool first +) +{ + __asm__ __volatile__( + "movi v1.8h, #0x0\n" + "ldr x27, [%x[in], #0x0]\n" + "mov x19, #0x0\n" + "movi v0.8h, #0x0\n" + "ldr x26, [%x[in], #0x8]\n" + "cmp %x[height], #0x8\n" + "movi v31.4s, #0x0\n" + "ldr x25, [%x[in], #0x10]\n" + "add x27, x27, %x[row_offset]\n" + "movi v30.4s, #0x0\n" + "ldr x24, [%x[in], #0x18]\n" + "ldr x23, [%x[in], #0x20]\n" + "add x26, x26, %x[row_offset]\n" + "ldr x22, [%x[in], #0x28]\n" + "add x25, x25, %x[row_offset]\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset]\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset]\n" + "add x22, x22, %x[row_offset]\n" + "add x21, x21, %x[row_offset]\n" + "add x20, x20, %x[row_offset]\n" + "beq 1f\n" + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x27, #0x0]\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "cbnz %w[first], 2f\n" + "sub %x[out_ptr], %x[out_ptr], #0x20\n" + "ld1 { v31.4s }, [%x[out_ptr]]\n" + "ldr q30, [%x[out_ptr], #0x10]\n" + "2:" // first_pass + "cmp %x[width], #0x10\n" + "blt 5f\n" + "3:" // Main loop head + "cmp x19, #0x1e\n" + "ble 4f\n" + "sadalp v31.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "sadalp v30.4s, v0.8h\n" + "movi v0.8h, #0x0\n" + "mov x19, #0x0\n" + "4:" // no_accumulate_16 + "ldr q29, [x27], #0x10\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr q28, [x26], #0x10\n" + "ldr q27, [x25], #0x10\n" + "zip1 v23.4s, v29.4s, v27.4s\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr q21, [x24], #0x10\n" + "zip2 v27.4s, v29.4s, v27.4s\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr q26, [x23], #0x10\n" + "zip1 v20.4s, v28.4s, v21.4s\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr q25, [x22], #0x10\n" + "zip1 v16.4s, v23.4s, v20.4s\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q19, [x21], #0x10\n" + "zip2 v24.4s, v23.4s, v20.4s\n" + "prfm pldl1keep, [x22, #0x70]\n" + "zip2 v23.4s, v28.4s, v21.4s\n" + "ldr q22, [x20], #0x10\n" + "zip1 v18.4s, v26.4s, v19.4s\n" + "prfm pldl1keep, [x21, #0x70]\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v21.4s, v27.4s, v23.4s\n" + "prfm pldl1keep, [x20, #0x70]\n" + "zip1 v17.4s, v25.4s, v22.4s\n" + "sadalp v1.8h, v16.16b\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "add x19, x19, #0x1\n" + "zip2 v20.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v19.4s, v26.4s, v19.4s\n" + "sadalp v0.8h, v16.16b\n" + "zip2 v16.4s, v25.4s, v22.4s\n" + "str q24, [%x[out_ptr], #0x20]\n" + "zip1 v18.4s, v19.4s, v16.4s\n" + "sadalp v1.8h, v24.16b\n" + "zip2 v17.4s, v27.4s, v23.4s\n" + "str q20, [%x[out_ptr], #0x30]\n" + "zip2 v16.4s, v19.4s, v16.4s\n" + "str q21, [%x[out_ptr], #0x40]\n" + "str q18, [%x[out_ptr], #0x50]\n" + "sadalp 
v0.8h, v20.16b\n" + "str q17, [%x[out_ptr], #0x60]\n" + "sadalp v1.8h, v21.16b\n" + "str q16, [%x[out_ptr], #0x70]\n" + "subs %x[width], %x[width], #0x10\n" + "sadalp v0.8h, v18.16b\n" + "cmp %x[width], #0x10\n" + "sadalp v1.8h, v17.16b\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "sadalp v0.8h, v16.16b\n" + "bge 3b\n" + "5:" // Main loop skip + "cbz %x[width], 14f\n" + "tbz %x[width], #3, 9f\n" + "ldr d29, [x27], #0x8\n" + "ldr d28, [x26], #0x8\n" + "ldr d27, [x25], #0x8\n" + "ldr d21, [x24], #0x8\n" + "ldr d26, [x23], #0x8\n" + "ldr d25, [x22], #0x8\n" + "ldr d19, [x21], #0x8\n" + "ldr d22, [x20], #0x8\n" + "tbz %x[width], #2, 7f\n" + "ld1 { v29.s }[2], [x27], #0x4\n" + "ld1 { v28.s }[2], [x26], #0x4\n" + "ld1 { v27.s }[2], [x25], #0x4\n" + "ld1 { v21.s }[2], [x24], #0x4\n" + "ld1 { v26.s }[2], [x23], #0x4\n" + "ld1 { v25.s }[2], [x22], #0x4\n" + "ld1 { v19.s }[2], [x21], #0x4\n" + "ld1 { v22.s }[2], [x20], #0x4\n" + "tbz %x[width], #1, 6f\n" + "ld1 { v29.h }[6], [x27], #0x2\n" + "ld1 { v28.h }[6], [x26], #0x2\n" + "ld1 { v27.h }[6], [x25], #0x2\n" + "ld1 { v21.h }[6], [x24], #0x2\n" + "ld1 { v26.h }[6], [x23], #0x2\n" + "ld1 { v25.h }[6], [x22], #0x2\n" + "ld1 { v19.h }[6], [x21], #0x2\n" + "ld1 { v22.h }[6], [x20], #0x2\n" + "mov x19, #0x4\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v29.b }[14], [x27]\n" + "ld1 { v28.b }[14], [x26]\n" + "ld1 { v27.b }[14], [x25]\n" + "ld1 { v21.b }[14], [x24]\n" + "ld1 { v26.b }[14], [x23]\n" + "ld1 { v25.b }[14], [x22]\n" + "ld1 { v19.b }[14], [x21]\n" + "ld1 { v22.b }[14], [x20]\n" + "b 13f\n" + "6:" // odd_loads_1_12 + "mov x19, #0x3\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v29.b }[12], [x27]\n" + "ld1 { v28.b }[12], [x26]\n" + "ld1 { v27.b }[12], [x25]\n" + "ld1 { v21.b }[12], [x24]\n" + "ld1 { v26.b }[12], [x23]\n" + "ld1 { v25.b }[12], [x22]\n" + "ld1 { v19.b }[12], [x21]\n" + "ld1 { v22.b }[12], [x20]\n" + "mov x19, #0x4\n" + "b 13f\n" + "7:" // odd_loads_2_8 + "tbz %x[width], #1, 8f\n" + "ld1 { v29.h }[4], [x27], #0x2\n" + "ld1 { v28.h }[4], [x26], #0x2\n" + "ld1 { v27.h }[4], [x25], #0x2\n" + "ld1 { v21.h }[4], [x24], #0x2\n" + "ld1 { v26.h }[4], [x23], #0x2\n" + "ld1 { v25.h }[4], [x22], #0x2\n" + "ld1 { v19.h }[4], [x21], #0x2\n" + "ld1 { v22.h }[4], [x20], #0x2\n" + "mov x19, #0x3\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v29.b }[10], [x27]\n" + "ld1 { v28.b }[10], [x26]\n" + "ld1 { v27.b }[10], [x25]\n" + "ld1 { v21.b }[10], [x24]\n" + "ld1 { v26.b }[10], [x23]\n" + "ld1 { v25.b }[10], [x22]\n" + "ld1 { v19.b }[10], [x21]\n" + "ld1 { v22.b }[10], [x20]\n" + "b 13f\n" + "8:" // odd_loads_1_8 + "mov x19, #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v29.b }[8], [x27]\n" + "ld1 { v28.b }[8], [x26]\n" + "ld1 { v27.b }[8], [x25]\n" + "ld1 { v21.b }[8], [x24]\n" + "ld1 { v26.b }[8], [x23]\n" + "ld1 { v25.b }[8], [x22]\n" + "ld1 { v19.b }[8], [x21]\n" + "ld1 { v22.b }[8], [x20]\n" + "mov x19, #0x3\n" + "b 13f\n" + "9:" // odd_loads_4_0 + "tbz %x[width], #2, 11f\n" + "ldr s29, [x27], #0x4\n" + "ldr s28, [x26], #0x4\n" + "ldr s27, [x25], #0x4\n" + "ldr s21, [x24], #0x4\n" + "ldr s26, [x23], #0x4\n" + "ldr s25, [x22], #0x4\n" + "ldr s19, [x21], #0x4\n" + "ldr s22, [x20], #0x4\n" + "tbz %x[width], #1, 10f\n" + "ld1 { v29.h }[2], [x27], #0x2\n" + "ld1 { v28.h }[2], [x26], #0x2\n" + "ld1 { v27.h }[2], [x25], #0x2\n" + "ld1 { v21.h }[2], [x24], #0x2\n" + "ld1 { v26.h }[2], [x23], #0x2\n" + "ld1 { v25.h }[2], [x22], #0x2\n" + "ld1 { v19.h }[2], [x21], #0x2\n" + "ld1 { v22.h }[2], [x20], #0x2\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { 
v29.b }[6], [x27]\n" + "ld1 { v28.b }[6], [x26]\n" + "ld1 { v27.b }[6], [x25]\n" + "ld1 { v21.b }[6], [x24]\n" + "ld1 { v26.b }[6], [x23]\n" + "ld1 { v25.b }[6], [x22]\n" + "ld1 { v19.b }[6], [x21]\n" + "ld1 { v22.b }[6], [x20]\n" + "b 13f\n" + "10:" // odd_loads_1_4 + "mov x19, #0x1\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v29.b }[4], [x27]\n" + "ld1 { v28.b }[4], [x26]\n" + "ld1 { v27.b }[4], [x25]\n" + "ld1 { v21.b }[4], [x24]\n" + "ld1 { v26.b }[4], [x23]\n" + "ld1 { v25.b }[4], [x22]\n" + "ld1 { v19.b }[4], [x21]\n" + "ld1 { v22.b }[4], [x20]\n" + "mov x19, #0x2\n" + "b 13f\n" + "11:" // odd_loads_2_0 + "tbz %x[width], #1, 12f\n" + "ldr h29, [x27], #0x2\n" + "ldr h28, [x26], #0x2\n" + "ldr h27, [x25], #0x2\n" + "ldr h21, [x24], #0x2\n" + "ldr h26, [x23], #0x2\n" + "ldr h25, [x22], #0x2\n" + "ldr h19, [x21], #0x2\n" + "ldr h22, [x20], #0x2\n" + "mov x19, #0x1\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v29.b }[2], [x27]\n" + "ld1 { v28.b }[2], [x26]\n" + "ld1 { v27.b }[2], [x25]\n" + "ld1 { v21.b }[2], [x24]\n" + "ld1 { v26.b }[2], [x23]\n" + "ld1 { v25.b }[2], [x22]\n" + "ld1 { v19.b }[2], [x21]\n" + "ld1 { v22.b }[2], [x20]\n" + "b 13f\n" + "12:" // odd_loads_1_0 + "ldr b29, [x27, #0x0]\n" + "ldr b28, [x26, #0x0]\n" + "ldr b27, [x25, #0x0]\n" + "ldr b21, [x24, #0x0]\n" + "ldr b26, [x23, #0x0]\n" + "ldr b25, [x22, #0x0]\n" + "ldr b19, [x21, #0x0]\n" + "ldr b22, [x20, #0x0]\n" + "mov x19, #0x1\n" + "13:" // Odd load end + "zip1 v23.4s, v29.4s, v27.4s\n" + "subs x19, x19, #0x1\n" + "zip1 v20.4s, v28.4s, v21.4s\n" + "zip1 v16.4s, v23.4s, v20.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v18.4s, v26.4s, v19.4s\n" + "sadalp v1.8h, v16.16b\n" + "zip1 v17.4s, v25.4s, v22.4s\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" + "sadalp v0.8h, v16.16b\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 14f\n" + "zip2 v24.4s, v23.4s, v20.4s\n" + "zip2 v20.4s, v18.4s, v17.4s\n" + "str q24, [%x[out_ptr], #0x0]\n" + "sadalp v1.8h, v24.16b\n" + "str q20, [%x[out_ptr], #0x10]\n" + "sadalp v0.8h, v20.16b\n" + "subs x19, x19, #0x1\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 14f\n" + "zip2 v27.4s, v29.4s, v27.4s\n" + "zip2 v23.4s, v28.4s, v21.4s\n" + "subs x19, x19, #0x1\n" + "zip1 v21.4s, v27.4s, v23.4s\n" + "str q21, [%x[out_ptr], #0x0]\n" + "zip2 v19.4s, v26.4s, v19.4s\n" + "sadalp v1.8h, v21.16b\n" + "zip2 v16.4s, v25.4s, v22.4s\n" + "zip1 v18.4s, v19.4s, v16.4s\n" + "str q18, [%x[out_ptr], #0x10]\n" + "sadalp v0.8h, v18.16b\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 14f\n" + "zip2 v17.4s, v27.4s, v23.4s\n" + "zip2 v16.4s, v19.4s, v16.4s\n" + "str q17, [%x[out_ptr], #0x0]\n" + "sadalp v1.8h, v17.16b\n" + "str q16, [%x[out_ptr], #0x10]\n" + "sadalp v0.8h, v16.16b\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "14:" // Odds skip + "sadalp v31.4s, v1.8h\n" + "sadalp v30.4s, v0.8h\n" + "str q31, [%x[out_ptr], #0x0]\n" + "str q30, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp 
b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp new file mode 100644 index 0000000000..7c7857bcd0 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifdef __aarch64__ + +template<> +void interleave_block<8, 4, VLType::None, true>( + uint8_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height, + size_t row_offset, bool first +) +{ + __asm__ __volatile__( + "movi v1.8h, #0x0\n" + "ldr x27, [%x[in], #0x0]\n" + "mov x19, #0x0\n" + "movi v0.8h, #0x0\n" + "ldr x26, [%x[in], #0x8]\n" + "cmp %x[height], #0x8\n" + "movi v31.4s, #0x0\n" + "ldr x25, [%x[in], #0x10]\n" + "add x27, x27, %x[row_offset]\n" + "movi v30.4s, #0x0\n" + "ldr x24, [%x[in], #0x18]\n" + "ldr x23, [%x[in], #0x20]\n" + "add x26, x26, %x[row_offset]\n" + "ldr x22, [%x[in], #0x28]\n" + "add x25, x25, %x[row_offset]\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset]\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset]\n" + "add x22, x22, %x[row_offset]\n" + "add x21, x21, %x[row_offset]\n" + "add x20, x20, %x[row_offset]\n" + "beq 1f\n" + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x27, #0x0]\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "cbnz %w[first], 2f\n" + "sub %x[out_ptr], %x[out_ptr], #0x20\n" + "ld1 { v31.4s }, [%x[out_ptr]]\n" + "ldr q30, [%x[out_ptr], #0x10]\n" + "2:" // first_pass + "cmp %x[width], #0x10\n" + "blt 5f\n" + "3:" // Main loop head + "cmp x19, #0x1e\n" + "ble 4f\n" + "uadalp v31.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "uadalp v30.4s, v0.8h\n" + 
"movi v0.8h, #0x0\n" + "mov x19, #0x0\n" + "4:" // no_accumulate_16 + "ldr q29, [x27], #0x10\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr q28, [x26], #0x10\n" + "ldr q27, [x25], #0x10\n" + "zip1 v23.4s, v29.4s, v27.4s\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr q21, [x24], #0x10\n" + "zip2 v27.4s, v29.4s, v27.4s\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr q26, [x23], #0x10\n" + "zip1 v20.4s, v28.4s, v21.4s\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr q25, [x22], #0x10\n" + "zip1 v16.4s, v23.4s, v20.4s\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q19, [x21], #0x10\n" + "zip2 v24.4s, v23.4s, v20.4s\n" + "prfm pldl1keep, [x22, #0x70]\n" + "zip2 v23.4s, v28.4s, v21.4s\n" + "ldr q22, [x20], #0x10\n" + "zip1 v18.4s, v26.4s, v19.4s\n" + "prfm pldl1keep, [x21, #0x70]\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v21.4s, v27.4s, v23.4s\n" + "prfm pldl1keep, [x20, #0x70]\n" + "zip1 v17.4s, v25.4s, v22.4s\n" + "uadalp v1.8h, v16.16b\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "add x19, x19, #0x1\n" + "zip2 v20.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" + "zip2 v19.4s, v26.4s, v19.4s\n" + "uadalp v0.8h, v16.16b\n" + "zip2 v16.4s, v25.4s, v22.4s\n" + "str q24, [%x[out_ptr], #0x20]\n" + "zip1 v18.4s, v19.4s, v16.4s\n" + "uadalp v1.8h, v24.16b\n" + "zip2 v17.4s, v27.4s, v23.4s\n" + "str q20, [%x[out_ptr], #0x30]\n" + "zip2 v16.4s, v19.4s, v16.4s\n" + "str q21, [%x[out_ptr], #0x40]\n" + "str q18, [%x[out_ptr], #0x50]\n" + "uadalp v0.8h, v20.16b\n" + "str q17, [%x[out_ptr], #0x60]\n" + "uadalp v1.8h, v21.16b\n" + "str q16, [%x[out_ptr], #0x70]\n" + "subs %x[width], %x[width], #0x10\n" + "uadalp v0.8h, v18.16b\n" + "cmp %x[width], #0x10\n" + "uadalp v1.8h, v17.16b\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "uadalp v0.8h, v16.16b\n" + "bge 3b\n" + "5:" // Main loop skip + "cbz %x[width], 14f\n" + "tbz %x[width], #3, 9f\n" + "ldr d29, [x27], #0x8\n" + "ldr d28, [x26], #0x8\n" + "ldr d27, [x25], #0x8\n" + "ldr d21, [x24], #0x8\n" + "ldr d26, [x23], #0x8\n" + "ldr d25, [x22], #0x8\n" + "ldr d19, [x21], #0x8\n" + "ldr d22, [x20], #0x8\n" + "tbz %x[width], #2, 7f\n" + "ld1 { v29.s }[2], [x27], #0x4\n" + "ld1 { v28.s }[2], [x26], #0x4\n" + "ld1 { v27.s }[2], [x25], #0x4\n" + "ld1 { v21.s }[2], [x24], #0x4\n" + "ld1 { v26.s }[2], [x23], #0x4\n" + "ld1 { v25.s }[2], [x22], #0x4\n" + "ld1 { v19.s }[2], [x21], #0x4\n" + "ld1 { v22.s }[2], [x20], #0x4\n" + "tbz %x[width], #1, 6f\n" + "ld1 { v29.h }[6], [x27], #0x2\n" + "ld1 { v28.h }[6], [x26], #0x2\n" + "ld1 { v27.h }[6], [x25], #0x2\n" + "ld1 { v21.h }[6], [x24], #0x2\n" + "ld1 { v26.h }[6], [x23], #0x2\n" + "ld1 { v25.h }[6], [x22], #0x2\n" + "ld1 { v19.h }[6], [x21], #0x2\n" + "ld1 { v22.h }[6], [x20], #0x2\n" + "mov x19, #0x4\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v29.b }[14], [x27]\n" + "ld1 { v28.b }[14], [x26]\n" + "ld1 { v27.b }[14], [x25]\n" + "ld1 { v21.b }[14], [x24]\n" + "ld1 { v26.b }[14], [x23]\n" + "ld1 { v25.b }[14], [x22]\n" + "ld1 { v19.b }[14], [x21]\n" + "ld1 { v22.b }[14], [x20]\n" + "b 13f\n" + "6:" // odd_loads_1_12 + "mov x19, #0x3\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v29.b }[12], [x27]\n" + "ld1 { v28.b }[12], [x26]\n" + "ld1 { v27.b }[12], [x25]\n" + "ld1 { v21.b }[12], [x24]\n" + "ld1 { v26.b }[12], [x23]\n" + "ld1 { v25.b }[12], [x22]\n" + "ld1 { v19.b }[12], [x21]\n" + "ld1 { v22.b }[12], [x20]\n" + "mov x19, #0x4\n" + "b 13f\n" + "7:" // odd_loads_2_8 + "tbz %x[width], #1, 8f\n" + "ld1 { v29.h }[4], [x27], #0x2\n" + "ld1 { v28.h }[4], [x26], #0x2\n" + "ld1 { v27.h }[4], [x25], #0x2\n" + "ld1 { v21.h }[4], [x24], 
#0x2\n" + "ld1 { v26.h }[4], [x23], #0x2\n" + "ld1 { v25.h }[4], [x22], #0x2\n" + "ld1 { v19.h }[4], [x21], #0x2\n" + "ld1 { v22.h }[4], [x20], #0x2\n" + "mov x19, #0x3\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v29.b }[10], [x27]\n" + "ld1 { v28.b }[10], [x26]\n" + "ld1 { v27.b }[10], [x25]\n" + "ld1 { v21.b }[10], [x24]\n" + "ld1 { v26.b }[10], [x23]\n" + "ld1 { v25.b }[10], [x22]\n" + "ld1 { v19.b }[10], [x21]\n" + "ld1 { v22.b }[10], [x20]\n" + "b 13f\n" + "8:" // odd_loads_1_8 + "mov x19, #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v29.b }[8], [x27]\n" + "ld1 { v28.b }[8], [x26]\n" + "ld1 { v27.b }[8], [x25]\n" + "ld1 { v21.b }[8], [x24]\n" + "ld1 { v26.b }[8], [x23]\n" + "ld1 { v25.b }[8], [x22]\n" + "ld1 { v19.b }[8], [x21]\n" + "ld1 { v22.b }[8], [x20]\n" + "mov x19, #0x3\n" + "b 13f\n" + "9:" // odd_loads_4_0 + "tbz %x[width], #2, 11f\n" + "ldr s29, [x27], #0x4\n" + "ldr s28, [x26], #0x4\n" + "ldr s27, [x25], #0x4\n" + "ldr s21, [x24], #0x4\n" + "ldr s26, [x23], #0x4\n" + "ldr s25, [x22], #0x4\n" + "ldr s19, [x21], #0x4\n" + "ldr s22, [x20], #0x4\n" + "tbz %x[width], #1, 10f\n" + "ld1 { v29.h }[2], [x27], #0x2\n" + "ld1 { v28.h }[2], [x26], #0x2\n" + "ld1 { v27.h }[2], [x25], #0x2\n" + "ld1 { v21.h }[2], [x24], #0x2\n" + "ld1 { v26.h }[2], [x23], #0x2\n" + "ld1 { v25.h }[2], [x22], #0x2\n" + "ld1 { v19.h }[2], [x21], #0x2\n" + "ld1 { v22.h }[2], [x20], #0x2\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v29.b }[6], [x27]\n" + "ld1 { v28.b }[6], [x26]\n" + "ld1 { v27.b }[6], [x25]\n" + "ld1 { v21.b }[6], [x24]\n" + "ld1 { v26.b }[6], [x23]\n" + "ld1 { v25.b }[6], [x22]\n" + "ld1 { v19.b }[6], [x21]\n" + "ld1 { v22.b }[6], [x20]\n" + "b 13f\n" + "10:" // odd_loads_1_4 + "mov x19, #0x1\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v29.b }[4], [x27]\n" + "ld1 { v28.b }[4], [x26]\n" + "ld1 { v27.b }[4], [x25]\n" + "ld1 { v21.b }[4], [x24]\n" + "ld1 { v26.b }[4], [x23]\n" + "ld1 { v25.b }[4], [x22]\n" + "ld1 { v19.b }[4], [x21]\n" + "ld1 { v22.b }[4], [x20]\n" + "mov x19, #0x2\n" + "b 13f\n" + "11:" // odd_loads_2_0 + "tbz %x[width], #1, 12f\n" + "ldr h29, [x27], #0x2\n" + "ldr h28, [x26], #0x2\n" + "ldr h27, [x25], #0x2\n" + "ldr h21, [x24], #0x2\n" + "ldr h26, [x23], #0x2\n" + "ldr h25, [x22], #0x2\n" + "ldr h19, [x21], #0x2\n" + "ldr h22, [x20], #0x2\n" + "mov x19, #0x1\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v29.b }[2], [x27]\n" + "ld1 { v28.b }[2], [x26]\n" + "ld1 { v27.b }[2], [x25]\n" + "ld1 { v21.b }[2], [x24]\n" + "ld1 { v26.b }[2], [x23]\n" + "ld1 { v25.b }[2], [x22]\n" + "ld1 { v19.b }[2], [x21]\n" + "ld1 { v22.b }[2], [x20]\n" + "b 13f\n" + "12:" // odd_loads_1_0 + "ldr b29, [x27, #0x0]\n" + "ldr b28, [x26, #0x0]\n" + "ldr b27, [x25, #0x0]\n" + "ldr b21, [x24, #0x0]\n" + "ldr b26, [x23, #0x0]\n" + "ldr b25, [x22, #0x0]\n" + "ldr b19, [x21, #0x0]\n" + "ldr b22, [x20, #0x0]\n" + "mov x19, #0x1\n" + "13:" // Odd load end + "zip1 v23.4s, v29.4s, v27.4s\n" + "subs x19, x19, #0x1\n" + "zip1 v20.4s, v28.4s, v21.4s\n" + "zip1 v16.4s, v23.4s, v20.4s\n" + "str q16, [%x[out_ptr], #0x0]\n" + "zip1 v18.4s, v26.4s, v19.4s\n" + "uadalp v1.8h, v16.16b\n" + "zip1 v17.4s, v25.4s, v22.4s\n" + "zip1 v16.4s, v18.4s, v17.4s\n" + "str q16, [%x[out_ptr], #0x10]\n" + "uadalp v0.8h, v16.16b\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 14f\n" + "zip2 v24.4s, v23.4s, v20.4s\n" + "zip2 v20.4s, v18.4s, v17.4s\n" + "str q24, [%x[out_ptr], #0x0]\n" + "uadalp v1.8h, v24.16b\n" + "str q20, [%x[out_ptr], #0x10]\n" + "uadalp v0.8h, v20.16b\n" + "subs x19, x19, #0x1\n" + "add %x[out_ptr], 
%x[out_ptr], #0x20\n" + "beq 14f\n" + "zip2 v27.4s, v29.4s, v27.4s\n" + "zip2 v23.4s, v28.4s, v21.4s\n" + "subs x19, x19, #0x1\n" + "zip1 v21.4s, v27.4s, v23.4s\n" + "str q21, [%x[out_ptr], #0x0]\n" + "zip2 v19.4s, v26.4s, v19.4s\n" + "uadalp v1.8h, v21.16b\n" + "zip2 v16.4s, v25.4s, v22.4s\n" + "zip1 v18.4s, v19.4s, v16.4s\n" + "str q18, [%x[out_ptr], #0x10]\n" + "uadalp v0.8h, v18.16b\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "beq 14f\n" + "zip2 v17.4s, v27.4s, v23.4s\n" + "zip2 v16.4s, v19.4s, v16.4s\n" + "str q17, [%x[out_ptr], #0x0]\n" + "uadalp v1.8h, v17.16b\n" + "str q16, [%x[out_ptr], #0x10]\n" + "uadalp v0.8h, v16.16b\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + "14:" // Odds skip + "uadalp v31.4s, v1.8h\n" + "uadalp v30.4s, v0.8h\n" + "str q31, [%x[out_ptr], #0x0]\n" + "str q30, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp new file mode 100644 index 0000000000..704a4c9210 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp @@ -0,0 +1,319 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifdef __aarch64__ + +template<> +void interleave_block<8, 8, VLType::None, false>( + int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + __asm__ __volatile__( + "ldr x27, [%x[in], #0x0]\n" + "cmp %x[height], #0x8\n" + "ldr x26, [%x[in], #0x8]\n" + "add x27, x27, %x[row_offset]\n" + "ldr x25, [%x[in], #0x10]\n" + "ldr x24, [%x[in], #0x18]\n" + "add x26, x26, %x[row_offset]\n" + "ldr x23, [%x[in], #0x20]\n" + "add x25, x25, %x[row_offset]\n" + "ldr x22, [%x[in], #0x28]\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset]\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset]\n" + "add x22, x22, %x[row_offset]\n" + "add x21, x21, %x[row_offset]\n" + "add x20, x20, %x[row_offset]\n" + "beq 1f\n" + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "prfm pldl1keep, [x27, #0x0]\n" + "cmp %x[width], #0x10\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "blt 3f\n" + "2:" // Main loop head + "ldr q27, [x27], #0x10\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr q24, [x26], #0x10\n" + "zip1 v26.2d, v27.2d, v24.2d\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr q25, [x25], #0x10\n" + "zip2 v24.2d, v27.2d, v24.2d\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr q21, [x24], #0x10\n" + "zip1 v23.2d, v25.2d, v21.2d\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr q22, [x23], #0x10\n" + "zip2 v21.2d, v25.2d, v21.2d\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q18, [x22], #0x10\n" + "zip1 v20.2d, v22.2d, v18.2d\n" + "prfm pldl1keep, [x22, #0x70]\n" + "ldr q19, [x21], #0x10\n" + "zip2 v18.2d, v22.2d, v18.2d\n" + "prfm pldl1keep, [x21, #0x70]\n" + "ldr q16, [x20], #0x10\n" + "zip1 v17.2d, v19.2d, v16.2d\n" + "prfm pldl1keep, [x20, #0x70]\n" + "str q26, [%x[out_ptr], #0x0]\n" + "zip2 v16.2d, v19.2d, v16.2d\n" + "str q23, [%x[out_ptr], #0x10]\n" + "str q20, [%x[out_ptr], #0x20]\n" + "str q17, [%x[out_ptr], #0x30]\n" + "str q24, [%x[out_ptr], #0x40]\n" + "str q21, [%x[out_ptr], #0x50]\n" + "str q18, [%x[out_ptr], #0x60]\n" + "str q16, [%x[out_ptr], #0x70]\n" + "subs %x[width], %x[width], #0x10\n" + "cmp %x[width], #0x10\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "bge 2b\n" + "3:" // Main loop skip + "cbz %x[width], 12f\n" + "tbz %x[width], #3, 7f\n" + "ldr d27, [x27], #0x8\n" + "ldr d24, [x26], #0x8\n" + "ldr d25, [x25], #0x8\n" + "ldr d21, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "ldr d18, [x22], #0x8\n" + "ldr d19, [x21], #0x8\n" + "ldr d16, [x20], #0x8\n" + "tbz %x[width], #2, 5f\n" + "ld1 { v27.s }[2], [x27], #0x4\n" + "ld1 { v24.s }[2], [x26], #0x4\n" + "ld1 { v25.s }[2], [x25], #0x4\n" + "ld1 { v21.s }[2], [x24], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" + "ld1 { v18.s }[2], [x22], #0x4\n" + "ld1 { v19.s }[2], [x21], #0x4\n" + "ld1 { v16.s }[2], [x20], #0x4\n" + "tbz %x[width], #1, 4f\n" + "ld1 { v27.h }[6], 
[x27], #0x2\n" + "ld1 { v24.h }[6], [x26], #0x2\n" + "ld1 { v25.h }[6], [x25], #0x2\n" + "ld1 { v21.h }[6], [x24], #0x2\n" + "ld1 { v22.h }[6], [x23], #0x2\n" + "ld1 { v18.h }[6], [x22], #0x2\n" + "ld1 { v19.h }[6], [x21], #0x2\n" + "ld1 { v16.h }[6], [x20], #0x2\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v27.b }[14], [x27]\n" + "ld1 { v24.b }[14], [x26]\n" + "ld1 { v25.b }[14], [x25]\n" + "ld1 { v21.b }[14], [x24]\n" + "ld1 { v22.b }[14], [x23]\n" + "ld1 { v18.b }[14], [x22]\n" + "ld1 { v19.b }[14], [x21]\n" + "ld1 { v16.b }[14], [x20]\n" + "b 11f\n" + "4:" // odd_loads_1_12 + "mov x19, #0x2\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v27.b }[12], [x27]\n" + "ld1 { v24.b }[12], [x26]\n" + "ld1 { v25.b }[12], [x25]\n" + "ld1 { v21.b }[12], [x24]\n" + "ld1 { v22.b }[12], [x23]\n" + "ld1 { v18.b }[12], [x22]\n" + "ld1 { v19.b }[12], [x21]\n" + "ld1 { v16.b }[12], [x20]\n" + "b 11f\n" + "5:" // odd_loads_2_8 + "tbz %x[width], #1, 6f\n" + "ld1 { v27.h }[4], [x27], #0x2\n" + "ld1 { v24.h }[4], [x26], #0x2\n" + "ld1 { v25.h }[4], [x25], #0x2\n" + "ld1 { v21.h }[4], [x24], #0x2\n" + "ld1 { v22.h }[4], [x23], #0x2\n" + "ld1 { v18.h }[4], [x22], #0x2\n" + "ld1 { v19.h }[4], [x21], #0x2\n" + "ld1 { v16.h }[4], [x20], #0x2\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v27.b }[10], [x27]\n" + "ld1 { v24.b }[10], [x26]\n" + "ld1 { v25.b }[10], [x25]\n" + "ld1 { v21.b }[10], [x24]\n" + "ld1 { v22.b }[10], [x23]\n" + "ld1 { v18.b }[10], [x22]\n" + "ld1 { v19.b }[10], [x21]\n" + "ld1 { v16.b }[10], [x20]\n" + "b 11f\n" + "6:" // odd_loads_1_8 + "mov x19, #0x1\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v27.b }[8], [x27]\n" + "ld1 { v24.b }[8], [x26]\n" + "ld1 { v25.b }[8], [x25]\n" + "ld1 { v21.b }[8], [x24]\n" + "ld1 { v22.b }[8], [x23]\n" + "ld1 { v18.b }[8], [x22]\n" + "ld1 { v19.b }[8], [x21]\n" + "ld1 { v16.b }[8], [x20]\n" + "mov x19, #0x2\n" + "b 11f\n" + "7:" // odd_loads_4_0 + "tbz %x[width], #2, 9f\n" + "ldr s27, [x27], #0x4\n" + "ldr s24, [x26], #0x4\n" + "ldr s25, [x25], #0x4\n" + "ldr s21, [x24], #0x4\n" + "ldr s22, [x23], #0x4\n" + "ldr s18, [x22], #0x4\n" + "ldr s19, [x21], #0x4\n" + "ldr s16, [x20], #0x4\n" + "tbz %x[width], #1, 8f\n" + "ld1 { v27.h }[2], [x27], #0x2\n" + "ld1 { v24.h }[2], [x26], #0x2\n" + "ld1 { v25.h }[2], [x25], #0x2\n" + "ld1 { v21.h }[2], [x24], #0x2\n" + "ld1 { v22.h }[2], [x23], #0x2\n" + "ld1 { v18.h }[2], [x22], #0x2\n" + "ld1 { v19.h }[2], [x21], #0x2\n" + "ld1 { v16.h }[2], [x20], #0x2\n" + "mov x19, #0x1\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v27.b }[6], [x27]\n" + "ld1 { v24.b }[6], [x26]\n" + "ld1 { v25.b }[6], [x25]\n" + "ld1 { v21.b }[6], [x24]\n" + "ld1 { v22.b }[6], [x23]\n" + "ld1 { v18.b }[6], [x22]\n" + "ld1 { v19.b }[6], [x21]\n" + "ld1 { v16.b }[6], [x20]\n" + "b 11f\n" + "8:" // odd_loads_1_4 + "mov x19, #0x1\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v27.b }[4], [x27]\n" + "ld1 { v24.b }[4], [x26]\n" + "ld1 { v25.b }[4], [x25]\n" + "ld1 { v21.b }[4], [x24]\n" + "ld1 { v22.b }[4], [x23]\n" + "ld1 { v18.b }[4], [x22]\n" + "ld1 { v19.b }[4], [x21]\n" + "ld1 { v16.b }[4], [x20]\n" + "b 11f\n" + "9:" // odd_loads_2_0 + "tbz %x[width], #1, 10f\n" + "ldr h27, [x27], #0x2\n" + "ldr h24, [x26], #0x2\n" + "ldr h25, [x25], #0x2\n" + "ldr h21, [x24], #0x2\n" + "ldr h22, [x23], #0x2\n" + "ldr h18, [x22], #0x2\n" + "ldr h19, [x21], #0x2\n" + "ldr h16, [x20], #0x2\n" + "mov x19, #0x1\n" + "tbz %x[width], #0, 11f\n" + "ld1 { v27.b }[2], [x27]\n" + "ld1 { v24.b }[2], [x26]\n" + "ld1 { v25.b }[2], [x25]\n" + "ld1 { v21.b }[2], [x24]\n" + 
"ld1 { v22.b }[2], [x23]\n" + "ld1 { v18.b }[2], [x22]\n" + "ld1 { v19.b }[2], [x21]\n" + "ld1 { v16.b }[2], [x20]\n" + "b 11f\n" + "10:" // odd_loads_1_0 + "ldr b27, [x27, #0x0]\n" + "ldr b24, [x26, #0x0]\n" + "ldr b25, [x25, #0x0]\n" + "ldr b21, [x24, #0x0]\n" + "ldr b22, [x23, #0x0]\n" + "ldr b18, [x22, #0x0]\n" + "ldr b19, [x21, #0x0]\n" + "ldr b16, [x20, #0x0]\n" + "mov x19, #0x1\n" + "11:" // Odd load end + "zip1 v26.2d, v27.2d, v24.2d\n" + "subs x19, x19, #0x1\n" + "zip1 v23.2d, v25.2d, v21.2d\n" + "str q26, [%x[out_ptr], #0x0]\n" + "zip1 v20.2d, v22.2d, v18.2d\n" + "str q23, [%x[out_ptr], #0x10]\n" + "zip1 v17.2d, v19.2d, v16.2d\n" + "str q20, [%x[out_ptr], #0x20]\n" + "str q17, [%x[out_ptr], #0x30]\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "beq 12f\n" + "zip2 v24.2d, v27.2d, v24.2d\n" + "zip2 v21.2d, v25.2d, v21.2d\n" + "str q24, [%x[out_ptr], #0x0]\n" + "zip2 v18.2d, v22.2d, v18.2d\n" + "str q21, [%x[out_ptr], #0x10]\n" + "zip2 v16.2d, v19.2d, v16.2d\n" + "str q18, [%x[out_ptr], #0x20]\n" + "str q16, [%x[out_ptr], #0x30]\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "12:" // Odds skip + + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + +template<> +void interleave_block<8, 8, VLType::None, false>( + uint8_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height, + size_t row_offset, bool +) +{ + int8_t * &out_cast = reinterpret_cast(out_ptr); + const int8_t * const * in_cast = reinterpret_cast(in); + + interleave_block<8, 8, VLType::None, false>(out_cast, in_cast, width, height, row_offset, false); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp new file mode 100644 index 0000000000..2317ece790 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp @@ -0,0 +1,362 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifdef __aarch64__ + +template<> +void interleave_block<8, 8, VLType::None, true>( + int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, + size_t row_offset, bool first +) +{ + __asm__ __volatile__( + "movi v5.8h, #0x0\n" + "ldr x27, [%x[in], #0x0]\n" + "mov x19, #0x0\n" + "movi v4.8h, #0x0\n" + "ldr x26, [%x[in], #0x8]\n" + "cmp %x[height], #0x8\n" + "movi v3.8h, #0x0\n" + "ldr x25, [%x[in], #0x10]\n" + "add x27, x27, %x[row_offset]\n" + "movi v2.8h, #0x0\n" + "ldr x24, [%x[in], #0x18]\n" + "movi v1.4s, #0x0\n" + "ldr x23, [%x[in], #0x20]\n" + "add x26, x26, %x[row_offset]\n" + "movi v0.4s, #0x0\n" + "ldr x22, [%x[in], #0x28]\n" + "add x25, x25, %x[row_offset]\n" + "movi v31.4s, #0x0\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset]\n" + "movi v30.4s, #0x0\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset]\n" + "add x22, x22, %x[row_offset]\n" + "add x21, x21, %x[row_offset]\n" + "add x20, x20, %x[row_offset]\n" + "beq 1f\n" + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "movi v29.4s, #0x0\n" + "prfm pldl1keep, [x27, #0x0]\n" + "movi v28.4s, #0x0\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "cbnz %w[first], 2f\n" + "sub %x[out_ptr], %x[out_ptr], #0x20\n" + "ld1 { v29.4s }, [%x[out_ptr]]\n" + "ldr q28, [%x[out_ptr], #0x10]\n" + "2:" // first_pass + "cmp %x[width], #0x10\n" + "blt 5f\n" + "3:" // Main loop head + "cmp x19, #0x3e\n" + "ble 4f\n" + "sadalp v1.4s, v5.8h\n" + "movi v5.8h, #0x0\n" + "sadalp v0.4s, v4.8h\n" + "movi v4.8h, #0x0\n" + "sadalp v31.4s, v3.8h\n" + "movi v3.8h, #0x0\n" + "sadalp v30.4s, v2.8h\n" + "movi v2.8h, #0x0\n" + "mov x19, #0x0\n" + "4:" // no_accumulate_16 + "ldr q27, [x27], #0x10\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr q24, [x26], #0x10\n" + "zip1 v26.2d, v27.2d, v24.2d\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr q25, [x25], #0x10\n" + "zip2 v24.2d, v27.2d, v24.2d\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr q21, [x24], #0x10\n" + "zip1 v23.2d, v25.2d, v21.2d\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr q22, [x23], #0x10\n" + "zip2 v21.2d, v25.2d, v21.2d\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q18, [x22], #0x10\n" + "zip1 v20.2d, v22.2d, v18.2d\n" + "prfm pldl1keep, [x22, #0x70]\n" + "ldr q19, [x21], #0x10\n" + "zip2 v18.2d, v22.2d, v18.2d\n" + "prfm pldl1keep, [x21, #0x70]\n" + "ldr q16, [x20], #0x10\n" + "zip1 v17.2d, v19.2d, v16.2d\n" + "prfm pldl1keep, [x20, #0x70]\n" + "str q26, [%x[out_ptr], #0x0]\n" + "zip2 v16.2d, v19.2d, v16.2d\n" + "sadalp v5.8h, v26.16b\n" + "str q23, [%x[out_ptr], #0x10]\n" + "sadalp v4.8h, v23.16b\n" + "str q20, [%x[out_ptr], #0x20]\n" + "sadalp v3.8h, v20.16b\n" + "str q17, [%x[out_ptr], #0x30]\n" + "sadalp v2.8h, v17.16b\n" + "str q24, [%x[out_ptr], #0x40]\n" + "sadalp v5.8h, v24.16b\n" + "str q21, [%x[out_ptr], #0x50]\n" + "sadalp v4.8h, 
v21.16b\n" + "str q18, [%x[out_ptr], #0x60]\n" + "sadalp v3.8h, v18.16b\n" + "str q16, [%x[out_ptr], #0x70]\n" + "sadalp v2.8h, v16.16b\n" + "add x19, x19, #0x1\n" + "subs %x[width], %x[width], #0x10\n" + "cmp %x[width], #0x10\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "bge 3b\n" + "5:" // Main loop skip + "cbz %x[width], 14f\n" + "tbz %x[width], #3, 9f\n" + "ldr d27, [x27], #0x8\n" + "ldr d24, [x26], #0x8\n" + "ldr d25, [x25], #0x8\n" + "ldr d21, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "ldr d18, [x22], #0x8\n" + "ldr d19, [x21], #0x8\n" + "ldr d16, [x20], #0x8\n" + "tbz %x[width], #2, 7f\n" + "ld1 { v27.s }[2], [x27], #0x4\n" + "ld1 { v24.s }[2], [x26], #0x4\n" + "ld1 { v25.s }[2], [x25], #0x4\n" + "ld1 { v21.s }[2], [x24], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" + "ld1 { v18.s }[2], [x22], #0x4\n" + "ld1 { v19.s }[2], [x21], #0x4\n" + "ld1 { v16.s }[2], [x20], #0x4\n" + "tbz %x[width], #1, 6f\n" + "ld1 { v27.h }[6], [x27], #0x2\n" + "ld1 { v24.h }[6], [x26], #0x2\n" + "ld1 { v25.h }[6], [x25], #0x2\n" + "ld1 { v21.h }[6], [x24], #0x2\n" + "ld1 { v22.h }[6], [x23], #0x2\n" + "ld1 { v18.h }[6], [x22], #0x2\n" + "ld1 { v19.h }[6], [x21], #0x2\n" + "ld1 { v16.h }[6], [x20], #0x2\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v27.b }[14], [x27]\n" + "ld1 { v24.b }[14], [x26]\n" + "ld1 { v25.b }[14], [x25]\n" + "ld1 { v21.b }[14], [x24]\n" + "ld1 { v22.b }[14], [x23]\n" + "ld1 { v18.b }[14], [x22]\n" + "ld1 { v19.b }[14], [x21]\n" + "ld1 { v16.b }[14], [x20]\n" + "b 13f\n" + "6:" // odd_loads_1_12 + "mov x19, #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v27.b }[12], [x27]\n" + "ld1 { v24.b }[12], [x26]\n" + "ld1 { v25.b }[12], [x25]\n" + "ld1 { v21.b }[12], [x24]\n" + "ld1 { v22.b }[12], [x23]\n" + "ld1 { v18.b }[12], [x22]\n" + "ld1 { v19.b }[12], [x21]\n" + "ld1 { v16.b }[12], [x20]\n" + "b 13f\n" + "7:" // odd_loads_2_8 + "tbz %x[width], #1, 8f\n" + "ld1 { v27.h }[4], [x27], #0x2\n" + "ld1 { v24.h }[4], [x26], #0x2\n" + "ld1 { v25.h }[4], [x25], #0x2\n" + "ld1 { v21.h }[4], [x24], #0x2\n" + "ld1 { v22.h }[4], [x23], #0x2\n" + "ld1 { v18.h }[4], [x22], #0x2\n" + "ld1 { v19.h }[4], [x21], #0x2\n" + "ld1 { v16.h }[4], [x20], #0x2\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v27.b }[10], [x27]\n" + "ld1 { v24.b }[10], [x26]\n" + "ld1 { v25.b }[10], [x25]\n" + "ld1 { v21.b }[10], [x24]\n" + "ld1 { v22.b }[10], [x23]\n" + "ld1 { v18.b }[10], [x22]\n" + "ld1 { v19.b }[10], [x21]\n" + "ld1 { v16.b }[10], [x20]\n" + "b 13f\n" + "8:" // odd_loads_1_8 + "mov x19, #0x1\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v27.b }[8], [x27]\n" + "ld1 { v24.b }[8], [x26]\n" + "ld1 { v25.b }[8], [x25]\n" + "ld1 { v21.b }[8], [x24]\n" + "ld1 { v22.b }[8], [x23]\n" + "ld1 { v18.b }[8], [x22]\n" + "ld1 { v19.b }[8], [x21]\n" + "ld1 { v16.b }[8], [x20]\n" + "mov x19, #0x2\n" + "b 13f\n" + "9:" // odd_loads_4_0 + "tbz %x[width], #2, 11f\n" + "ldr s27, [x27], #0x4\n" + "ldr s24, [x26], #0x4\n" + "ldr s25, [x25], #0x4\n" + "ldr s21, [x24], #0x4\n" + "ldr s22, [x23], #0x4\n" + "ldr s18, [x22], #0x4\n" + "ldr s19, [x21], #0x4\n" + "ldr s16, [x20], #0x4\n" + "tbz %x[width], #1, 10f\n" + "ld1 { v27.h }[2], [x27], #0x2\n" + "ld1 { v24.h }[2], [x26], #0x2\n" + "ld1 { v25.h }[2], [x25], #0x2\n" + "ld1 { v21.h }[2], [x24], #0x2\n" + "ld1 { v22.h }[2], [x23], #0x2\n" + "ld1 { v18.h }[2], [x22], #0x2\n" + "ld1 { v19.h }[2], [x21], #0x2\n" + "ld1 { v16.h }[2], [x20], #0x2\n" + "mov x19, #0x1\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v27.b }[6], [x27]\n" + "ld1 { v24.b }[6], [x26]\n" + "ld1 { 
v25.b }[6], [x25]\n" + "ld1 { v21.b }[6], [x24]\n" + "ld1 { v22.b }[6], [x23]\n" + "ld1 { v18.b }[6], [x22]\n" + "ld1 { v19.b }[6], [x21]\n" + "ld1 { v16.b }[6], [x20]\n" + "b 13f\n" + "10:" // odd_loads_1_4 + "mov x19, #0x1\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v27.b }[4], [x27]\n" + "ld1 { v24.b }[4], [x26]\n" + "ld1 { v25.b }[4], [x25]\n" + "ld1 { v21.b }[4], [x24]\n" + "ld1 { v22.b }[4], [x23]\n" + "ld1 { v18.b }[4], [x22]\n" + "ld1 { v19.b }[4], [x21]\n" + "ld1 { v16.b }[4], [x20]\n" + "b 13f\n" + "11:" // odd_loads_2_0 + "tbz %x[width], #1, 12f\n" + "ldr h27, [x27], #0x2\n" + "ldr h24, [x26], #0x2\n" + "ldr h25, [x25], #0x2\n" + "ldr h21, [x24], #0x2\n" + "ldr h22, [x23], #0x2\n" + "ldr h18, [x22], #0x2\n" + "ldr h19, [x21], #0x2\n" + "ldr h16, [x20], #0x2\n" + "mov x19, #0x1\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v27.b }[2], [x27]\n" + "ld1 { v24.b }[2], [x26]\n" + "ld1 { v25.b }[2], [x25]\n" + "ld1 { v21.b }[2], [x24]\n" + "ld1 { v22.b }[2], [x23]\n" + "ld1 { v18.b }[2], [x22]\n" + "ld1 { v19.b }[2], [x21]\n" + "ld1 { v16.b }[2], [x20]\n" + "b 13f\n" + "12:" // odd_loads_1_0 + "ldr b27, [x27, #0x0]\n" + "ldr b24, [x26, #0x0]\n" + "ldr b25, [x25, #0x0]\n" + "ldr b21, [x24, #0x0]\n" + "ldr b22, [x23, #0x0]\n" + "ldr b18, [x22, #0x0]\n" + "ldr b19, [x21, #0x0]\n" + "ldr b16, [x20, #0x0]\n" + "mov x19, #0x1\n" + "13:" // Odd load end + "zip1 v26.2d, v27.2d, v24.2d\n" + "subs x19, x19, #0x1\n" + "zip1 v23.2d, v25.2d, v21.2d\n" + "str q26, [%x[out_ptr], #0x0]\n" + "zip1 v20.2d, v22.2d, v18.2d\n" + "sadalp v5.8h, v26.16b\n" + "zip1 v17.2d, v19.2d, v16.2d\n" + "str q23, [%x[out_ptr], #0x10]\n" + "sadalp v4.8h, v23.16b\n" + "str q20, [%x[out_ptr], #0x20]\n" + "sadalp v3.8h, v20.16b\n" + "str q17, [%x[out_ptr], #0x30]\n" + "sadalp v2.8h, v17.16b\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "beq 14f\n" + "zip2 v24.2d, v27.2d, v24.2d\n" + "zip2 v21.2d, v25.2d, v21.2d\n" + "str q24, [%x[out_ptr], #0x0]\n" + "zip2 v18.2d, v22.2d, v18.2d\n" + "sadalp v5.8h, v24.16b\n" + "zip2 v16.2d, v19.2d, v16.2d\n" + "str q21, [%x[out_ptr], #0x10]\n" + "sadalp v4.8h, v21.16b\n" + "str q18, [%x[out_ptr], #0x20]\n" + "sadalp v3.8h, v18.16b\n" + "str q16, [%x[out_ptr], #0x30]\n" + "sadalp v2.8h, v16.16b\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "14:" // Odds skip + "sadalp v1.4s, v5.8h\n" + "sadalp v0.4s, v4.8h\n" + "addp v1.4s, v1.4s, v0.4s\n" + "sadalp v31.4s, v3.8h\n" + "sadalp v30.4s, v2.8h\n" + "add v1.4s, v1.4s, v29.4s\n" + "str q1, [%x[out_ptr], #0x0]\n" + "addp v0.4s, v31.4s, v30.4s\n" + "add v0.4s, v0.4s, v28.4s\n" + "str q0, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp new file mode 100644 index 0000000000..07164d6b24 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp @@ -0,0 +1,362 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifdef __aarch64__ + +template<> +void interleave_block<8, 8, VLType::None, true>( + uint8_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height, + size_t row_offset, bool first +) +{ + __asm__ __volatile__( + "movi v5.8h, #0x0\n" + "ldr x27, [%x[in], #0x0]\n" + "mov x19, #0x0\n" + "movi v4.8h, #0x0\n" + "ldr x26, [%x[in], #0x8]\n" + "cmp %x[height], #0x8\n" + "movi v3.8h, #0x0\n" + "ldr x25, [%x[in], #0x10]\n" + "add x27, x27, %x[row_offset]\n" + "movi v2.8h, #0x0\n" + "ldr x24, [%x[in], #0x18]\n" + "movi v1.4s, #0x0\n" + "ldr x23, [%x[in], #0x20]\n" + "add x26, x26, %x[row_offset]\n" + "movi v0.4s, #0x0\n" + "ldr x22, [%x[in], #0x28]\n" + "add x25, x25, %x[row_offset]\n" + "movi v31.4s, #0x0\n" + "ldr x21, [%x[in], #0x30]\n" + "add x24, x24, %x[row_offset]\n" + "movi v30.4s, #0x0\n" + "ldr x20, [%x[in], #0x38]\n" + "add x23, x23, %x[row_offset]\n" + "add x22, x22, %x[row_offset]\n" + "add x21, x21, %x[row_offset]\n" + "add x20, x20, %x[row_offset]\n" + "beq 1f\n" + "mov x20, x27\n" + "cmp %x[height], #0x2\n" + "csel x26, x26, x27, GE\n" + "csel x25, x25, x27, GT\n" + "cmp %x[height], #0x4\n" + "csel x24, x24, x27, GE\n" + "csel x23, x23, x27, GT\n" + "cmp %x[height], #0x6\n" + "csel x22, x22, x27, GE\n" + "csel x21, x21, x27, GT\n" + "1:" // no_pointer_adj + "movi v29.4s, #0x0\n" + "prfm pldl1keep, [x27, #0x0]\n" + "movi v28.4s, #0x0\n" + "prfm pldl1keep, [x26, #0x0]\n" + "prfm pldl1keep, [x25, #0x0]\n" + "prfm pldl1keep, [x24, #0x0]\n" + "prfm pldl1keep, [x23, #0x0]\n" + "prfm pldl1keep, [x22, #0x0]\n" + "prfm pldl1keep, [x21, #0x0]\n" + "prfm pldl1keep, [x20, #0x0]\n" + "prfm pldl1keep, [x27, #0x40]\n" + "prfm pldl1keep, [x26, #0x40]\n" + "prfm pldl1keep, [x25, #0x40]\n" + "prfm pldl1keep, [x24, #0x40]\n" + "prfm pldl1keep, [x23, #0x40]\n" + "prfm pldl1keep, [x22, #0x40]\n" + "prfm pldl1keep, [x21, #0x40]\n" + "prfm pldl1keep, [x20, #0x40]\n" + "cbnz %w[first], 2f\n" + "sub %x[out_ptr], %x[out_ptr], #0x20\n" + "ld1 { v29.4s }, [%x[out_ptr]]\n" + "ldr q28, [%x[out_ptr], #0x10]\n" + "2:" // first_pass + "cmp %x[width], #0x10\n" + "blt 5f\n" + "3:" // Main loop head + "cmp x19, #0x3e\n" + "ble 4f\n" + "uadalp v1.4s, v5.8h\n" + "movi v5.8h, #0x0\n" + "uadalp v0.4s, v4.8h\n" + "movi v4.8h, #0x0\n" + "uadalp v31.4s, v3.8h\n" + "movi v3.8h, #0x0\n" + "uadalp v30.4s, v2.8h\n" + "movi v2.8h, #0x0\n" + "mov x19, #0x0\n" + "4:" // no_accumulate_16 + "ldr q27, 
[x27], #0x10\n" + "prfm pldl1keep, [x27, #0x70]\n" + "ldr q24, [x26], #0x10\n" + "zip1 v26.2d, v27.2d, v24.2d\n" + "prfm pldl1keep, [x26, #0x70]\n" + "ldr q25, [x25], #0x10\n" + "zip2 v24.2d, v27.2d, v24.2d\n" + "prfm pldl1keep, [x25, #0x70]\n" + "ldr q21, [x24], #0x10\n" + "zip1 v23.2d, v25.2d, v21.2d\n" + "prfm pldl1keep, [x24, #0x70]\n" + "ldr q22, [x23], #0x10\n" + "zip2 v21.2d, v25.2d, v21.2d\n" + "prfm pldl1keep, [x23, #0x70]\n" + "ldr q18, [x22], #0x10\n" + "zip1 v20.2d, v22.2d, v18.2d\n" + "prfm pldl1keep, [x22, #0x70]\n" + "ldr q19, [x21], #0x10\n" + "zip2 v18.2d, v22.2d, v18.2d\n" + "prfm pldl1keep, [x21, #0x70]\n" + "ldr q16, [x20], #0x10\n" + "zip1 v17.2d, v19.2d, v16.2d\n" + "prfm pldl1keep, [x20, #0x70]\n" + "str q26, [%x[out_ptr], #0x0]\n" + "zip2 v16.2d, v19.2d, v16.2d\n" + "uadalp v5.8h, v26.16b\n" + "str q23, [%x[out_ptr], #0x10]\n" + "uadalp v4.8h, v23.16b\n" + "str q20, [%x[out_ptr], #0x20]\n" + "uadalp v3.8h, v20.16b\n" + "str q17, [%x[out_ptr], #0x30]\n" + "uadalp v2.8h, v17.16b\n" + "str q24, [%x[out_ptr], #0x40]\n" + "uadalp v5.8h, v24.16b\n" + "str q21, [%x[out_ptr], #0x50]\n" + "uadalp v4.8h, v21.16b\n" + "str q18, [%x[out_ptr], #0x60]\n" + "uadalp v3.8h, v18.16b\n" + "str q16, [%x[out_ptr], #0x70]\n" + "uadalp v2.8h, v16.16b\n" + "add x19, x19, #0x1\n" + "subs %x[width], %x[width], #0x10\n" + "cmp %x[width], #0x10\n" + "add %x[out_ptr], %x[out_ptr], #0x80\n" + "bge 3b\n" + "5:" // Main loop skip + "cbz %x[width], 14f\n" + "tbz %x[width], #3, 9f\n" + "ldr d27, [x27], #0x8\n" + "ldr d24, [x26], #0x8\n" + "ldr d25, [x25], #0x8\n" + "ldr d21, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "ldr d18, [x22], #0x8\n" + "ldr d19, [x21], #0x8\n" + "ldr d16, [x20], #0x8\n" + "tbz %x[width], #2, 7f\n" + "ld1 { v27.s }[2], [x27], #0x4\n" + "ld1 { v24.s }[2], [x26], #0x4\n" + "ld1 { v25.s }[2], [x25], #0x4\n" + "ld1 { v21.s }[2], [x24], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" + "ld1 { v18.s }[2], [x22], #0x4\n" + "ld1 { v19.s }[2], [x21], #0x4\n" + "ld1 { v16.s }[2], [x20], #0x4\n" + "tbz %x[width], #1, 6f\n" + "ld1 { v27.h }[6], [x27], #0x2\n" + "ld1 { v24.h }[6], [x26], #0x2\n" + "ld1 { v25.h }[6], [x25], #0x2\n" + "ld1 { v21.h }[6], [x24], #0x2\n" + "ld1 { v22.h }[6], [x23], #0x2\n" + "ld1 { v18.h }[6], [x22], #0x2\n" + "ld1 { v19.h }[6], [x21], #0x2\n" + "ld1 { v16.h }[6], [x20], #0x2\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v27.b }[14], [x27]\n" + "ld1 { v24.b }[14], [x26]\n" + "ld1 { v25.b }[14], [x25]\n" + "ld1 { v21.b }[14], [x24]\n" + "ld1 { v22.b }[14], [x23]\n" + "ld1 { v18.b }[14], [x22]\n" + "ld1 { v19.b }[14], [x21]\n" + "ld1 { v16.b }[14], [x20]\n" + "b 13f\n" + "6:" // odd_loads_1_12 + "mov x19, #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v27.b }[12], [x27]\n" + "ld1 { v24.b }[12], [x26]\n" + "ld1 { v25.b }[12], [x25]\n" + "ld1 { v21.b }[12], [x24]\n" + "ld1 { v22.b }[12], [x23]\n" + "ld1 { v18.b }[12], [x22]\n" + "ld1 { v19.b }[12], [x21]\n" + "ld1 { v16.b }[12], [x20]\n" + "b 13f\n" + "7:" // odd_loads_2_8 + "tbz %x[width], #1, 8f\n" + "ld1 { v27.h }[4], [x27], #0x2\n" + "ld1 { v24.h }[4], [x26], #0x2\n" + "ld1 { v25.h }[4], [x25], #0x2\n" + "ld1 { v21.h }[4], [x24], #0x2\n" + "ld1 { v22.h }[4], [x23], #0x2\n" + "ld1 { v18.h }[4], [x22], #0x2\n" + "ld1 { v19.h }[4], [x21], #0x2\n" + "ld1 { v16.h }[4], [x20], #0x2\n" + "mov x19, #0x2\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v27.b }[10], [x27]\n" + "ld1 { v24.b }[10], [x26]\n" + "ld1 { v25.b }[10], [x25]\n" + "ld1 { v21.b }[10], [x24]\n" + "ld1 { v22.b }[10], [x23]\n" + "ld1 { v18.b 
}[10], [x22]\n" + "ld1 { v19.b }[10], [x21]\n" + "ld1 { v16.b }[10], [x20]\n" + "b 13f\n" + "8:" // odd_loads_1_8 + "mov x19, #0x1\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v27.b }[8], [x27]\n" + "ld1 { v24.b }[8], [x26]\n" + "ld1 { v25.b }[8], [x25]\n" + "ld1 { v21.b }[8], [x24]\n" + "ld1 { v22.b }[8], [x23]\n" + "ld1 { v18.b }[8], [x22]\n" + "ld1 { v19.b }[8], [x21]\n" + "ld1 { v16.b }[8], [x20]\n" + "mov x19, #0x2\n" + "b 13f\n" + "9:" // odd_loads_4_0 + "tbz %x[width], #2, 11f\n" + "ldr s27, [x27], #0x4\n" + "ldr s24, [x26], #0x4\n" + "ldr s25, [x25], #0x4\n" + "ldr s21, [x24], #0x4\n" + "ldr s22, [x23], #0x4\n" + "ldr s18, [x22], #0x4\n" + "ldr s19, [x21], #0x4\n" + "ldr s16, [x20], #0x4\n" + "tbz %x[width], #1, 10f\n" + "ld1 { v27.h }[2], [x27], #0x2\n" + "ld1 { v24.h }[2], [x26], #0x2\n" + "ld1 { v25.h }[2], [x25], #0x2\n" + "ld1 { v21.h }[2], [x24], #0x2\n" + "ld1 { v22.h }[2], [x23], #0x2\n" + "ld1 { v18.h }[2], [x22], #0x2\n" + "ld1 { v19.h }[2], [x21], #0x2\n" + "ld1 { v16.h }[2], [x20], #0x2\n" + "mov x19, #0x1\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v27.b }[6], [x27]\n" + "ld1 { v24.b }[6], [x26]\n" + "ld1 { v25.b }[6], [x25]\n" + "ld1 { v21.b }[6], [x24]\n" + "ld1 { v22.b }[6], [x23]\n" + "ld1 { v18.b }[6], [x22]\n" + "ld1 { v19.b }[6], [x21]\n" + "ld1 { v16.b }[6], [x20]\n" + "b 13f\n" + "10:" // odd_loads_1_4 + "mov x19, #0x1\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v27.b }[4], [x27]\n" + "ld1 { v24.b }[4], [x26]\n" + "ld1 { v25.b }[4], [x25]\n" + "ld1 { v21.b }[4], [x24]\n" + "ld1 { v22.b }[4], [x23]\n" + "ld1 { v18.b }[4], [x22]\n" + "ld1 { v19.b }[4], [x21]\n" + "ld1 { v16.b }[4], [x20]\n" + "b 13f\n" + "11:" // odd_loads_2_0 + "tbz %x[width], #1, 12f\n" + "ldr h27, [x27], #0x2\n" + "ldr h24, [x26], #0x2\n" + "ldr h25, [x25], #0x2\n" + "ldr h21, [x24], #0x2\n" + "ldr h22, [x23], #0x2\n" + "ldr h18, [x22], #0x2\n" + "ldr h19, [x21], #0x2\n" + "ldr h16, [x20], #0x2\n" + "mov x19, #0x1\n" + "tbz %x[width], #0, 13f\n" + "ld1 { v27.b }[2], [x27]\n" + "ld1 { v24.b }[2], [x26]\n" + "ld1 { v25.b }[2], [x25]\n" + "ld1 { v21.b }[2], [x24]\n" + "ld1 { v22.b }[2], [x23]\n" + "ld1 { v18.b }[2], [x22]\n" + "ld1 { v19.b }[2], [x21]\n" + "ld1 { v16.b }[2], [x20]\n" + "b 13f\n" + "12:" // odd_loads_1_0 + "ldr b27, [x27, #0x0]\n" + "ldr b24, [x26, #0x0]\n" + "ldr b25, [x25, #0x0]\n" + "ldr b21, [x24, #0x0]\n" + "ldr b22, [x23, #0x0]\n" + "ldr b18, [x22, #0x0]\n" + "ldr b19, [x21, #0x0]\n" + "ldr b16, [x20, #0x0]\n" + "mov x19, #0x1\n" + "13:" // Odd load end + "zip1 v26.2d, v27.2d, v24.2d\n" + "subs x19, x19, #0x1\n" + "zip1 v23.2d, v25.2d, v21.2d\n" + "str q26, [%x[out_ptr], #0x0]\n" + "zip1 v20.2d, v22.2d, v18.2d\n" + "uadalp v5.8h, v26.16b\n" + "zip1 v17.2d, v19.2d, v16.2d\n" + "str q23, [%x[out_ptr], #0x10]\n" + "uadalp v4.8h, v23.16b\n" + "str q20, [%x[out_ptr], #0x20]\n" + "uadalp v3.8h, v20.16b\n" + "str q17, [%x[out_ptr], #0x30]\n" + "uadalp v2.8h, v17.16b\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "beq 14f\n" + "zip2 v24.2d, v27.2d, v24.2d\n" + "zip2 v21.2d, v25.2d, v21.2d\n" + "str q24, [%x[out_ptr], #0x0]\n" + "zip2 v18.2d, v22.2d, v18.2d\n" + "uadalp v5.8h, v24.16b\n" + "zip2 v16.2d, v19.2d, v16.2d\n" + "str q21, [%x[out_ptr], #0x10]\n" + "uadalp v4.8h, v21.16b\n" + "str q18, [%x[out_ptr], #0x20]\n" + "uadalp v3.8h, v18.16b\n" + "str q16, [%x[out_ptr], #0x30]\n" + "uadalp v2.8h, v16.16b\n" + "add %x[out_ptr], %x[out_ptr], #0x40\n" + "14:" // Odds skip + "uadalp v1.4s, v5.8h\n" + "uadalp v0.4s, v4.8h\n" + "addp v1.4s, v1.4s, v0.4s\n" + "uadalp v31.4s, v3.8h\n" + "uadalp 
v30.4s, v2.8h\n" + "add v1.4s, v1.4s, v29.4s\n" + "str q1, [%x[out_ptr], #0x0]\n" + "addp v0.4s, v31.4s, v30.4s\n" + "add v0.4s, v0.4s, v28.4s\n" + "str q0, [%x[out_ptr], #0x10]\n" + "add %x[out_ptr], %x[out_ptr], #0x20\n" + : [out_ptr] "+r" (out_ptr), [width] "+r" (width) + : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" + ); +} + + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp new file mode 100644 index 0000000000..52b49c0f0c --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "a32_interleave6_block1_fp32_fp32.hpp" +#include "a64_interleave4_block16_s8_s8.hpp" +#include "a64_interleave4_block16_s8_s8_summing.hpp" +#include "a64_interleave4_block16_u8_u8_summing.hpp" +#include "a64_interleave8_block1_bf16_fp32.hpp" +#include "a64_interleave8_block1_fp16_fp16.hpp" +#include "a64_interleave8_block1_fp16_fp32.hpp" +#include "a64_interleave8_block1_fp32_fp32.hpp" +#include "a64_interleave8_block1_s16_s16.hpp" +#include "a64_interleave8_block1_s16_s16_summing.hpp" +#include "a64_interleave8_block1_s8_s16.hpp" +#include "a64_interleave8_block1_s8_s16_summing.hpp" +#include "a64_interleave8_block1_u16_u16_summing.hpp" +#include "a64_interleave8_block1_u8_u16.hpp" +#include "a64_interleave8_block1_u8_u16_summing.hpp" +#include "a64_interleave8_block2_bf16_bf16.hpp" +#include "a64_interleave8_block2_fp32_fp32.hpp" +#include "a64_interleave8_block4_bf16_bf16.hpp" +#include "a64_interleave8_block4_s8_s8.hpp" +#include "a64_interleave8_block4_s8_s8_summing.hpp" +#include "a64_interleave8_block4_u8_u8_summing.hpp" +#include "a64_interleave8_block8_s8_s8.hpp" +#include "a64_interleave8_block8_s8_s8_summing.hpp" +#include "a64_interleave8_block8_u8_u8_summing.hpp" diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp new file mode 100644 index 0000000000..2b3e170a3b --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp @@ -0,0 +1,409 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "asmlib.hpp" +#include "convolution_parameters.hpp" +#include "convolver.hpp" +#include "interleave_indirect.hpp" +#include "bfloat.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "utils.hpp" + +namespace arm_gemm { + +/* + * Core function that does heavy lifting - interleave 'int_by' rows of width 'width' together. + * + * 'height' indicates the actual number of rows to interleave, so if it's less than int_by then the remaining + * entries are padded (note that this is "GEMM" padding rather than convolution padding, so there is no need to pad + * with a particular value. + * + * Note that it is not expected for this templated version to ever be used - all cases that matter should be + * explicitly specialized with an optimized implementation. 
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut>
+inline void FixupRowSums(TOut * &out, const int32_t row_sum_multiplier) {
+    const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
+
+    // If we are integrating row sums, we need to do some fix up, depending on whether the multiplier is non-zero or not.
+    if (row_sum_multiplier) {
+        // Non-zero: interleave_block<>() will have done the sums, so 'out' will point to the start of the
+        // next block (post sums).
+        // We need to go back and apply the multiplier to the computed sums.  We don't need to change 'out'.
+        int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
+
+        out_int32 -= height;
+        for (unsigned int i=0; i<height; i++) {
+            out_int32[i] *= row_sum_multiplier;
+        }
+    } else {
+        // Zero: interleave_block<>() will *not* have done the sums, so 'out' will point to the start of the
+        // sum block.  We need to insert the (zero) sums, and advance 'out'.
+        int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
+
+        for (unsigned int i=0; i<height; i++) {
+            out_int32[i] = 0;
+        }
+
+        out_int32 += height;
+
+        out = reinterpret_cast<TOut *>(out_int32);
+    }
+}
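Together with interleave_block, this fixes the footprint of each panel in the quantized case: the interleaved data for one block of rows is followed immediately by one int32 sum per row, which FixupRowSums either scales by row_sum_multiplier or zero-fills. A hedged sizing sketch for a layout like the s8 height-8/block-4 dot-product format instantiated further down (the helper name and the one-sum-block-per-panel accounting are assumptions of this sketch):

    #include <cstddef>
    #include <cstdint>

    // Bytes taken by one interleaved panel with integrated row sums:
    // 8 rows of K rounded up to a multiple of block=4, plus 8 int32 sums.
    size_t panel_bytes_sketch(size_t K) {
        const size_t height = 8, block = 4;
        const size_t K_rounded = (K + block - 1) / block * block;
        return height * K_rounded * sizeof(int8_t)   // interleaved int8 data
             + height * sizeof(int32_t);             // row sums appended behind it
    }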
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut, typename TIn>
+void IndirectInterleave(TOut *out, const TIn * const * const *ptr, unsigned int stringlen,
+                        unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax,
+                        const unsigned int k0, const unsigned int kmax, bool integrate_sums,
+                        const int32_t row_sum_multiplier) {
+    const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
+
+    // 'interleave_block' implementations are entitled to read a pointer for each row they handle from the input
+    // pointer array, even for out of range rows (although they must not subsequently dereference those pointers for
+    // out of range rows).  This allows interleave_block to use techniques like row predication, or loading all
+    // pointers and conditionally overriding the out of range ones.
+
+    // This is problematic in the "pure" indirect case when we get to the last rows, where it can lead to out of
+    // range reads.  Avoid this with a local buffer to use in last-rows cases.  Use alloca as a std::vector can be
+    // expensive in highly threaded scenarios.
+    const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
+
+    // Figure out the starting position based on k0 (with rounded length)
+    unsigned int start_string = k0 / rounded_stringlen;
+    unsigned int start_stringpos = k0 % rounded_stringlen;
+
+    // Process blocks of 'height' height...
+    for (unsigned int ybase = y0; ybase < ymax; ybase+=height) {
+        // Height to process
+        unsigned int active_height = std::min(ymax - ybase, height);
+
+        // Track our progress through the various strings
+        unsigned int k_left = (kmax - k0);
+        unsigned int string = start_string;
+        unsigned int stringpos = start_stringpos;
+
+        bool first = true;
+
+        // Prepare to call 'interleave_block' above for each string encompassed by K range
+        while (k_left > 0) {
+            // Width to process - and the width we will generate (with padding)
+            unsigned int in_width = std::min(k_left, stringlen - stringpos);
+            unsigned int out_width = std::min(k_left, rounded_stringlen - stringpos);
+
+            const TIn * const *row_base = ptr[string] + ybase;
+
+            // If not all rows are valid, copy the ones that are into local array (see above comment).
+            if (active_height < height) {
+                for (unsigned int i=0; i<height; i++) {
+                    if (i < active_height) {
+                        row_ptrs[i] = ptr[string][ybase + i];
+                    } else {
+                        // Point out of range rows at a valid pointer: safe to read, never used for real data.
+                        row_ptrs[i] = ptr[string][ybase];
+                    }
+                }
+
+                row_base = row_ptrs;
+            }
+
+            if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
+                interleave_block<height_vectors, block, vlt, true>(out, row_base, in_width, active_height, stringpos, first);
+            } else {
+                interleave_block<height_vectors, block, vlt, false>(out, row_base, in_width, active_height, stringpos, first);
+            }
+
+            k_left -= out_width;
+            string++;
+            stringpos=0;
+            first=false;
+        }
+
+        if (std::is_integral<TOut>::value && integrate_sums) {
+            FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
+        }
+    }
+}
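The triple-indirect 'ptr' argument is an array of 'strings', each of which supplies one input-row pointer per output row; an indirect convolution, for example, contributes one string per kernel point. A hypothetical wiring of the float specialisation instantiated below, degenerated to a single string (the helper name, the buffer sizing and the rounded_stringlen == stringlen simplification are all assumptions):

    #include "interleave_indirect.hpp"

    #include <vector>

    // Illustrative only: pack 'rows' rows of 'stringlen' floats through the
    // indirect path. 'packed' must hold roundup(rows, 8) * stringlen floats.
    void pack_indirect_sketch(float *packed, const float *image, unsigned int rows, unsigned int stringlen) {
        std::vector<const float *> row_ptrs(rows);
        for (unsigned int r = 0; r < rows; r++) {
            row_ptrs[r] = image + r * stringlen; // one input row pointer per output row
        }
        const float * const *strings[1] = { row_ptrs.data() }; // a single string

        arm_gemm::IndirectInterleave<8, 1, arm_gemm::VLType::None>(
            packed, strings, stringlen, stringlen, 0, rows, 0, stringlen,
            /*integrate_sums=*/false, /*row_sum_multiplier=*/0);
    }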
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut, typename TIn>
+void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const convolver<TIn> &conv, const unsigned int rounded_stringlen,
+        const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
+    const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
+
+    auto conv_cols = conv.process_columns(in, in_stride, k0, kmax, rounded_stringlen);
+
+    // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
+    const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
+
+    for (unsigned int ybase = y0; ybase < ymax; ybase += height) {
+        // How many of the rows are active - the rest will get padded in interleave_block.
+        unsigned int active_height = std::min(ymax - ybase, height);
+        bool first = true;
+
+        auto conv_rows = conv_cols.process_rows(ybase, active_height);
+
+        while (!conv_rows.finished()) {
+            unsigned int width, offset;
+
+            // Get next set of parameters
+            std::tie(width, offset) = conv_rows.next_block(row_ptrs);
+
+            // Perform the interleave
+            if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
+                interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, active_height, offset, first);
+            } else {
+                interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, active_height, offset, first);
+            }
+
+            first=false;
+        }
+
+        if (std::is_integral<TOut>::value && integrate_sums) {
+            FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
+        }
+    }
+}
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut, typename TIn>
+void Interleave(TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
+    const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
+
+    // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
+    const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
+
+    const unsigned int width=kmax-k0;
+
+    for (unsigned int y=y0; y<ymax; y+=height) {
+        for (unsigned int r=0; r<height; r++) {
+            // Dense case: row pointers are a simple stride apart.
+            row_ptrs[r] = in + ((y + r) * in_stride);
+        }
+
+        if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
+            interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
+        } else {
+            interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
+        }
+
+        if (std::is_integral<TOut>::value && integrate_sums) {
+            FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
+        }
+    }
+}
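Interleave is the dense degenerate case: the caller provides no pointer table, and each row pointer is simply 'in' plus a multiple of 'in_stride'. A hedged usage sketch against the float instantiation that follows (the buffer-size arithmetic is an assumption of this sketch):

    #include "interleave_indirect.hpp"

    #include <vector>

    // Pack an M x K row-major float matrix into 8-row interleaved panels.
    std::vector<float> pack_a_sketch(const float *A, unsigned int M, unsigned int K) {
        const unsigned int padded_rows = (M + 7) / 8 * 8; // rounded up to the panel height
        std::vector<float> packed(padded_rows * K);

        arm_gemm::Interleave<8, 1, arm_gemm::VLType::None>(
            packed.data(), A, /*in_stride=*/K,
            /*y0=*/0, /*ymax=*/M, /*k0=*/0, /*kmax=*/K,
            /*integrate_sums=*/false, /*row_sum_multiplier=*/0);
        return packed;
    }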
+template void Interleave<8, 1, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* FMMLA */ +template void IndirectInterleave<8, 2, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void ConvolutionInterleave<8, 2, VLType::None>(float *, const float *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 2, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* FP16 */ +template void IndirectInterleave<8, 1, VLType::None>(__fp16 *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void ConvolutionInterleave<8, 1, VLType::None>(__fp16 *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 1, VLType::None>(__fp16 *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +template void IndirectInterleave<8, 1, VLType::None>(float *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void ConvolutionInterleave<8, 1, VLType::None>(float *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 1, VLType::None>(float *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* BF16 */ +/* NEON/SVE BFDOT */ +template void IndirectInterleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void ConvolutionInterleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +template void IndirectInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void ConvolutionInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* NEON/SVE using FP32 kernel */ +template void IndirectInterleave<8, 1, VLType::None>(float *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void ConvolutionInterleave<8, 1, VLType::None>(float *, const bfloat16 *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 1, VLType::None>(float *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + 
+/* INT16 */ +template void IndirectInterleave<8, 1, VLType::None>(int16_t *, const int16_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t); +template void ConvolutionInterleave<8, 1, VLType::None>(int16_t *, const int16_t *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 1, VLType::None>(int16_t *, const int16_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +template void IndirectInterleave<8, 1, VLType::None>(uint16_t *, const uint16_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t); +template void ConvolutionInterleave<8, 1, VLType::None>(uint16_t *, const uint16_t *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 1, VLType::None>(uint16_t *, const uint16_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* INT8 */ +/* NEON SMLA/SMLAL (height 4, block 16) */ +template void IndirectInterleave<4, 16, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void ConvolutionInterleave<4, 16, VLType::None>(int8_t *, const int8_t *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<4, 16, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* NEON SDOT (height 8, block 4) */ +template void IndirectInterleave<8, 4, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t); +template void ConvolutionInterleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* MMLA SMMLA (height 8, block 8) */ +template void IndirectInterleave<8, 8, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t); +template void ConvolutionInterleave<8, 8, VLType::None>(int8_t *, const int8_t *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 8, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* NEON SDOT (height 8, block 1) */ +template void IndirectInterleave<8, 1, VLType::None>(int16_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t); +template void ConvolutionInterleave<8, 1, VLType::None>(int16_t *, const int8_t *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 1, VLType::None>(int16_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, 
int32_t); + +/* NEON SMLA/SMLAL (height 4, block 16) */ +template void IndirectInterleave<4, 16, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void ConvolutionInterleave<4, 16, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<4, 16, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* NEON SDOT (height 8, block 4) */ +template void IndirectInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t); +template void ConvolutionInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* MMLA SMMLA (height 8, block 8) */ +template void IndirectInterleave<8, 8, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t); +template void ConvolutionInterleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); + +/* NEON 16-bit (height 8, block 1) */ +template void IndirectInterleave<8, 1, VLType::None>(uint16_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t); +template void ConvolutionInterleave<8, 1, VLType::None>(uint16_t *, const uint8_t *, size_t, const convolver &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +template void Interleave<8, 1, VLType::None>(uint16_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); +#endif // __aarch64__ + +} // namespace arm_gemm diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect.hpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect.hpp new file mode 100644 index 0000000000..660577f0e3 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect.hpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
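All of the template definitions live in this translation unit; the header only declares them, so each (height, block, vector-length, output/input type) combination a kernel family needs must be listed above as an explicit instantiation. A minimal illustration of the same declaration/definition/instantiation split (names hypothetical):

    // pack.hpp - declaration only; the definition never appears in the header.
    template<unsigned int H>
    void pack(float *dst, const float *src, unsigned int n);

    // pack.cpp - definition plus the explicit instantiations callers may link against.
    template<unsigned int H>
    void pack(float *dst, const float *src, unsigned int n) {
        for (unsigned int i = 0; i < n * H; i++) {
            dst[i] = src[i];
        }
    }
    template void pack<8>(float *, const float *, unsigned int); // like the lists above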
diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect.hpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect.hpp
new file mode 100644
index 0000000000..660577f0e3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect.hpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#include "convolution_parameters.hpp"
+#include "convolver.hpp"
+#include "utils.hpp"
+
+namespace arm_gemm {
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut, typename TIn>
+void IndirectInterleave(TOut *out, const TIn * const * const *ptr, unsigned int stringlen, unsigned int rounded_stringlen, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut, typename TIn>
+void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const convolver<TIn> &conv, const unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool, int32_t);
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut, typename TIn>
+void Interleave(TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool, int32_t);
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12.hpp
similarity index 87%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12.hpp
index 0f0e5a7ed4..8bf8d8442e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12.hpp
@@ -30,9 +30,9 @@ namespace arm_gemm {
 
 // Actual kernel implementations
-void a64_gemm_s16_asimd_12x8(const int16_t *, const int16_t *, int32_t *, int, int, int);
+void a64_gemm_s16_asimd_8x12(const int16_t *, const int16_t *, int32_t *, int, int, int);
 
-// 12x8 SGEMM "strategy" class.
+// 8x12 SGEMM "strategy" class.
 //
 // This describes the characteristics of a family of kernels, in terms of
 // the required interleave properties and the output block size.
@@ -40,7 +40,7 @@ void a64_gemm_s16_asimd_12x8(const int16_t *, const int16_t *, int32_t *, int, i
 // All kernels in the family must share these characteristics. The actual
 // kernel to be used can be chosen at runtime, based on the CPU_type
 // structure.
-class gemm_s16_12x8 {
+class cls_a64_gemm_s16_8x12 {
 public:
     typedef int16_t operand_type;
     typedef int32_t result_type;
@@ -62,10 +62,11 @@ class gemm_s16_12x8 {
     // Use the standard fixed size transforms.
     StdTransformsFixed<operand_type, result_type, 8, 12> transforms = {};
+    StdTransformsFixed<operand_type, result_type, 8, 12> transforms_quantized = {};
 
-    kern_type kernel = a64_gemm_s16_asimd_12x8;
+    kern_type kernel = a64_gemm_s16_asimd_8x12;
 
-    gemm_s16_12x8(const CPUInfo *) { }
+    cls_a64_gemm_s16_8x12(const CPUInfo *) { }
 };
 
 } // namespace arm_gemm
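The renamed header also illustrates the strategy-class convention used throughout arm_gemm: the class publishes operand/result types, blocking constants, the transform helpers and a kernel function pointer, and its constructor may swap in a CPU-specific variant. A hedged sketch of how a templated driver can consume such a class (illustrative only - not the library's actual GemmInterleaved driver):

    // Assumes the arm_gemm headers (CPUInfo and a strategy such as
    // cls_a64_gemm_s16_8x12) are available; 'run_gemm_sketch' is hypothetical.
    template<typename strategy>
    void run_gemm_sketch(const CPUInfo *ci,
                         const typename strategy::operand_type *a_panel,
                         const typename strategy::operand_type *b_panel,
                         typename strategy::result_type *c_panel,
                         int ablocks, int bblocks, int k) {
        strategy strat(ci); // constructor may select an A55r1/X1 specific kernel
        strat.kernel(a_panel, b_panel, c_panel, ablocks, bblocks, k);
    }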
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp
similarity index 99%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp
index 7052f83a3d..a77938ffa7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp
@@ -29,7 +29,7 @@
 
 namespace arm_gemm {
 
-void a64_gemm_s16_asimd_12x8(const int16_t *Apanel, const int16_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
+void a64_gemm_s16_asimd_8x12(const int16_t *Apanel, const int16_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
 {
     const int16_t *a_ptr = Apanel;
     int32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
index 256acc4c65..b68a5f518a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
@@ -34,7 +34,7 @@ void a64_gemm_s8_4x4(const int8_t *, const int8_t *, int32_t *, int, int, int);
 
 #include "arm_gemm.hpp"
 
-class gemm_s8_4x4 {
+class cls_a64_gemm_s8_4x4 {
 public:
     typedef int8_t operand_type;
     typedef int32_t result_type;
@@ -56,10 +56,11 @@ class gemm_s8_4x4 {
     // Use the standard fixed size transforms.
     StdTransformsFixed<operand_type, result_type, 4, 4, 16> transforms = {};
+    StdTransformsFixed<operand_type, result_type, 4, 4, 16> transforms_quantized = {};
 
     kern_type kernel=a64_gemm_s8_4x4;
 
-    gemm_s8_4x4(const CPUInfo *) { }
+    cls_a64_gemm_s8_4x4(const CPUInfo *) { }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
similarity index 80%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
index 0e294bfe8d..eee817e8e7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
@@ -32,11 +32,11 @@ namespace arm_gemm {
 
 // Load the actual kernel
-void a64_gemm_s8_12x8(const int8_t *, const int8_t *, int32_t *, int, int, int);
-void a64_gemm_s8_12x8_a55r1(const int8_t *, const int8_t *, int32_t *, int, int, int);
-void a64_gemm_s8_12x8_x1(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void a64_gemm_s8_8x12(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void a64_gemm_s8_8x12_a55r1(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void a64_gemm_s8_8x12_x1(const int8_t *, const int8_t *, int32_t *, int, int, int);
 
-class gemm_s8_12x8 {
+class cls_a64_gemm_s8_8x12 {
 public:
     typedef int8_t operand_type;
     typedef int32_t result_type;
@@ -58,16 +58,17 @@ class gemm_s8_12x8 {
     // Use the standard fixed size transforms.
     StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
+    StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms_quantized = {};
 
-    kern_type kernel = a64_gemm_s8_12x8;
+    kern_type kernel = a64_gemm_s8_8x12;
 
-    gemm_s8_12x8(const CPUInfo *ci) {
+    cls_a64_gemm_s8_8x12(const CPUInfo *ci) {
         auto mod = ci->get_cpu_model();
 
         if (mod == CPUModel::A55r1) {
-            kernel = a64_gemm_s8_12x8_a55r1;
+            kernel = a64_gemm_s8_8x12_a55r1;
         } else if (mod == CPUModel::X1) {
-            kernel = a64_gemm_s8_12x8_x1;
+            kernel = a64_gemm_s8_8x12_x1;
         }
     }
 };
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp
similarity index 99%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp
index ddd8124ec9..bb5226e093 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp
@@ -29,7 +29,7 @@
 
 namespace arm_gemm {
 
-void a64_gemm_s8_12x8_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, const int ablocks, const int bblocks, const int K) {
+void a64_gemm_s8_8x12_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, const int ablocks, const int bblocks, const int K) {
     const int8_t *a_ptr = Apanel;
     int32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp
similarity index 99%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp
index a7abaed9e0..7bf36a5900 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp
@@ -29,7 +29,7 @@
 
 namespace arm_gemm {
 
-void a64_gemm_s8_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+void a64_gemm_s8_8x12(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
     const int8_t *a_ptr = Apanel;
     int32_t *c_ptr = Cpanel;
     // We divide K by 4 because the sdot instruction processes 4 elements at a time.
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp
similarity index 99%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp
index 446fcf8707..afd2427b85 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp
@@ -29,7 +29,7 @@
 
 namespace arm_gemm {
 
-void a64_gemm_s8_12x8_x1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+void a64_gemm_s8_8x12_x1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
     const int8_t *a_ptr = Apanel;
     int32_t *c_ptr = Cpanel;
     // We divide K by 4 because the sdot instruction processes 4 elements at a time.
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12.hpp
similarity index 78%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12.hpp
index b86204043c..e49ebbd84e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12.hpp
@@ -30,17 +30,9 @@ namespace arm_gemm {
 
 // Actual kernel implementations
-void a64_gemm_u16_asimd_12x8(const uint16_t *, const uint16_t *, uint32_t *, int, int, int);
+void a64_gemm_u16_asimd_8x12(const uint16_t *, const uint16_t *, uint32_t *, int, int, int);
 
-// 12x8 SGEMM "strategy" class.
-//
-// This describes the characteristics of a family of kernels, in terms of
-// the required interleave properties and the output block size.
-//
-// All kernels in the family must share these characteristics. The actual
-// kernel to be used can be chosen at runtime, based on the CPU_type
-// structure.
-class gemm_u16_12x8 {
+class cls_a64_gemm_u16_8x12 {
 public:
     typedef uint16_t operand_type;
     typedef uint32_t result_type;
@@ -62,10 +54,11 @@ class gemm_u16_12x8 {
     // Use the standard fixed size transforms.
     StdTransformsFixed<operand_type, result_type, 8, 12> transforms = {};
+    StdTransformsFixed<operand_type, result_type, 8, 12> transforms_quantized = {};
 
-    kern_type kernel = a64_gemm_u16_asimd_12x8;
+    kern_type kernel = a64_gemm_u16_asimd_8x12;
 
-    gemm_u16_12x8(const CPUInfo *) { }
+    cls_a64_gemm_u16_8x12(const CPUInfo *) { }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp
similarity index 99%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp
index 66f0b7c0ac..98da7830f0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp
@@ -29,7 +29,7 @@
 
 namespace arm_gemm {
 
-void a64_gemm_u16_asimd_12x8(const uint16_t *Apanel, const uint16_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
+void a64_gemm_u16_asimd_8x12(const uint16_t *Apanel, const uint16_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
 {
     const uint16_t *a_ptr = Apanel;
     uint32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
index 134007b74c..854b6751c1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
@@ -32,7 +32,7 @@ namespace arm_gemm {
 
 // Kernel definition
 void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K);
 
-class gemm_u8_4x4 {
+class cls_a64_gemm_u8_4x4 {
 public:
     typedef uint8_t operand_type;
     typedef uint32_t result_type;
@@ -64,10 +64,11 @@ class gemm_u8_4x4 {
     // Use the standard fixed size transforms.
     StdTransformsFixed<operand_type, result_type, 4, 4, 16> transforms = {};
+    StdTransformsFixed<operand_type, result_type, 4, 4, 16> transforms_quantized = {};
 
     kern_type kernel = a64_gemm_u8_4x4;
 
-    gemm_u8_4x4(const CPUInfo *) { }
+    cls_a64_gemm_u8_4x4(const CPUInfo *) { }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
similarity index 82%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
index c0990ecd57..256ba2e08c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
@@ -30,11 +30,11 @@ namespace arm_gemm {
 
 // Load the actual kernel
-void a64_gemm_u8_12x8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
-void a64_gemm_u8_12x8_a55r1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
-void a64_gemm_u8_12x8_x1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void a64_gemm_u8_8x12(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void a64_gemm_u8_8x12_a55r1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void a64_gemm_u8_8x12_x1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
 
-class gemm_u8_12x8 {
+class cls_a64_gemm_u8_8x12 {
 public:
     typedef uint8_t operand_type;
     typedef uint32_t result_type;
@@ -66,16 +66,17 @@ class gemm_u8_12x8 {
     // Use the standard fixed sized transforms.
     StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
+    StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms_quantized = {};
 
-    kern_type kernel = a64_gemm_u8_12x8;
+    kern_type kernel = a64_gemm_u8_8x12;
 
-    gemm_u8_12x8(const CPUInfo *ci) {
+    cls_a64_gemm_u8_8x12(const CPUInfo *ci) {
         auto mod = ci->get_cpu_model();
 
         if (mod == CPUModel::A55r1) {
-            kernel = a64_gemm_u8_12x8_a55r1;
+            kernel = a64_gemm_u8_8x12_a55r1;
         } else if (mod == CPUModel::X1) {
-            kernel = a64_gemm_u8_12x8_x1;
+            kernel = a64_gemm_u8_8x12_x1;
         }
     }
 };
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp
similarity index 99%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp
index c9a8a8229c..63869c9fd4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp
@@ -29,7 +29,7 @@
 
 namespace arm_gemm {
 
-void a64_gemm_u8_12x8_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, const int ablocks, const int bblocks, const int K) {
+void a64_gemm_u8_8x12_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, const int ablocks, const int bblocks, const int K) {
     const uint8_t *a_ptr = Apanel;
     uint32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp
similarity index 99%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp
index 821e742f90..ff60cbc905 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp
@@ -29,7 +29,7 @@
 
 namespace arm_gemm {
 
-void a64_gemm_u8_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+void a64_gemm_u8_8x12(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
     const uint8_t *a_ptr = Apanel;
     uint32_t *c_ptr = Cpanel;
     // We divide K by 4 because the udot instruction processes 4 elements at a time.
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp
similarity index 99%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp
index 7fac67354f..1c1196b7a6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp
@@ -29,7 +29,7 @@
 
 namespace arm_gemm {
 
-void a64_gemm_u8_12x8_x1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+void a64_gemm_u8_8x12_x1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
     const uint8_t *a_ptr = Apanel;
     uint32_t *c_ptr = Cpanel;
     // We divide K by 4 because the udot instruction processes 4 elements at a time.
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp
similarity index 75%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp
index b60401b70d..b53172509e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp
@@ -25,32 +25,26 @@
 
 #ifdef __aarch64__
 
-
+#include "../performance_parameters.hpp"
 #include "../std_transforms_fixed.hpp"
 
 namespace arm_gemm {
 
 // Actual kernel implementations
-void a64_hybrid_fp32_mla_4x8(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void a64_gemv_fp32_mla_32(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool);
 
-class hybrid_fp32_mla_4x8
+class cls_a64_gemv_fp32_mla_32
 {
 public:
     typedef float operand_type;
     typedef float result_type;
 
-    typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
-
-    /* Kernel blocking parameters */
-    static constexpr unsigned int out_height()
-    {
-        return 8;
-    }
+    typedef void (*kern_type)(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool);
 
     static unsigned int out_width()
     {
-        return 4;
+        return 32;
    }
 
     static constexpr unsigned int k_unroll()
@@ -73,14 +67,13 @@
         return true;
     }
 
-    StdTransformsFixed<operand_type, result_type, 8, 4> transforms = {};
+    StdTransformsFixed<operand_type, result_type, 1, 32> transforms = {};
 
     // Default to the generic kernel
-    kern_type kernel=a64_hybrid_fp32_mla_4x8;
+    kern_type kernel=a64_gemv_fp32_mla_32;
 
-    hybrid_fp32_mla_4x8(const CPUInfo *)
+    cls_a64_gemv_fp32_mla_32(const CPUInfo *)
     {
-
     }
 };
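In the new GEMV source below, the C++ prologue folds the requested Activation into clamp bounds before the assembly runs: BoundedReLU stores its cap in maxval and deliberately falls through to ReLU, which zeroes minval and sets bit 1 of 'flags'; the assembly applies fmin/fmax against those bounds only when that bit is set. A scalar model of the same mapping (the act_type encoding here is illustrative):

    #include <algorithm>
    #include <limits>

    float apply_activation_sketch(float v, int act_type, float bound) {
        float minval = -std::numeric_limits<float>::infinity();
        float maxval =  std::numeric_limits<float>::infinity();
        switch (act_type) {
            case 2: maxval = bound; // BoundedReLU caps from above...
            /* fall through */
            case 1: minval = 0.0f;  // ...and ReLU clamps from below
                break;
            default:                // None: no clamping
                break;
        }
        return std::min(std::max(v, minval), maxval);
    }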
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp
new file mode 100644
index 0000000000..a2af8d6d14
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp
@@ -0,0 +1,1546 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <limits>
+
+namespace arm_gemm {
+
+void a64_gemv_fp32_mla_32 (
+    const float *A_ptr, const float *B_ptr, float *output_ptr,
+    size_t N, size_t K,
+    const float *bias, Activation act, bool
+)
+{
+    struct KernelArgs {
+        float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+        float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+        const float *B_ptr = {};
+        size_t output_offset = {};
+        unsigned int input_initial_col = {};
+    } ka;
+
+    unsigned long flags=0;
+    ka.B_ptr = B_ptr;
+    switch(act.type) {
+        default:
+        case Activation::Type::None:
+            break;
+        case Activation::Type::BoundedReLU:
+            ka.maxval = static_cast<float>(act.param1);
+            /* fall through */
+        case Activation::Type::ReLU:
+            ka.minval = 0;
+            flags |= 0x2;
+            break;
+    }
+    __asm__ __volatile__(
+ "add x22, %x[N], #0x3\n" + "mov x21, %x[bias]\n" + "lsr x22, x22, #0x2\n" + "1:" // Column loop + "cmp x22, #0x8\n" + "bge 85f\n" + "cmp x22, #0x6\n" + "bgt 73f\n" + "beq 61f\n" + "cmp x22, #0x4\n" + "bgt 49f\n" + "beq 37f\n" + "cmp x22, #0x2\n" + "bgt 25f\n" + "beq 13f\n" + "mov x20, %x[K]\n" + "mov x19, %x[A_ptr]\n" + "cbz x21, 2f\n" + "ldr q24, [x21, #0x0]\n" + "add x21, x21, #0x10\n" + "b 3f\n" + "2:" // Width 1: no bias + "movi v24.16b, #0x0\n" + "3:" // Width 1: setup done + "cmp x20, #0x4\n" + "blt 6f\n" + "cmp x20, #0x8\n" + "blt 5f\n" + "4:" // Width 1: Multiply loop: Main loop head + "ldr q0, [x19, #0x0]\n" + "ldr q1, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v1.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q2, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v2.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q3, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v3.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q4, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v4.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "add x19, x19, #0x10\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "sub x20, x20, #0x4\n" + "prfm pldl1keep, [x19, #0x80]\n" + "cmp x20, #0x8\n" + "bge 4b\n" + "5:" // Width 1: Multiply loop: Single iteration only + "sub x20, x20,
#0x4\n" + "ldr q0, [x19, #0x0]\n" + "ldr q5, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v5.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q6, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v6.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q7, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v7.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q8, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v8.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "add x19, x19, #0x10\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "prfm pldl1keep, [x19, #0x80]\n" + "6:" // Width 1: Multiply loop: Main loop skip + "cbz x20, 8f\n" + "7:" // Width 1: Multiply loop: Odd block loop + "ldr s0, [x19], #0x4\n" + "ldr q9, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v9.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "sub x20, x20, #0x1\n" + "cbnz x20, 7b\n" + "8:" // Width 1: Multiply loop: No odd multiplies + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 9f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "9:" // Width 1: No activation + "cmp %x[N], #0x4\n" + "blt 10f\n" + "str q24, [%x[output_ptr], #0x0]\n" + "add %x[output_ptr], %x[output_ptr], #0x10\n" + "b 12f\n" + "10:" // Width 1: Partial writeback + "tbz %x[N], #1, 11f\n" + "str d24, [%x[output_ptr]], #0x8\n" + "tbz %x[N], #0, 12f\n" + "st1 { v24.s }[2], [%x[output_ptr]]\n" + "b 12f\n" + "11:" // Width 1: Partial direct writeback: partial_1_0 + "str s24, [%x[output_ptr], #0x0]\n" + "12:" // Width 1: Writeback done + "b 97f\n" + "13:" // Width 2 + "mov x20, %x[K]\n" + "mov x19, %x[A_ptr]\n" + "cbz x21, 14f\n" + "ldr q24, [x21, #0x0]\n" + "ldr q25, [x21, #0x10]\n" + "add x21, x21, #0x20\n" + "b 15f\n" + "14:" // Width 2: no bias + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "15:" // Width 2: setup done + "cmp x20, #0x4\n" + "blt 18f\n" + "cmp x20, #0x8\n" + "blt 17f\n" + "16:" // Width 2: Multiply loop: Main loop head + "ldr q0, [x19, #0x0]\n" + "ldr q1, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v1.4s, v0.s[0]\n" + "ldr q2, [%x[B_ptr], #0x10]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v25.4s, v2.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q3, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v3.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q4, [%x[B_ptr], #0x10]\n" + "fmla v25.4s, v4.4s, v0.s[1]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q5, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v5.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q6, [%x[B_ptr], #0x10]\n" + "fmla v25.4s, v6.4s, v0.s[2]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q7, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v7.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q8, [%x[B_ptr], #0x10]\n" + "fmla v25.4s, v8.4s, v0.s[3]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "add x19, x19, #0x10\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "sub x20, x20, #0x4\n" + "prfm pldl1keep, [x19, #0x80]\n" + "cmp x20, #0x8\n" + "bge 16b\n" + "17:" // Width 2: 
Multiply loop: Single iteration only + "sub x20, x20, #0x4\n" + "ldr q0, [x19, #0x0]\n" + "ldr q9, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v9.4s, v0.s[0]\n" + "ldr q10, [%x[B_ptr], #0x10]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v25.4s, v10.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q11, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v11.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q12, [%x[B_ptr], #0x10]\n" + "fmla v25.4s, v12.4s, v0.s[1]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q13, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v13.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q14, [%x[B_ptr], #0x10]\n" + "fmla v25.4s, v14.4s, v0.s[2]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q15, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v15.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q16, [%x[B_ptr], #0x10]\n" + "fmla v25.4s, v16.4s, v0.s[3]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "add x19, x19, #0x10\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "prfm pldl1keep, [x19, #0x80]\n" + "18:" // Width 2: Multiply loop: Main loop skip + "cbz x20, 20f\n" + "19:" // Width 2: Multiply loop: Odd block loop + "ldr s0, [x19], #0x4\n" + "ldr q17, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v17.4s, v0.s[0]\n" + "ldr q18, [%x[B_ptr], #0x10]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v25.4s, v18.4s, v0.s[0]\n" + "sub x20, x20, #0x1\n" + "cbnz x20, 19b\n" + "20:" // Width 2: Multiply loop: No odd multiplies + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 21f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "21:" // Width 2: No activation + "str q24, [%x[output_ptr], #0x0]\n" + "cmp %x[N], #0x8\n" + "add %x[output_ptr], %x[output_ptr], #0x10\n" + "blt 22f\n" + "str q25, [%x[output_ptr], #0x0]\n" + "add %x[output_ptr], %x[output_ptr], #0x10\n" + "b 24f\n" + "22:" // Width 2: Partial writeback + "tbz %x[N], #1, 23f\n" + "str d25, [%x[output_ptr]], #0x8\n" + "tbz %x[N], #0, 24f\n" + "st1 { v25.s }[2], [%x[output_ptr]]\n" + "b 24f\n" + "23:" // Width 2: Partial direct writeback: partial_1_4 + "tbz %x[N], #0, 24f\n" + "str s25, [%x[output_ptr], #0x0]\n" + "24:" // Width 2: Writeback done + "b 97f\n" + "25:" // Width 3 + "mov x20, %x[K]\n" + "mov x19, %x[A_ptr]\n" + "cbz x21, 26f\n" + "ldr q24, [x21, #0x0]\n" + "ldr q25, [x21, #0x10]\n" + "ldr q26, [x21, #0x20]\n" + "add x21, x21, #0x30\n" + "b 27f\n" + "26:" // Width 3: no bias + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "27:" // Width 3: setup done + "cmp x20, #0x4\n" + "blt 30f\n" + "cmp x20, #0x8\n" + "blt 29f\n" + "28:" // Width 3: Multiply loop: Main loop head + "ldr q0, [x19, #0x0]\n" + "ldr q1, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v1.4s, v0.s[0]\n" + "ldr q2, [%x[B_ptr], #0x10]\n" + "ldr q3, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v2.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v26.4s, v3.4s, v0.s[0]\n" + "ldr q4, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v4.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q5, [%x[B_ptr], #0x10]\n" + "fmla v25.4s, v5.4s, v0.s[1]\n" + "ldr q6, [%x[B_ptr], #0x20]\n" + "add %x[B_ptr], 
%x[B_ptr], #0x80\n" + "fmla v26.4s, v6.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q7, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v7.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q8, [%x[B_ptr], #0x10]\n" + "fmla v25.4s, v8.4s, v0.s[2]\n" + "ldr q9, [%x[B_ptr], #0x20]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v26.4s, v9.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q10, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v10.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q11, [%x[B_ptr], #0x10]\n" + "fmla v25.4s, v11.4s, v0.s[3]\n" + "ldr q12, [%x[B_ptr], #0x20]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v26.4s, v12.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "add x19, x19, #0x10\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "sub x20, x20, #0x4\n" + "prfm pldl1keep, [x19, #0x80]\n" + "cmp x20, #0x8\n" + "bge 28b\n" + "29:" // Width 3: Multiply loop: Single iteration only + "sub x20, x20, #0x4\n" + "ldr q0, [x19, #0x0]\n" + "ldr q13, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v13.4s, v0.s[0]\n" + "ldr q14, [%x[B_ptr], #0x10]\n" + "ldr q15, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v14.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v26.4s, v15.4s, v0.s[0]\n" + "ldr q16, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v16.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q17, [%x[B_ptr], #0x10]\n" + "fmla v25.4s, v17.4s, v0.s[1]\n" + "ldr q18, [%x[B_ptr], #0x20]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v26.4s, v18.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q19, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v19.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q20, [%x[B_ptr], #0x10]\n" + "fmla v25.4s, v20.4s, v0.s[2]\n" + "ldr q21, [%x[B_ptr], #0x20]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v26.4s, v21.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q22, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v22.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q23, [%x[B_ptr], #0x10]\n" + "fmla v25.4s, v23.4s, v0.s[3]\n" + "ldr q1, [%x[B_ptr], #0x20]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v26.4s, v1.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "add x19, x19, #0x10\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "prfm pldl1keep, [x19, #0x80]\n" + "30:" // Width 3: Multiply loop: Main loop skip + "cbz x20, 32f\n" + "31:" // Width 3: Multiply loop: Odd block loop + "ldr s0, [x19], #0x4\n" + "ldr q2, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v2.4s, v0.s[0]\n" + "ldr q3, [%x[B_ptr], #0x10]\n" + "ldr q4, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v3.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v26.4s, v4.4s, v0.s[0]\n" + "sub x20, x20, #0x1\n" + "cbnz x20, 31b\n" + "32:" // Width 3: Multiply loop: No odd multiplies + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 33f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "33:" // Width 3: No activation + "str q24, [%x[output_ptr], #0x0]\n" + "str q25, [%x[output_ptr], #0x10]\n" + "cmp %x[N], #0xc\n" + "add %x[output_ptr], %x[output_ptr], #0x20\n" + "blt 34f\n" + "str q26, [%x[output_ptr], #0x0]\n" + "add %x[output_ptr], 
%x[output_ptr], #0x10\n" + "b 36f\n" + "34:" // Width 3: Partial writeback + "tbz %x[N], #1, 35f\n" + "str d26, [%x[output_ptr]], #0x8\n" + "tbz %x[N], #0, 36f\n" + "st1 { v26.s }[2], [%x[output_ptr]]\n" + "b 36f\n" + "35:" // Width 3: Partial direct writeback: partial_1_8 + "tbz %x[N], #0, 36f\n" + "str s26, [%x[output_ptr], #0x0]\n" + "36:" // Width 3: Writeback done + "b 97f\n" + "37:" // Width 4 + "mov x20, %x[K]\n" + "mov x19, %x[A_ptr]\n" + "cbz x21, 38f\n" + "ldr q24, [x21, #0x0]\n" + "ldr q25, [x21, #0x10]\n" + "ldr q26, [x21, #0x20]\n" + "ldr q27, [x21, #0x30]\n" + "add x21, x21, #0x40\n" + "b 39f\n" + "38:" // Width 4: no bias + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "39:" // Width 4: setup done + "cmp x20, #0x4\n" + "blt 42f\n" + "cmp x20, #0x8\n" + "blt 41f\n" + "40:" // Width 4: Multiply loop: Main loop head + "ldr q0, [x19, #0x0]\n" + "ldr q1, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v1.4s, v0.s[0]\n" + "ldr q2, [%x[B_ptr], #0x10]\n" + "ldr q3, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v2.4s, v0.s[0]\n" + "ldr q4, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v3.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v4.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q5, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v5.4s, v0.s[1]\n" + "ldr q6, [%x[B_ptr], #0x10]\n" + "ldr q7, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v6.4s, v0.s[1]\n" + "ldr q8, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v7.4s, v0.s[1]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v8.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q9, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v9.4s, v0.s[2]\n" + "ldr q10, [%x[B_ptr], #0x10]\n" + "ldr q11, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v10.4s, v0.s[2]\n" + "ldr q12, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v11.4s, v0.s[2]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v12.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q13, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v13.4s, v0.s[3]\n" + "ldr q14, [%x[B_ptr], #0x10]\n" + "ldr q15, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v14.4s, v0.s[3]\n" + "ldr q16, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v15.4s, v0.s[3]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v16.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "add x19, x19, #0x10\n" + "prfm pldl1keep, [x19, #0x80]\n" + "sub x20, x20, #0x4\n" + "cmp x20, #0x8\n" + "bge 40b\n" + "41:" // Width 4: Multiply loop: Single iteration only + "sub x20, x20, #0x4\n" + "ldr q0, [x19, #0x0]\n" + "ldr q17, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v17.4s, v0.s[0]\n" + "ldr q18, [%x[B_ptr], #0x10]\n" + "ldr q19, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v18.4s, v0.s[0]\n" + "ldr q20, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v19.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v20.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q21, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v21.4s, v0.s[1]\n" + "ldr q22, [%x[B_ptr], #0x10]\n" + "ldr q23, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v22.4s, v0.s[1]\n" + "ldr q1, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v23.4s, v0.s[1]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v1.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q2, [%x[B_ptr], #0x0]\n" + "fmla 
v24.4s, v2.4s, v0.s[2]\n" + "ldr q3, [%x[B_ptr], #0x10]\n" + "ldr q4, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v3.4s, v0.s[2]\n" + "ldr q5, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v4.4s, v0.s[2]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v5.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q6, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v6.4s, v0.s[3]\n" + "ldr q7, [%x[B_ptr], #0x10]\n" + "ldr q8, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v7.4s, v0.s[3]\n" + "ldr q9, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v8.4s, v0.s[3]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v9.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "add x19, x19, #0x10\n" + "prfm pldl1keep, [x19, #0x80]\n" + "42:" // Width 4: Multiply loop: Main loop skip + "cbz x20, 44f\n" + "43:" // Width 4: Multiply loop: Odd block loop + "ldr s0, [x19], #0x4\n" + "ldr q10, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v10.4s, v0.s[0]\n" + "ldr q11, [%x[B_ptr], #0x10]\n" + "ldr q12, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v11.4s, v0.s[0]\n" + "ldr q13, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v12.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "sub x20, x20, #0x1\n" + "fmla v27.4s, v13.4s, v0.s[0]\n" + "cbnz x20, 43b\n" + "44:" // Width 4: Multiply loop: No odd multiplies + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 45f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v17.4s\n" + "45:" // Width 4: No activation + "str q24, [%x[output_ptr], #0x0]\n" + "str q25, [%x[output_ptr], #0x10]\n" + "str q26, [%x[output_ptr], #0x20]\n" + "cmp %x[N], #0x10\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "blt 46f\n" + "str q27, [%x[output_ptr], #0x0]\n" + "add %x[output_ptr], %x[output_ptr], #0x10\n" + "b 48f\n" + "46:" // Width 4: Partial writeback + "tbz %x[N], #1, 47f\n" + "str d27, [%x[output_ptr]], #0x8\n" + "tbz %x[N], #0, 48f\n" + "st1 { v27.s }[2], [%x[output_ptr]]\n" + "b 48f\n" + "47:" // Width 4: Partial direct writeback: partial_1_12 + "tbz %x[N], #0, 48f\n" + "str s27, [%x[output_ptr], #0x0]\n" + "48:" // Width 4: Writeback done + "b 97f\n" + "49:" // Width 5 + "mov x20, %x[K]\n" + "mov x19, %x[A_ptr]\n" + "cbz x21, 50f\n" + "ldr q24, [x21, #0x0]\n" + "ldr q25, [x21, #0x10]\n" + "ldr q26, [x21, #0x20]\n" + "ldr q27, [x21, #0x30]\n" + "ldr q28, [x21, #0x40]\n" + "add x21, x21, #0x50\n" + "b 51f\n" + "50:" // Width 5: no bias + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "51:" // Width 5: setup done + "cmp x20, #0x4\n" + "blt 54f\n" + "cmp x20, #0x8\n" + "blt 53f\n" + "52:" // Width 5: Multiply loop: Main loop head + "ldr q0, [x19, #0x0]\n" + "ldr q1, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v1.4s, v0.s[0]\n" + "ldr q2, [%x[B_ptr], #0x10]\n" + "ldr q3, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v2.4s, v0.s[0]\n" + "ldr q4, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v3.4s, v0.s[0]\n" + "ldr q5, [%x[B_ptr], #0x40]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v27.4s, v4.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q6, [%x[B_ptr], #0x0]\n" + "fmla v28.4s, v5.4s, 
v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q7, [%x[B_ptr], #0x10]\n" + "fmla v24.4s, v6.4s, v0.s[1]\n" + "ldr q8, [%x[B_ptr], #0x20]\n" + "ldr q9, [%x[B_ptr], #0x30]\n" + "fmla v25.4s, v7.4s, v0.s[1]\n" + "ldr q10, [%x[B_ptr], #0x40]\n" + "fmla v26.4s, v8.4s, v0.s[1]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v9.4s, v0.s[1]\n" + "ldr q11, [%x[B_ptr], #0x0]\n" + "fmla v28.4s, v10.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q12, [%x[B_ptr], #0x10]\n" + "fmla v24.4s, v11.4s, v0.s[2]\n" + "ldr q13, [%x[B_ptr], #0x20]\n" + "ldr q14, [%x[B_ptr], #0x30]\n" + "fmla v25.4s, v12.4s, v0.s[2]\n" + "ldr q15, [%x[B_ptr], #0x40]\n" + "fmla v26.4s, v13.4s, v0.s[2]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v14.4s, v0.s[2]\n" + "ldr q16, [%x[B_ptr], #0x0]\n" + "fmla v28.4s, v15.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q17, [%x[B_ptr], #0x10]\n" + "fmla v24.4s, v16.4s, v0.s[3]\n" + "ldr q18, [%x[B_ptr], #0x20]\n" + "ldr q19, [%x[B_ptr], #0x30]\n" + "fmla v25.4s, v17.4s, v0.s[3]\n" + "ldr q20, [%x[B_ptr], #0x40]\n" + "fmla v26.4s, v18.4s, v0.s[3]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v19.4s, v0.s[3]\n" + "add x19, x19, #0x10\n" + "fmla v28.4s, v20.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "sub x20, x20, #0x4\n" + "prfm pldl1keep, [x19, #0x80]\n" + "cmp x20, #0x8\n" + "bge 52b\n" + "53:" // Width 5: Multiply loop: Single iteration only + "sub x20, x20, #0x4\n" + "ldr q0, [x19, #0x0]\n" + "ldr q21, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v21.4s, v0.s[0]\n" + "ldr q22, [%x[B_ptr], #0x10]\n" + "ldr q23, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v22.4s, v0.s[0]\n" + "ldr q1, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v23.4s, v0.s[0]\n" + "ldr q2, [%x[B_ptr], #0x40]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v27.4s, v1.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "ldr q3, [%x[B_ptr], #0x0]\n" + "fmla v28.4s, v2.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q4, [%x[B_ptr], #0x10]\n" + "fmla v24.4s, v3.4s, v0.s[1]\n" + "ldr q5, [%x[B_ptr], #0x20]\n" + "ldr q6, [%x[B_ptr], #0x30]\n" + "fmla v25.4s, v4.4s, v0.s[1]\n" + "ldr q7, [%x[B_ptr], #0x40]\n" + "fmla v26.4s, v5.4s, v0.s[1]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v6.4s, v0.s[1]\n" + "ldr q8, [%x[B_ptr], #0x0]\n" + "fmla v28.4s, v7.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q9, [%x[B_ptr], #0x10]\n" + "fmla v24.4s, v8.4s, v0.s[2]\n" + "ldr q10, [%x[B_ptr], #0x20]\n" + "ldr q11, [%x[B_ptr], #0x30]\n" + "fmla v25.4s, v9.4s, v0.s[2]\n" + "ldr q12, [%x[B_ptr], #0x40]\n" + "fmla v26.4s, v10.4s, v0.s[2]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v11.4s, v0.s[2]\n" + "ldr q13, [%x[B_ptr], #0x0]\n" + "fmla v28.4s, v12.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q14, [%x[B_ptr], #0x10]\n" + "fmla v24.4s, v13.4s, v0.s[3]\n" + "ldr q15, [%x[B_ptr], #0x20]\n" + "ldr q16, [%x[B_ptr], #0x30]\n" + "fmla v25.4s, v14.4s, v0.s[3]\n" + "ldr q17, [%x[B_ptr], #0x40]\n" + "fmla v26.4s, v15.4s, v0.s[3]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v16.4s, v0.s[3]\n" + "add x19, x19, #0x10\n" + "fmla v28.4s, v17.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "prfm pldl1keep, [x19, #0x80]\n" + "54:" 
// Width 5: Multiply loop: Main loop skip + "cbz x20, 56f\n" + "55:" // Width 5: Multiply loop: Odd block loop + "ldr s0, [x19], #0x4\n" + "ldr q18, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v18.4s, v0.s[0]\n" + "ldr q19, [%x[B_ptr], #0x10]\n" + "ldr q20, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v19.4s, v0.s[0]\n" + "ldr q21, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v20.4s, v0.s[0]\n" + "ldr q22, [%x[B_ptr], #0x40]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v27.4s, v21.4s, v0.s[0]\n" + "sub x20, x20, #0x1\n" + "fmla v28.4s, v22.4s, v0.s[0]\n" + "cbnz x20, 55b\n" + "56:" // Width 5: Multiply loop: No odd multiplies + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 57f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v17.4s\n" + "fmin v28.4s, v28.4s, v16.4s\n" + "fmax v28.4s, v28.4s, v17.4s\n" + "57:" // Width 5: No activation + "str q24, [%x[output_ptr], #0x0]\n" + "str q25, [%x[output_ptr], #0x10]\n" + "str q26, [%x[output_ptr], #0x20]\n" + "str q27, [%x[output_ptr], #0x30]\n" + "cmp %x[N], #0x14\n" + "add %x[output_ptr], %x[output_ptr], #0x40\n" + "blt 58f\n" + "str q28, [%x[output_ptr], #0x0]\n" + "add %x[output_ptr], %x[output_ptr], #0x10\n" + "b 60f\n" + "58:" // Width 5: Partial writeback + "tbz %x[N], #1, 59f\n" + "str d28, [%x[output_ptr]], #0x8\n" + "tbz %x[N], #0, 60f\n" + "st1 { v28.s }[2], [%x[output_ptr]]\n" + "b 60f\n" + "59:" // Width 5: Partial direct writeback: partial_1_16 + "tbz %x[N], #0, 60f\n" + "str s28, [%x[output_ptr], #0x0]\n" + "60:" // Width 5: Writeback done + "b 97f\n" + "61:" // Width 6 + "mov x20, %x[K]\n" + "mov x19, %x[A_ptr]\n" + "cbz x21, 62f\n" + "ldr q24, [x21, #0x0]\n" + "ldr q25, [x21, #0x10]\n" + "ldr q26, [x21, #0x20]\n" + "ldr q27, [x21, #0x30]\n" + "ldr q28, [x21, #0x40]\n" + "ldr q29, [x21, #0x50]\n" + "add x21, x21, #0x60\n" + "b 63f\n" + "62:" // Width 6: no bias + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "63:" // Width 6: setup done + "cmp x20, #0x4\n" + "blt 66f\n" + "cmp x20, #0x8\n" + "blt 65f\n" + "64:" // Width 6: Multiply loop: Main loop head + "ldr q0, [x19, #0x0]\n" + "ldr q1, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v1.4s, v0.s[0]\n" + "ldr q2, [%x[B_ptr], #0x10]\n" + "ldr q3, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v2.4s, v0.s[0]\n" + "ldr q4, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v3.4s, v0.s[0]\n" + "ldr q5, [%x[B_ptr], #0x40]\n" + "ldr q6, [%x[B_ptr], #0x50]\n" + "fmla v27.4s, v4.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v28.4s, v5.4s, v0.s[0]\n" + "ldr q7, [%x[B_ptr], #0x0]\n" + "fmla v29.4s, v6.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q8, [%x[B_ptr], #0x10]\n" + "fmla v24.4s, v7.4s, v0.s[1]\n" + "ldr q9, [%x[B_ptr], #0x20]\n" + "ldr q10, [%x[B_ptr], #0x30]\n" + "fmla v25.4s, v8.4s, v0.s[1]\n" + "ldr q11, [%x[B_ptr], #0x40]\n" + "fmla v26.4s, v9.4s, v0.s[1]\n" + "ldr q12, [%x[B_ptr], #0x50]\n" + "fmla v27.4s, v10.4s, v0.s[1]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v28.4s, v11.4s, v0.s[1]\n" + "ldr q13, [%x[B_ptr], #0x0]\n" + "fmla 
v29.4s, v12.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q14, [%x[B_ptr], #0x10]\n" + "fmla v24.4s, v13.4s, v0.s[2]\n" + "ldr q15, [%x[B_ptr], #0x20]\n" + "ldr q16, [%x[B_ptr], #0x30]\n" + "fmla v25.4s, v14.4s, v0.s[2]\n" + "ldr q17, [%x[B_ptr], #0x40]\n" + "ldr q18, [%x[B_ptr], #0x50]\n" + "fmla v26.4s, v15.4s, v0.s[2]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v16.4s, v0.s[2]\n" + "ldr q19, [%x[B_ptr], #0x0]\n" + "fmla v28.4s, v17.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q20, [%x[B_ptr], #0x10]\n" + "fmla v29.4s, v18.4s, v0.s[2]\n" + "ldr q21, [%x[B_ptr], #0x20]\n" + "ldr q22, [%x[B_ptr], #0x30]\n" + "fmla v24.4s, v19.4s, v0.s[3]\n" + "ldr q23, [%x[B_ptr], #0x40]\n" + "ldr q1, [%x[B_ptr], #0x50]\n" + "fmla v25.4s, v20.4s, v0.s[3]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v26.4s, v21.4s, v0.s[3]\n" + "add x19, x19, #0x10\n" + "fmla v27.4s, v22.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "sub x20, x20, #0x4\n" + "fmla v28.4s, v23.4s, v0.s[3]\n" + "prfm pldl1keep, [x19, #0x80]\n" + "cmp x20, #0x8\n" + "fmla v29.4s, v1.4s, v0.s[3]\n" + "bge 64b\n" + "65:" // Width 6: Multiply loop: Single iteration only + "sub x20, x20, #0x4\n" + "ldr q0, [x19, #0x0]\n" + "ldr q2, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v2.4s, v0.s[0]\n" + "ldr q3, [%x[B_ptr], #0x10]\n" + "ldr q4, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v3.4s, v0.s[0]\n" + "ldr q5, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v4.4s, v0.s[0]\n" + "ldr q6, [%x[B_ptr], #0x40]\n" + "ldr q7, [%x[B_ptr], #0x50]\n" + "fmla v27.4s, v5.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v28.4s, v6.4s, v0.s[0]\n" + "ldr q8, [%x[B_ptr], #0x0]\n" + "fmla v29.4s, v7.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q9, [%x[B_ptr], #0x10]\n" + "fmla v24.4s, v8.4s, v0.s[1]\n" + "ldr q10, [%x[B_ptr], #0x20]\n" + "ldr q11, [%x[B_ptr], #0x30]\n" + "fmla v25.4s, v9.4s, v0.s[1]\n" + "ldr q12, [%x[B_ptr], #0x40]\n" + "fmla v26.4s, v10.4s, v0.s[1]\n" + "ldr q13, [%x[B_ptr], #0x50]\n" + "fmla v27.4s, v11.4s, v0.s[1]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v28.4s, v12.4s, v0.s[1]\n" + "ldr q14, [%x[B_ptr], #0x0]\n" + "fmla v29.4s, v13.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q15, [%x[B_ptr], #0x10]\n" + "fmla v24.4s, v14.4s, v0.s[2]\n" + "ldr q16, [%x[B_ptr], #0x20]\n" + "ldr q17, [%x[B_ptr], #0x30]\n" + "fmla v25.4s, v15.4s, v0.s[2]\n" + "ldr q18, [%x[B_ptr], #0x40]\n" + "ldr q19, [%x[B_ptr], #0x50]\n" + "fmla v26.4s, v16.4s, v0.s[2]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v17.4s, v0.s[2]\n" + "ldr q20, [%x[B_ptr], #0x0]\n" + "fmla v28.4s, v18.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q21, [%x[B_ptr], #0x10]\n" + "fmla v29.4s, v19.4s, v0.s[2]\n" + "ldr q22, [%x[B_ptr], #0x20]\n" + "ldr q23, [%x[B_ptr], #0x30]\n" + "fmla v24.4s, v20.4s, v0.s[3]\n" + "ldr q1, [%x[B_ptr], #0x40]\n" + "ldr q2, [%x[B_ptr], #0x50]\n" + "fmla v25.4s, v21.4s, v0.s[3]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v26.4s, v22.4s, v0.s[3]\n" + "add x19, x19, #0x10\n" + "fmla v27.4s, v23.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "prfm pldl1keep, [x19, #0x80]\n" + "fmla v28.4s, v1.4s, v0.s[3]\n" + "fmla v29.4s, v2.4s, v0.s[3]\n" + "66:" // Width 6: Multiply loop: Main 
loop skip + "cbz x20, 68f\n" + "67:" // Width 6: Multiply loop: Odd block loop + "ldr s0, [x19], #0x4\n" + "ldr q3, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v3.4s, v0.s[0]\n" + "ldr q4, [%x[B_ptr], #0x10]\n" + "ldr q5, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v4.4s, v0.s[0]\n" + "ldr q6, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v5.4s, v0.s[0]\n" + "ldr q7, [%x[B_ptr], #0x40]\n" + "ldr q8, [%x[B_ptr], #0x50]\n" + "fmla v27.4s, v6.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "sub x20, x20, #0x1\n" + "fmla v28.4s, v7.4s, v0.s[0]\n" + "fmla v29.4s, v8.4s, v0.s[0]\n" + "cbnz x20, 67b\n" + "68:" // Width 6: Multiply loop: No odd multiplies + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 69f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v17.4s\n" + "fmin v28.4s, v28.4s, v16.4s\n" + "fmin v29.4s, v29.4s, v16.4s\n" + "fmax v28.4s, v28.4s, v17.4s\n" + "fmax v29.4s, v29.4s, v17.4s\n" + "69:" // Width 6: No activation + "str q24, [%x[output_ptr], #0x0]\n" + "str q25, [%x[output_ptr], #0x10]\n" + "str q26, [%x[output_ptr], #0x20]\n" + "str q27, [%x[output_ptr], #0x30]\n" + "str q28, [%x[output_ptr], #0x40]\n" + "cmp %x[N], #0x18\n" + "add %x[output_ptr], %x[output_ptr], #0x50\n" + "blt 70f\n" + "str q29, [%x[output_ptr], #0x0]\n" + "add %x[output_ptr], %x[output_ptr], #0x10\n" + "b 72f\n" + "70:" // Width 6: Partial writeback + "tbz %x[N], #1, 71f\n" + "str d29, [%x[output_ptr]], #0x8\n" + "tbz %x[N], #0, 72f\n" + "st1 { v29.s }[2], [%x[output_ptr]]\n" + "b 72f\n" + "71:" // Width 6: Partial direct writeback: partial_1_20 + "tbz %x[N], #0, 72f\n" + "str s29, [%x[output_ptr], #0x0]\n" + "72:" // Width 6: Writeback done + "b 97f\n" + "73:" // Width 7 + "mov x20, %x[K]\n" + "mov x19, %x[A_ptr]\n" + "cbz x21, 74f\n" + "ldr q24, [x21, #0x0]\n" + "ldr q25, [x21, #0x10]\n" + "ldr q26, [x21, #0x20]\n" + "ldr q27, [x21, #0x30]\n" + "ldr q28, [x21, #0x40]\n" + "ldr q29, [x21, #0x50]\n" + "ldr q30, [x21, #0x60]\n" + "add x21, x21, #0x70\n" + "b 75f\n" + "74:" // Width 7: no bias + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "75:" // Width 7: setup done + "cmp x20, #0x4\n" + "blt 78f\n" + "cmp x20, #0x8\n" + "blt 77f\n" + "76:" // Width 7: Multiply loop: Main loop head + "ldr q0, [x19, #0x0]\n" + "ldr q1, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v1.4s, v0.s[0]\n" + "ldr q2, [%x[B_ptr], #0x10]\n" + "ldr q3, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v2.4s, v0.s[0]\n" + "ldr q4, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v3.4s, v0.s[0]\n" + "ldr q5, [%x[B_ptr], #0x40]\n" + "ldr q6, [%x[B_ptr], #0x50]\n" + "fmla v27.4s, v4.4s, v0.s[0]\n" + "ldr q7, [%x[B_ptr], #0x60]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v28.4s, v5.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v29.4s, v6.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q8, [%x[B_ptr], #0x0]\n" + "fmla v30.4s, v7.4s, v0.s[0]\n" + "ldr q9, [%x[B_ptr], #0x10]\n" + "ldr q10, [%x[B_ptr], #0x20]\n" + "fmla v24.4s, v8.4s, v0.s[1]\n" + "ldr q11, [%x[B_ptr], #0x30]\n" + "ldr q12, [%x[B_ptr], #0x40]\n" + "fmla v25.4s, v9.4s, v0.s[1]\n" + "ldr 
q13, [%x[B_ptr], #0x50]\n" + "fmla v26.4s, v10.4s, v0.s[1]\n" + "ldr q14, [%x[B_ptr], #0x60]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v27.4s, v11.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v28.4s, v12.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q15, [%x[B_ptr], #0x0]\n" + "fmla v29.4s, v13.4s, v0.s[1]\n" + "ldr q16, [%x[B_ptr], #0x10]\n" + "ldr q17, [%x[B_ptr], #0x20]\n" + "fmla v30.4s, v14.4s, v0.s[1]\n" + "ldr q18, [%x[B_ptr], #0x30]\n" + "fmla v24.4s, v15.4s, v0.s[2]\n" + "ldr q19, [%x[B_ptr], #0x40]\n" + "ldr q20, [%x[B_ptr], #0x50]\n" + "fmla v25.4s, v16.4s, v0.s[2]\n" + "ldr q21, [%x[B_ptr], #0x60]\n" + "fmla v26.4s, v17.4s, v0.s[2]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v18.4s, v0.s[2]\n" + "ldr q22, [%x[B_ptr], #0x0]\n" + "fmla v28.4s, v19.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q23, [%x[B_ptr], #0x10]\n" + "fmla v29.4s, v20.4s, v0.s[2]\n" + "ldr q1, [%x[B_ptr], #0x20]\n" + "ldr q2, [%x[B_ptr], #0x30]\n" + "fmla v30.4s, v21.4s, v0.s[2]\n" + "ldr q3, [%x[B_ptr], #0x40]\n" + "fmla v24.4s, v22.4s, v0.s[3]\n" + "ldr q4, [%x[B_ptr], #0x50]\n" + "ldr q5, [%x[B_ptr], #0x60]\n" + "fmla v25.4s, v23.4s, v0.s[3]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v26.4s, v1.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v2.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "add x19, x19, #0x10\n" + "fmla v28.4s, v3.4s, v0.s[3]\n" + "prfm pldl1keep, [x19, #0x80]\n" + "sub x20, x20, #0x4\n" + "fmla v29.4s, v4.4s, v0.s[3]\n" + "cmp x20, #0x8\n" + "fmla v30.4s, v5.4s, v0.s[3]\n" + "bge 76b\n" + "77:" // Width 7: Multiply loop: Single iteration only + "sub x20, x20, #0x4\n" + "ldr q0, [x19, #0x0]\n" + "ldr q6, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v6.4s, v0.s[0]\n" + "ldr q7, [%x[B_ptr], #0x10]\n" + "ldr q8, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v7.4s, v0.s[0]\n" + "ldr q9, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v8.4s, v0.s[0]\n" + "ldr q10, [%x[B_ptr], #0x40]\n" + "ldr q11, [%x[B_ptr], #0x50]\n" + "fmla v27.4s, v9.4s, v0.s[0]\n" + "ldr q12, [%x[B_ptr], #0x60]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v28.4s, v10.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v29.4s, v11.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q13, [%x[B_ptr], #0x0]\n" + "fmla v30.4s, v12.4s, v0.s[0]\n" + "ldr q14, [%x[B_ptr], #0x10]\n" + "ldr q15, [%x[B_ptr], #0x20]\n" + "fmla v24.4s, v13.4s, v0.s[1]\n" + "ldr q16, [%x[B_ptr], #0x30]\n" + "ldr q17, [%x[B_ptr], #0x40]\n" + "fmla v25.4s, v14.4s, v0.s[1]\n" + "ldr q18, [%x[B_ptr], #0x50]\n" + "fmla v26.4s, v15.4s, v0.s[1]\n" + "ldr q19, [%x[B_ptr], #0x60]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v27.4s, v16.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v28.4s, v17.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q20, [%x[B_ptr], #0x0]\n" + "fmla v29.4s, v18.4s, v0.s[1]\n" + "ldr q21, [%x[B_ptr], #0x10]\n" + "ldr q22, [%x[B_ptr], #0x20]\n" + "fmla v30.4s, v19.4s, v0.s[1]\n" + "ldr q23, [%x[B_ptr], #0x30]\n" + "fmla v24.4s, v20.4s, v0.s[2]\n" + "ldr q1, [%x[B_ptr], #0x40]\n" + "ldr q2, [%x[B_ptr], #0x50]\n" + "fmla v25.4s, v21.4s, v0.s[2]\n" + "ldr q3, [%x[B_ptr], #0x60]\n" + "fmla v26.4s, v22.4s, v0.s[2]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v23.4s, v0.s[2]\n" + "ldr q4, [%x[B_ptr], #0x0]\n" + "fmla v28.4s, v1.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr 
q5, [%x[B_ptr], #0x10]\n" + "fmla v29.4s, v2.4s, v0.s[2]\n" + "ldr q6, [%x[B_ptr], #0x20]\n" + "ldr q7, [%x[B_ptr], #0x30]\n" + "fmla v30.4s, v3.4s, v0.s[2]\n" + "ldr q8, [%x[B_ptr], #0x40]\n" + "fmla v24.4s, v4.4s, v0.s[3]\n" + "ldr q9, [%x[B_ptr], #0x50]\n" + "ldr q10, [%x[B_ptr], #0x60]\n" + "fmla v25.4s, v5.4s, v0.s[3]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v26.4s, v6.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v27.4s, v7.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "add x19, x19, #0x10\n" + "fmla v28.4s, v8.4s, v0.s[3]\n" + "prfm pldl1keep, [x19, #0x80]\n" + "fmla v29.4s, v9.4s, v0.s[3]\n" + "fmla v30.4s, v10.4s, v0.s[3]\n" + "78:" // Width 7: Multiply loop: Main loop skip + "cbz x20, 80f\n" + "79:" // Width 7: Multiply loop: Odd block loop + "ldr s0, [x19], #0x4\n" + "ldr q11, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v11.4s, v0.s[0]\n" + "ldr q12, [%x[B_ptr], #0x10]\n" + "ldr q13, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v12.4s, v0.s[0]\n" + "ldr q14, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v13.4s, v0.s[0]\n" + "ldr q15, [%x[B_ptr], #0x40]\n" + "ldr q16, [%x[B_ptr], #0x50]\n" + "fmla v27.4s, v14.4s, v0.s[0]\n" + "ldr q17, [%x[B_ptr], #0x60]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "fmla v28.4s, v15.4s, v0.s[0]\n" + "fmla v29.4s, v16.4s, v0.s[0]\n" + "sub x20, x20, #0x1\n" + "fmla v30.4s, v17.4s, v0.s[0]\n" + "cbnz x20, 79b\n" + "80:" // Width 7: Multiply loop: No odd multiplies + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 81f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v17.4s\n" + "fmin v28.4s, v28.4s, v16.4s\n" + "fmin v29.4s, v29.4s, v16.4s\n" + "fmin v30.4s, v30.4s, v16.4s\n" + "fmax v28.4s, v28.4s, v17.4s\n" + "fmax v29.4s, v29.4s, v17.4s\n" + "fmax v30.4s, v30.4s, v17.4s\n" + "81:" // Width 7: No activation + "str q24, [%x[output_ptr], #0x0]\n" + "str q25, [%x[output_ptr], #0x10]\n" + "str q26, [%x[output_ptr], #0x20]\n" + "str q27, [%x[output_ptr], #0x30]\n" + "str q28, [%x[output_ptr], #0x40]\n" + "str q29, [%x[output_ptr], #0x50]\n" + "cmp %x[N], #0x1c\n" + "add %x[output_ptr], %x[output_ptr], #0x60\n" + "blt 82f\n" + "str q30, [%x[output_ptr], #0x0]\n" + "add %x[output_ptr], %x[output_ptr], #0x10\n" + "b 84f\n" + "82:" // Width 7: Partial writeback + "tbz %x[N], #1, 83f\n" + "str d30, [%x[output_ptr]], #0x8\n" + "tbz %x[N], #0, 84f\n" + "st1 { v30.s }[2], [%x[output_ptr]]\n" + "b 84f\n" + "83:" // Width 7: Partial direct writeback: partial_1_24 + "tbz %x[N], #0, 84f\n" + "str s30, [%x[output_ptr], #0x0]\n" + "84:" // Width 7: Writeback done + "b 97f\n" + "85:" // Width 8 + "mov x20, %x[K]\n" + "mov x19, %x[A_ptr]\n" + "cbz x21, 86f\n" + "ldr q24, [x21, #0x0]\n" + "ldr q25, [x21, #0x10]\n" + "ldr q26, [x21, #0x20]\n" + "ldr q27, [x21, #0x30]\n" + "ldr q28, [x21, #0x40]\n" + "ldr q29, [x21, #0x50]\n" + "ldr q30, [x21, #0x60]\n" + "ldr q31, [x21, #0x70]\n" + "add x21, x21, #0x80\n" + "b 87f\n" + "86:" // Width 8: no bias + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "87:" // Width 8: setup done + "cmp x20, 
#0x4\n" + "blt 90f\n" + "cmp x20, #0x8\n" + "blt 89f\n" + "88:" // Width 8: Multiply loop: Main loop head + "ldr q0, [x19, #0x0]\n" + "ldr q1, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v1.4s, v0.s[0]\n" + "ldr q2, [%x[B_ptr], #0x10]\n" + "ldr q3, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v2.4s, v0.s[0]\n" + "ldr q4, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v3.4s, v0.s[0]\n" + "ldr q5, [%x[B_ptr], #0x40]\n" + "ldr q6, [%x[B_ptr], #0x50]\n" + "fmla v27.4s, v4.4s, v0.s[0]\n" + "ldr q7, [%x[B_ptr], #0x60]\n" + "ldr q8, [%x[B_ptr], #0x70]\n" + "fmla v28.4s, v5.4s, v0.s[0]\n" + "fmla v29.4s, v6.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v30.4s, v7.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q9, [%x[B_ptr], #0x0]\n" + "fmla v31.4s, v8.4s, v0.s[0]\n" + "ldr q10, [%x[B_ptr], #0x10]\n" + "ldr q11, [%x[B_ptr], #0x20]\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "ldr q12, [%x[B_ptr], #0x30]\n" + "ldr q13, [%x[B_ptr], #0x40]\n" + "fmla v25.4s, v10.4s, v0.s[1]\n" + "fmla v26.4s, v11.4s, v0.s[1]\n" + "ldr q14, [%x[B_ptr], #0x50]\n" + "ldr q15, [%x[B_ptr], #0x60]\n" + "fmla v27.4s, v12.4s, v0.s[1]\n" + "ldr q16, [%x[B_ptr], #0x70]\n" + "fmla v28.4s, v13.4s, v0.s[1]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v29.4s, v14.4s, v0.s[1]\n" + "ldr q17, [%x[B_ptr], #0x0]\n" + "fmla v30.4s, v15.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q18, [%x[B_ptr], #0x10]\n" + "fmla v31.4s, v16.4s, v0.s[1]\n" + "ldr q19, [%x[B_ptr], #0x20]\n" + "ldr q20, [%x[B_ptr], #0x30]\n" + "fmla v24.4s, v17.4s, v0.s[2]\n" + "ldr q21, [%x[B_ptr], #0x40]\n" + "ldr q22, [%x[B_ptr], #0x50]\n" + "fmla v25.4s, v18.4s, v0.s[2]\n" + "ldr q23, [%x[B_ptr], #0x60]\n" + "fmla v26.4s, v19.4s, v0.s[2]\n" + "ldr q1, [%x[B_ptr], #0x70]\n" + "fmla v27.4s, v20.4s, v0.s[2]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v28.4s, v21.4s, v0.s[2]\n" + "ldr q2, [%x[B_ptr], #0x0]\n" + "fmla v29.4s, v22.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q3, [%x[B_ptr], #0x10]\n" + "fmla v30.4s, v23.4s, v0.s[2]\n" + "ldr q4, [%x[B_ptr], #0x20]\n" + "ldr q5, [%x[B_ptr], #0x30]\n" + "fmla v31.4s, v1.4s, v0.s[2]\n" + "ldr q6, [%x[B_ptr], #0x40]\n" + "fmla v24.4s, v2.4s, v0.s[3]\n" + "ldr q7, [%x[B_ptr], #0x50]\n" + "ldr q8, [%x[B_ptr], #0x60]\n" + "fmla v25.4s, v3.4s, v0.s[3]\n" + "ldr q9, [%x[B_ptr], #0x70]\n" + "fmla v26.4s, v4.4s, v0.s[3]\n" + "fmla v27.4s, v5.4s, v0.s[3]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v28.4s, v6.4s, v0.s[3]\n" + "add x19, x19, #0x10\n" + "fmla v29.4s, v7.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "sub x20, x20, #0x4\n" + "fmla v30.4s, v8.4s, v0.s[3]\n" + "prfm pldl1keep, [x19, #0x80]\n" + "cmp x20, #0x8\n" + "fmla v31.4s, v9.4s, v0.s[3]\n" + "bge 88b\n" + "89:" // Width 8: Multiply loop: Single iteration only + "sub x20, x20, #0x4\n" + "ldr q0, [x19, #0x0]\n" + "ldr q10, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v10.4s, v0.s[0]\n" + "ldr q11, [%x[B_ptr], #0x10]\n" + "ldr q12, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v11.4s, v0.s[0]\n" + "ldr q13, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v12.4s, v0.s[0]\n" + "ldr q14, [%x[B_ptr], #0x40]\n" + "ldr q15, [%x[B_ptr], #0x50]\n" + "fmla v27.4s, v13.4s, v0.s[0]\n" + "ldr q16, [%x[B_ptr], #0x60]\n" + "ldr q17, [%x[B_ptr], #0x70]\n" + "fmla v28.4s, v14.4s, v0.s[0]\n" + "fmla v29.4s, v15.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, 
[%x[B_ptr], #0x400]\n" + "fmla v30.4s, v16.4s, v0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q18, [%x[B_ptr], #0x0]\n" + "fmla v31.4s, v17.4s, v0.s[0]\n" + "ldr q19, [%x[B_ptr], #0x10]\n" + "ldr q20, [%x[B_ptr], #0x20]\n" + "fmla v24.4s, v18.4s, v0.s[1]\n" + "ldr q21, [%x[B_ptr], #0x30]\n" + "ldr q22, [%x[B_ptr], #0x40]\n" + "fmla v25.4s, v19.4s, v0.s[1]\n" + "fmla v26.4s, v20.4s, v0.s[1]\n" + "ldr q23, [%x[B_ptr], #0x50]\n" + "ldr q1, [%x[B_ptr], #0x60]\n" + "fmla v27.4s, v21.4s, v0.s[1]\n" + "ldr q2, [%x[B_ptr], #0x70]\n" + "fmla v28.4s, v22.4s, v0.s[1]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v29.4s, v23.4s, v0.s[1]\n" + "ldr q3, [%x[B_ptr], #0x0]\n" + "fmla v30.4s, v1.4s, v0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q4, [%x[B_ptr], #0x10]\n" + "fmla v31.4s, v2.4s, v0.s[1]\n" + "ldr q5, [%x[B_ptr], #0x20]\n" + "ldr q6, [%x[B_ptr], #0x30]\n" + "fmla v24.4s, v3.4s, v0.s[2]\n" + "ldr q7, [%x[B_ptr], #0x40]\n" + "ldr q8, [%x[B_ptr], #0x50]\n" + "fmla v25.4s, v4.4s, v0.s[2]\n" + "ldr q9, [%x[B_ptr], #0x60]\n" + "fmla v26.4s, v5.4s, v0.s[2]\n" + "ldr q10, [%x[B_ptr], #0x70]\n" + "fmla v27.4s, v6.4s, v0.s[2]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v28.4s, v7.4s, v0.s[2]\n" + "ldr q11, [%x[B_ptr], #0x0]\n" + "fmla v29.4s, v8.4s, v0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ldr q12, [%x[B_ptr], #0x10]\n" + "fmla v30.4s, v9.4s, v0.s[2]\n" + "ldr q13, [%x[B_ptr], #0x20]\n" + "ldr q14, [%x[B_ptr], #0x30]\n" + "fmla v31.4s, v10.4s, v0.s[2]\n" + "ldr q15, [%x[B_ptr], #0x40]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "ldr q16, [%x[B_ptr], #0x50]\n" + "ldr q17, [%x[B_ptr], #0x60]\n" + "fmla v25.4s, v12.4s, v0.s[3]\n" + "ldr q18, [%x[B_ptr], #0x70]\n" + "fmla v26.4s, v13.4s, v0.s[3]\n" + "fmla v27.4s, v14.4s, v0.s[3]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla v28.4s, v15.4s, v0.s[3]\n" + "add x19, x19, #0x10\n" + "fmla v29.4s, v16.4s, v0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla v30.4s, v17.4s, v0.s[3]\n" + "prfm pldl1keep, [x19, #0x80]\n" + "fmla v31.4s, v18.4s, v0.s[3]\n" + "90:" // Width 8: Multiply loop: Main loop skip + "cbz x20, 92f\n" + "91:" // Width 8: Multiply loop: Odd block loop + "ldr s0, [x19], #0x4\n" + "ldr q19, [%x[B_ptr], #0x0]\n" + "fmla v24.4s, v19.4s, v0.s[0]\n" + "ldr q20, [%x[B_ptr], #0x10]\n" + "ldr q21, [%x[B_ptr], #0x20]\n" + "fmla v25.4s, v20.4s, v0.s[0]\n" + "ldr q22, [%x[B_ptr], #0x30]\n" + "fmla v26.4s, v21.4s, v0.s[0]\n" + "ldr q23, [%x[B_ptr], #0x40]\n" + "ldr q1, [%x[B_ptr], #0x50]\n" + "fmla v27.4s, v22.4s, v0.s[0]\n" + "ldr q2, [%x[B_ptr], #0x60]\n" + "ldr q3, [%x[B_ptr], #0x70]\n" + "fmla v28.4s, v23.4s, v0.s[0]\n" + "fmla v29.4s, v1.4s, v0.s[0]\n" + "add %x[B_ptr], %x[B_ptr], #0x80\n" + "sub x20, x20, #0x1\n" + "fmla v30.4s, v2.4s, v0.s[0]\n" + "fmla v31.4s, v3.4s, v0.s[0]\n" + "cbnz x20, 91b\n" + "92:" // Width 8: Multiply loop: No odd multiplies + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 93f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v17.4s\n" + "fmin v28.4s, v28.4s, v16.4s\n" + "fmin 
v29.4s, v29.4s, v16.4s\n" + "fmin v30.4s, v30.4s, v16.4s\n" + "fmax v28.4s, v28.4s, v17.4s\n" + "fmax v29.4s, v29.4s, v17.4s\n" + "fmax v30.4s, v30.4s, v17.4s\n" + "fmin v31.4s, v31.4s, v16.4s\n" + "fmax v31.4s, v31.4s, v17.4s\n" + "93:" // Width 8: No activation + "str q24, [%x[output_ptr], #0x0]\n" + "str q25, [%x[output_ptr], #0x10]\n" + "str q26, [%x[output_ptr], #0x20]\n" + "str q27, [%x[output_ptr], #0x30]\n" + "str q28, [%x[output_ptr], #0x40]\n" + "str q29, [%x[output_ptr], #0x50]\n" + "str q30, [%x[output_ptr], #0x60]\n" + "cmp %x[N], #0x20\n" + "add %x[output_ptr], %x[output_ptr], #0x70\n" + "blt 94f\n" + "str q31, [%x[output_ptr], #0x0]\n" + "add %x[output_ptr], %x[output_ptr], #0x10\n" + "b 96f\n" + "94:" // Width 8: Partial writeback + "tbz %x[N], #1, 95f\n" + "str d31, [%x[output_ptr]], #0x8\n" + "tbz %x[N], #0, 96f\n" + "st1 { v31.s }[2], [%x[output_ptr]]\n" + "b 96f\n" + "95:" // Width 8: Partial direct writeback: partial_1_28 + "tbz %x[N], #0, 96f\n" + "str s31, [%x[output_ptr], #0x0]\n" + "96:" // Width 8: Writeback done + "subs x22, x22, #0x8\n" + "sub %x[N], %x[N], #0x20\n" + "bgt 1b\n" + "97:" // Exit + + : [B_ptr] "+r" (B_ptr), [N] "+r" (N), [output_ptr] "+r" (output_ptr) + : [A_ptr] "r" (A_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22" + ); +} + +} // namespace arm_gemm + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp similarity index 74% rename from src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp rename to src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp index 79cae6002a..24e258e4b8 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp @@ -25,20 +25,21 @@ #if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)) +#include "../performance_parameters.hpp" #include "../std_transforms_fixed.hpp" namespace arm_gemm { // Actual kernel implementations -void a64_hgemm_asimd_24x8(const __fp16 *, const __fp16 *, __fp16 *, int, int, int); -void a64_hgemm_asimd_24x8_a55r1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int); -void a64_hgemm_asimd_24x8_x1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int); +void a64_hgemm_asimd_8x24(const __fp16 *, const __fp16 *, __fp16 *, int, int, int); +void a64_hgemm_asimd_8x24_a55r1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int); +void a64_hgemm_asimd_8x24_x1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int); -// 24x8 HGEMM "strategy" class. Describes the kernel properties. +// 8x24 HGEMM "strategy" class. Describes the kernel properties. // // The generic "gemm_opt" function will instantiate one of these (allowing // the constructor to pick a kernel implementation). -class hgemm_24x8 { +class cls_a64_hgemm_8x24 { public: typedef __fp16 operand_type; typedef __fp16 result_type; @@ -61,16 +62,27 @@ class hgemm_24x8 { // Use the standard fixed size transforms. 
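[Annotation, added for exposition; not part of the upstream patch.] The class comment above describes arm_gemm's strategy pattern: a templated driver (e.g. the interleaved GEMM wrapper) instantiates the strategy, whose constructor picks the best kernel variant for the detected CPU, and the driver then calls it through the kernel member. A minimal sketch of that usage, assuming only the members declared here; run_hgemm and its panel arguments are placeholders, not library API:

    // Hypothetical caller, for illustration only -- not ACL API.
    void run_hgemm(const CPUInfo *ci,
                   const __fp16 *a_panel, const __fp16 *b_panel, __fp16 *c_panel,
                   int ablocks, int bblocks, int K) {
        arm_gemm::cls_a64_hgemm_8x24 strat(ci);  // ctor selects a55r1 / x1 / generic
        strat.kernel(a_panel, b_panel, c_panel, ablocks, bblocks, K);
    }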
     StdTransformsFixed<operand_type, result_type, 8, 24> transforms = {};
+    static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+    {
+        switch (ci->get_cpu_model()) {
+            case CPUModel::A55r1:
+                return { 7.16, 1.14, 0.67 };
+
+            default:
+                return { 12.67, 3.98, 1.16 };
+        }
+    }
+
     // Default to the generic kernel
-    kern_type kernel = a64_hgemm_asimd_24x8;
+    kern_type kernel = a64_hgemm_asimd_8x24;
-    hgemm_24x8(const CPUInfo *ci) {
+    cls_a64_hgemm_8x24(const CPUInfo *ci) {
         auto model = ci->get_cpu_model();
         if (model == CPUModel::A55r1) {
-            kernel = a64_hgemm_asimd_24x8_a55r1;
+            kernel = a64_hgemm_asimd_8x24_a55r1;
         } else if (model == CPUModel::X1) {
-            kernel = a64_hgemm_asimd_24x8_x1;
+            kernel = a64_hgemm_asimd_8x24_x1;
         }
     }
 };
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp
similarity index 99%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp
index 829ae30001..29cdd33893 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp
@@ -41,7 +41,7 @@ namespace arm_gemm {
-void a64_hgemm_asimd_24x8_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+void a64_hgemm_asimd_8x24_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
     const __fp16 *a_ptr = Apanel;
     __fp16 *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp
similarity index 99%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp
index 657fade944..c9c48dd1c0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp
@@ -34,14 +34,14 @@
 // Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order. // Assume that "Cpanel" points to a chunk of C output blocks (each size -// 24x8), the chunks being arranged in a row major fashion. +// 8x24), the chunks being arranged in a row major fashion. // // Note that the intent of this is that either ablocks or bblocks will be 1 // - this construction allows the output loop to proceed in either order. namespace arm_gemm { -void a64_hgemm_asimd_24x8_x1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) { +void a64_hgemm_asimd_8x24_x1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) { const __fp16 *a_ptr = Apanel; __fp16 *c_ptr = Cpanel; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp new file mode 100644 index 0000000000..a76c9949de --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+#include "../bfloat.hpp"
+
+#define ARGLIST \
+    unsigned int, const unsigned int *, \
+    IndirectInputArg<bfloat16>, \
+    size_t, size_t, \
+    const bfloat16 *, \
+    IndirectOutputArg<float>, \
+    const float *, Activation, bool
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void a64_hybrid_bf16fp32_dot_6x16( ARGLIST );
+
+class cls_a64_hybrid_bf16fp32_dot_6x16
+{
+public:
+    typedef bfloat16 operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)( ARGLIST );
+
+    /* Kernel blocking parameters */
+    static constexpr unsigned int out_height()
+    {
+        return 6;
+    }
+
+    static unsigned int out_width()
+    {
+        return 16;
+    }
+
+    static constexpr unsigned int k_unroll()
+    {
+        return 2;
+    }
+
+    static constexpr bool supports_accumulate()
+    {
+        return true;
+    }
+
+    StdTransformsFixed<operand_type, result_type, 6, 16, 2> transforms = {};
+
+    // Default to the generic kernel
+    kern_type kernel=a64_hybrid_bf16fp32_dot_6x16;
+
+    cls_a64_hybrid_bf16fp32_dot_6x16(const CPUInfo *)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
new file mode 100644
index 0000000000..be680ed645
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
@@ -0,0 +1,3668 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_bf16fp32_dot_6x16 (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<bfloat16> A_arg,
+    size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg,
+    const float *bias, Activation act, bool accumulate
+)
+{
+    struct KernelArgs {
+        float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+        float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const bfloat16 *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    if (accumulate) {
+        flags |= 0x1;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    switch(act.type) {
+        default:
+        case Activation::Type::None:
+            break;
+        case Activation::Type::BoundedReLU:
+            ka.maxval = static_cast<float>(act.param1);
+            /* fall through */
+        case Activation::Type::ReLU:
+            ka.minval = 0;
+            flags |= 0x2;
+            break;
+    }
+    __asm__ __volatile__(
+
+      "1:" // Row loop
+      "cmp %x[M], #0x6\n"
+      "bge 186f\n"
+      "cmp %x[M], #0x4\n"
+      "bgt 149f\n"
+      "beq 112f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 75f\n"
+      "beq 38f\n"
+      "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x14, %x[bias]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 2f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "b 3f\n"
+      "2:" // Height 1: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "3:" // Height 1: Column loop
+      "cbz x14, 4f\n"
+      "ldr q8, [x14, #0x0]\n"
+      "ldr q9, [x14, #0x10]\n"
+      "ldr q10, [x14, #0x20]\n"
+      "ldr q11, [x14, #0x30]\n"
+      "add x14, x14, #0x40\n"
+      "b 15f\n"
+      "4:" // Height 1: no bias
+      "tbz %x[flags], #0, 14f\n"
+      "cmp x16, #0x10\n"
+      "bge 13f\n"
+      "tbz x16, #3, 8f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "tbz x16, #2, 6f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "tbz x16, #1, 5f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "tbz x16, #0, 12f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "b 12f\n"
+      "5:" // Height 1: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x16, #0, 12f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "b 12f\n"
+      "6:" // Height 1: Partial accumulate: partial_2_8
+      "tbz x16, #1, 7f\n"
+      "ldr d10, [x13], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x16, #0, 12f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "b 12f\n"
+      "7:" // Height 1: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x16, #0, 12f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "b 12f\n"
+      "8:" // Height 1: Partial accumulate: partial_4_0
+      "tbz x16, #2, 10f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "tbz x16, #1, 9f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "tbz x16, #0, 12f\n" +
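[Annotation, added for exposition; not part of the upstream patch.] The "Partial accumulate" ladder above handles a ragged tail of output columns by testing the remaining count in x16 one bit at a time: bit 3 loads eight floats (two q-registers), bit 2 loads four, bit 1 loads two via a d-register, and bit 0 a single lane. Along each path x19 records the bytes already consumed, so that label 12 can rewind the output pointer with "sub x13, x13, x19" before the writeback code reuses it.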
"ld1 { v9.s }[2], [x13]\n" + "b 12f\n" + "9:" // Height 1: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 12f\n" + "ldr s9, [x13, #0x0]\n" + "b 12f\n" + "10:" // Height 1: Partial accumulate: partial_2_0 + "tbz x16, #1, 11f\n" + "ldr d8, [x13], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 12f\n" + "ld1 { v8.s }[2], [x13]\n" + "b 12f\n" + "11:" // Height 1: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "12:" // Height 1: Partial accumulate: Done + "sub x13, x13, x19\n" + "b 15f\n" + "13:" // Height 1: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "b 15f\n" + "14:" // Height 1: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "15:" // Height 1: setup done + "mov x12, #0x0\n" + "16:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 17f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "cbnz x12, 18f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "b 18f\n" + "17:" // Height 1: setup direct input + "mov x10, %x[input_ptr]\n" + "18:" // Height 1: input setup done + "cmp x11, #0x8\n" + "blt 21f\n" + "cmp x11, #0x10\n" + "blt 20f\n" + "19:" // Height 1: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr q7, [x15, #0x50]\n" + ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "sub x11, x11, #0x8\n" + ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "cmp x11, #0x10\n" + ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" + "ldr q6, [x15, #0x80]\n" + ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" + "ldr q7, [x15, #0x90]\n" + ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + "bge 19b\n" + "20:" // Height 1: Multiply loop: Single iteration only + "sub x11, x11, #0x8\n" + "ldr q0, [x10, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + 
"ldr q7, [x15, #0x50]\n" + ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" + "ldr q6, [x15, #0x60]\n" + ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" + "ldr q7, [x15, #0x70]\n" + ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" + "ldr q6, [x15, #0x80]\n" + ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" + "ldr q7, [x15, #0x90]\n" + ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + "21:" // Height 1: Multiply loop: Main loop skip + "cbz x11, 26f\n" + "cmp x11, #0x2\n" + "blt 23f\n" + "22:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "sub x11, x11, #0x2\n" + "add x15, x15, #0x40\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + "cmp x11, #0x2\n" + "bge 22b\n" + "cbz x11, 26f\n" + "23:" // Height 1: Multiply loop: Skip odd blocks + "tbz x11, #1, 24f\n" + "ldr s0, [x10], #0x4\n" + "tbz x11, #0, 25f\n" + "ld1 { v0.h }[2], [x10]\n" + "b 25f\n" + "24:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr h0, [x10, #0x0]\n" + "25:" // Height 1: Multiply loop: Ragged operand read: Done + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + "26:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 16b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "tbz %x[flags], #1, 27f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "27:" // Height 1: No activation + "cmp x16, #0x10\n" + "bge 36f\n" + "tbz x16, #3, 31f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "tbz x16, #2, 29f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "tbz x16, #1, 28f\n" + "str d11, [x13], #0x8\n" + "tbz x16, #0, 35f\n" + "st1 { v11.s }[2], [x13]\n" + "b 35f\n" + "28:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x16, #0, 35f\n" + "str s11, [x13, #0x0]\n" + "b 35f\n" + "29:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x16, #1, 30f\n" + "str d10, [x13], #0x8\n" + "tbz x16, #0, 35f\n" + "st1 { v10.s }[2], [x13]\n" + "b 35f\n" + "30:" // Height 1: Partial direct 
writeback: partial_1_8 + "tbz x16, #0, 35f\n" + "str s10, [x13, #0x0]\n" + "b 35f\n" + "31:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x16, #2, 33f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "tbz x16, #1, 32f\n" + "str d9, [x13], #0x8\n" + "tbz x16, #0, 35f\n" + "st1 { v9.s }[2], [x13]\n" + "b 35f\n" + "32:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x16, #0, 35f\n" + "str s9, [x13, #0x0]\n" + "b 35f\n" + "33:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x16, #1, 34f\n" + "str d8, [x13], #0x8\n" + "tbz x16, #0, 35f\n" + "st1 { v8.s }[2], [x13]\n" + "b 35f\n" + "34:" // Height 1: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "35:" // Height 1: Partial direct writeback: Done + "b 37f\n" + "36:" // Height 1: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + "37:" // Height 1: Writeback done + "subs x16, x16, #0x10\n" + "bgt 3b\n" + "b 224f\n" + "38:" // Height 2 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 39f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19, LSL #2\n" + "b 40f\n" + "39:" // Height 2: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "40:" // Height 2: Column loop + "cbz x14, 41f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "ldr q10, [x14, #0x20]\n" + "mov v13.16b, v9.16b\n" + "ldr q11, [x14, #0x30]\n" + "mov v14.16b, v10.16b\n" + "add x14, x14, #0x40\n" + "mov v15.16b, v11.16b\n" + "b 52f\n" + "41:" // Height 2: no bias + "tbz %x[flags], #0, 51f\n" + "cmp x16, #0x10\n" + "bge 50f\n" + "tbz x16, #3, 45f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "tbz x16, #2, 43f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "tbz x16, #1, 42f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "tbz x16, #0, 49f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "b 49f\n" + "42:" // Height 2: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 49f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "b 49f\n" + "43:" // Height 2: Partial accumulate: partial_2_8 + "tbz x16, #1, 44f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "mov x19, #0x28\n" + "tbz x16, #0, 49f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "b 49f\n" + "44:" // Height 2: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 49f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "b 49f\n" + "45:" // Height 2: Partial accumulate: partial_4_0 + "tbz x16, #2, 47f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "tbz x16, #1, 46f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "tbz x16, #0, 49f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "b 49f\n" + "46:" // Height 2: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 49f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "b 49f\n" + "47:" // Height 2: Partial accumulate: partial_2_0 + "tbz x16, #1, 48f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, 
#0, 49f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "b 49f\n" + "48:" // Height 2: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "49:" // Height 2: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "b 52f\n" + "50:" // Height 2: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "b 52f\n" + "51:" // Height 2: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "52:" // Height 2: setup done + "mov x12, #0x0\n" + "53:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 54f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x12, 55f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "b 55f\n" + "54:" // Height 2: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "55:" // Height 2: input setup done + "cmp x11, #0x8\n" + "blt 58f\n" + "cmp x11, #0x10\n" + "blt 57f\n" + "56:" // Height 2: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "sub x11, x11, #0x8\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "cmp x11, #0x10\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + "ldr q7, [x15, #0x50]\n" + ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" + "ldr q6, [x15, #0x60]\n" + ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" + "ldr q7, [x15, #0x70]\n" + ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" + "ldr q6, [x15, #0x80]\n" + ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" + "ldr q7, [x15, #0x90]\n" + ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + ".inst 0x4f60f8c8 // 
bfdot v8.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + "bge 56b\n" + "57:" // Height 2: Multiply loop: Single iteration only + "sub x11, x11, #0x8\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + "ldr q6, [x15, #0x40]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + "ldr q7, [x15, #0x50]\n" + ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" + "ldr q6, [x15, #0x60]\n" + ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" + "ldr q7, [x15, #0x70]\n" + ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" + "ldr q6, [x15, #0x80]\n" + ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" + "ldr q7, [x15, #0x90]\n" + ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + "58:" // Height 2: Multiply loop: Main loop skip + "cbz x11, 63f\n" + "cmp x11, #0x2\n" + "blt 60f\n" + "59:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "sub x11, x11, #0x2\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "cmp x11, #0x2\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "ldr q7, [x15, #0x30]\n" 
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + "bge 59b\n" + "cbz x11, 63f\n" + "60:" // Height 2: Multiply loop: Skip odd blocks + "tbz x11, #1, 61f\n" + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "tbz x11, #0, 62f\n" + "ld1 { v0.h }[2], [x10]\n" + "ld1 { v1.h }[2], [x28]\n" + "b 62f\n" + "61:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr h0, [x10, #0x0]\n" + "ldr h1, [x28, #0x0]\n" + "62:" // Height 2: Multiply loop: Ragged operand read: Done + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + "63:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 53b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "tbz %x[flags], #1, 64f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "64:" // Height 2: No activation + "cmp x16, #0x10\n" + "bge 73f\n" + "tbz x16, #3, 68f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "tbz x16, #2, 66f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "tbz x16, #1, 65f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "tbz x16, #0, 72f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "b 72f\n" + "65:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x16, #0, 72f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "b 72f\n" + "66:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x16, #1, 67f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "tbz x16, #0, 72f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "b 72f\n" + "67:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x16, #0, 72f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "b 72f\n" + "68:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x16, #2, 70f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "tbz x16, #1, 69f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "tbz x16, #0, 72f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "b 72f\n" + "69:" // Height 2: Partial direct 
writeback: partial_1_4 + "tbz x16, #0, 72f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "b 72f\n" + "70:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x16, #1, 71f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "tbz x16, #0, 72f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "b 72f\n" + "71:" // Height 2: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "72:" // Height 2: Partial direct writeback: Done + "b 74f\n" + "73:" // Height 2: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "74:" // Height 2: Writeback done + "subs x16, x16, #0x10\n" + "bgt 40b\n" + "b 224f\n" + "75:" // Height 3 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 76f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "b 77f\n" + "76:" // Height 3: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "77:" // Height 3: Column loop + "cbz x14, 78f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "mov v16.16b, v8.16b\n" + "ldr q10, [x14, #0x20]\n" + "ldr q11, [x14, #0x30]\n" + "mov v13.16b, v9.16b\n" + "add x14, x14, #0x40\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "b 89f\n" + "78:" // Height 3: no bias + "tbz %x[flags], #0, 88f\n" + "cmp x16, #0x10\n" + "bge 87f\n" + "tbz x16, #3, 82f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "tbz x16, #2, 80f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "tbz x16, #1, 79f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "tbz x16, #0, 86f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "b 86f\n" + "79:" // Height 3: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 86f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "b 86f\n" + "80:" // Height 3: Partial accumulate: partial_2_8 + "tbz x16, #1, 81f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "mov x19, #0x28\n" + "tbz x16, #0, 86f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "b 86f\n" + "81:" // Height 3: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 86f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "b 86f\n" + "82:" // Height 3: Partial accumulate: partial_4_0 + "tbz x16, #2, 84f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "tbz x16, #1, 83f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr 
d17, [x27], #0x8\n" + "tbz x16, #0, 86f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "b 86f\n" + "83:" // Height 3: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 86f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "b 86f\n" + "84:" // Height 3: Partial accumulate: partial_2_0 + "tbz x16, #1, 85f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 86f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "b 86f\n" + "85:" // Height 3: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "86:" // Height 3: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "b 89f\n" + "87:" // Height 3: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "b 89f\n" + "88:" // Height 3: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "89:" // Height 3: setup done + "mov x12, #0x0\n" + "90:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 91f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x12, 92f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "b 92f\n" + "91:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "92:" // Height 3: input setup done + "cmp x11, #0x8\n" + "blt 95f\n" + "cmp x11, #0x10\n" + "blt 94f\n" + "93:" // Height 3: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "add x28, x28, #0x10\n" + "prfm pldl1keep, [x28, #0x80]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "sub x11, x11, #0x8\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + "cmp x11, #0x10\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + "ldr q6, [x15, #0x40]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, 
v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + "ldr q7, [x15, #0x50]\n" + ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" + "ldr q6, [x15, #0x60]\n" + ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" + "ldr q7, [x15, #0x70]\n" + ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" + "ldr q6, [x15, #0x80]\n" + ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" + "ldr q7, [x15, #0x90]\n" + ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" + "bge 93b\n" + "94:" // Height 3: Multiply loop: Single iteration only + "sub x11, x11, #0x8\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "add x28, x28, #0x10\n" + "prfm pldl1keep, [x28, #0x80]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + "ldr q6, [x15, #0x40]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + "ldr q7, [x15, #0x50]\n" + ".inst 
0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" + "ldr q6, [x15, #0x60]\n" + ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" + "ldr q7, [x15, #0x70]\n" + ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" + "ldr q6, [x15, #0x80]\n" + ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" + "ldr q7, [x15, #0x90]\n" + ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" + "95:" // Height 3: Multiply loop: Main loop skip + "cbz x11, 100f\n" + "cmp x11, #0x2\n" + "blt 97f\n" + "96:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "sub x11, x11, #0x2\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "cmp x11, #0x2\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + "bge 96b\n" + "cbz x11, 100f\n" + "97:" // Height 3: Multiply loop: Skip odd blocks + "tbz x11, #1, 98f\n" + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "tbz x11, #0, 99f\n" + "ld1 { 
v0.h }[2], [x10]\n" + "ld1 { v1.h }[2], [x28]\n" + "ld1 { v2.h }[2], [x26]\n" + "b 99f\n" + "98:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr h0, [x10, #0x0]\n" + "ldr h1, [x28, #0x0]\n" + "ldr h2, [x26, #0x0]\n" + "99:" // Height 3: Multiply loop: Ragged operand read: Done + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + "100:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 90b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "tbz %x[flags], #1, 101f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "101:" // Height 3: No activation + "cmp x16, #0x10\n" + "bge 110f\n" + "tbz x16, #3, 105f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "tbz x16, #2, 103f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "tbz x16, #1, 102f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "tbz x16, #0, 109f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "b 109f\n" + "102:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x16, #0, 109f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "b 109f\n" + "103:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x16, #1, 104f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "tbz x16, #0, 109f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "b 109f\n" + "104:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x16, #0, 109f\n" + "str s10, [x13, #0x0]\n" 
+ "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "b 109f\n" + "105:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x16, #2, 107f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "tbz x16, #1, 106f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "tbz x16, #0, 109f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "b 109f\n" + "106:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x16, #0, 109f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "b 109f\n" + "107:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x16, #1, 108f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "tbz x16, #0, 109f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "b 109f\n" + "108:" // Height 3: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "109:" // Height 3: Partial direct writeback: Done + "b 111f\n" + "110:" // Height 3: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "111:" // Height 3: Writeback done + "subs x16, x16, #0x10\n" + "bgt 77b\n" + "b 224f\n" + "112:" // Height 4 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 113f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "b 114f\n" + "113:" // Height 4: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "114:" // Height 4: Column loop + "cbz x14, 115f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "mov v16.16b, v8.16b\n" + "ldr q10, [x14, #0x20]\n" + "mov v20.16b, v8.16b\n" + "ldr q11, [x14, #0x30]\n" + "add x14, x14, #0x40\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" + "b 126f\n" + "115:" // Height 4: no bias + "tbz %x[flags], #0, 125f\n" + "cmp x16, #0x10\n" + "bge 124f\n" + "tbz x16, #3, 119f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "ld1 { v21.4s }, [x25], #0x10\n" + "tbz x16, #2, 117f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "ld1 { v22.4s }, [x25], #0x10\n" + "tbz x16, #1, 116f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, 
[x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "tbz x16, #0, 123f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "ld1 { v23.s }[2], [x25]\n" + "b 123f\n" + "116:" // Height 4: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 123f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "ldr s23, [x25, #0x0]\n" + "b 123f\n" + "117:" // Height 4: Partial accumulate: partial_2_8 + "tbz x16, #1, 118f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "mov x19, #0x28\n" + "tbz x16, #0, 123f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x25]\n" + "b 123f\n" + "118:" // Height 4: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 123f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "b 123f\n" + "119:" // Height 4: Partial accumulate: partial_4_0 + "tbz x16, #2, 121f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "tbz x16, #1, 120f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "tbz x16, #0, 123f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "ld1 { v21.s }[2], [x25]\n" + "b 123f\n" + "120:" // Height 4: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 123f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "b 123f\n" + "121:" // Height 4: Partial accumulate: partial_2_0 + "tbz x16, #1, 122f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 123f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "ld1 { v20.s }[2], [x25]\n" + "b 123f\n" + "122:" // Height 4: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "ldr s20, [x25, #0x0]\n" + "123:" // Height 4: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "b 126f\n" + "124:" // Height 4: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "b 126f\n" + "125:" // Height 4: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "126:" // Height 4: setup done + "mov x12, #0x0\n" + "127:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], 
%[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 128f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x12, 129f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "b 129f\n" + "128:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "129:" // Height 4: input setup done + "cmp x11, #0x8\n" + "blt 132f\n" + "cmp x11, #0x10\n" + "blt 131f\n" + "130:" // Height 4: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sub x11, x11, #0x8\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "cmp x11, #0x10\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" + "ldr q6, [x15, #0x40]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + "ldr q7, [x15, #0x50]\n" + ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" + ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n" + "ldr q6, [x15, #0x60]\n" + ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" + ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n" + "ldr q7, [x15, #0x70]\n" + ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" + ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n" + "ldr q6, [x15, #0x80]\n" + ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" + ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n" + "ldr q7, [x15, #0x90]\n" + ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" + ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" + 
".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" + ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" + ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" + ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n" + ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n" + "bge 130b\n" + "131:" // Height 4: Multiply loop: Single iteration only + "sub x11, x11, #0x8\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" + "ldr q6, [x15, #0x40]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + "ldr q7, [x15, #0x50]\n" + ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" + ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n" + "ldr q6, [x15, #0x60]\n" + ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" + ".inst 
0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n" + "ldr q7, [x15, #0x70]\n" + ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" + ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n" + "ldr q6, [x15, #0x80]\n" + ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" + ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n" + "ldr q7, [x15, #0x90]\n" + ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" + ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" + ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" + ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" + ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n" + ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n" + "132:" // Height 4: Multiply loop: Main loop skip + "cbz x11, 137f\n" + "cmp x11, #0x2\n" + "blt 134f\n" + "133:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "sub x11, x11, #0x2\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "cmp x11, #0x2\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, 
v2.h[0]\n" + ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + "bge 133b\n" + "cbz x11, 137f\n" + "134:" // Height 4: Multiply loop: Skip odd blocks + "tbz x11, #1, 135f\n" + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "tbz x11, #0, 136f\n" + "ld1 { v0.h }[2], [x10]\n" + "ld1 { v1.h }[2], [x28]\n" + "ld1 { v2.h }[2], [x26]\n" + "ld1 { v3.h }[2], [x24]\n" + "b 136f\n" + "135:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr h0, [x10, #0x0]\n" + "ldr h1, [x28, #0x0]\n" + "ldr h2, [x26, #0x0]\n" + "ldr h3, [x24, #0x0]\n" + "136:" // Height 4: Multiply loop: Ragged operand read: Done + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + "137:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 127b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbz %x[flags], #1, 138f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "138:" // Height 4: No activation + "cmp x16, #0x10\n" + "bge 147f\n" + "tbz x16, #3, 142f\n" + "st1 { v8.4s }, [x13], 
#0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "tbz x16, #2, 140f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "st1 { v22.4s }, [x25], #0x10\n" + "tbz x16, #1, 139f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "tbz x16, #0, 146f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "st1 { v23.s }[2], [x25]\n" + "b 146f\n" + "139:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x16, #0, 146f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "str s23, [x25, #0x0]\n" + "b 146f\n" + "140:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x16, #1, 141f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "tbz x16, #0, 146f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "st1 { v22.s }[2], [x25]\n" + "b 146f\n" + "141:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x16, #0, 146f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "str s22, [x25, #0x0]\n" + "b 146f\n" + "142:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x16, #2, 144f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "tbz x16, #1, 143f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "tbz x16, #0, 146f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "st1 { v21.s }[2], [x25]\n" + "b 146f\n" + "143:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x16, #0, 146f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "str s21, [x25, #0x0]\n" + "b 146f\n" + "144:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x16, #1, 145f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "tbz x16, #0, 146f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "st1 { v20.s }[2], [x25]\n" + "b 146f\n" + "145:" // Height 4: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "str s20, [x25, #0x0]\n" + "146:" // Height 4: Partial direct writeback: Done + "b 148f\n" + "147:" // Height 4: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "148:" // Height 4: Writeback done + "subs x16, x16, #0x10\n" + "bgt 114b\n" + "b 224f\n" + "149:" // Height 5 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + 
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 150f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 151f\n" + "150:" // Height 5: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "151:" // Height 5: Column loop + "cbz x14, 152f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "mov v16.16b, v8.16b\n" + "ldr q10, [x14, #0x20]\n" + "mov v20.16b, v8.16b\n" + "ldr q11, [x14, #0x30]\n" + "mov v24.16b, v8.16b\n" + "add x14, x14, #0x40\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" + "mov v25.16b, v9.16b\n" + "mov v26.16b, v10.16b\n" + "mov v27.16b, v11.16b\n" + "b 163f\n" + "152:" // Height 5: no bias + "tbz %x[flags], #0, 162f\n" + "cmp x16, #0x10\n" + "bge 161f\n" + "tbz x16, #3, 156f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "ld1 { v21.4s }, [x25], #0x10\n" + "ld1 { v25.4s }, [x23], #0x10\n" + "tbz x16, #2, 154f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "ld1 { v22.4s }, [x25], #0x10\n" + "ld1 { v26.4s }, [x23], #0x10\n" + "tbz x16, #1, 153f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "ldr d27, [x23], #0x8\n" + "tbz x16, #0, 160f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "ld1 { v23.s }[2], [x25]\n" + "ld1 { v27.s }[2], [x23]\n" + "b 160f\n" + "153:" // Height 5: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 160f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "ldr s23, [x25, #0x0]\n" + "ldr s27, [x23, #0x0]\n" + "b 160f\n" + "154:" // Height 5: Partial accumulate: partial_2_8 + "tbz x16, #1, 155f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "ldr d26, [x23], #0x8\n" + "mov x19, #0x28\n" + "tbz x16, #0, 160f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x25]\n" + "ld1 { v26.s }[2], [x23]\n" + "b 160f\n" + "155:" // Height 5: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 160f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "ldr s26, [x23, #0x0]\n" + "b 160f\n" + "156:" // Height 5: Partial accumulate: partial_4_0 + "tbz x16, #2, 158f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "tbz x16, #1, 157f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], 
#0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "ldr d25, [x23], #0x8\n" + "tbz x16, #0, 160f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "ld1 { v21.s }[2], [x25]\n" + "ld1 { v25.s }[2], [x23]\n" + "b 160f\n" + "157:" // Height 5: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 160f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "ldr s25, [x23, #0x0]\n" + "b 160f\n" + "158:" // Height 5: Partial accumulate: partial_2_0 + "tbz x16, #1, 159f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "ldr d24, [x23], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 160f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "ld1 { v20.s }[2], [x25]\n" + "ld1 { v24.s }[2], [x23]\n" + "b 160f\n" + "159:" // Height 5: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "ldr s20, [x25, #0x0]\n" + "ldr s24, [x23, #0x0]\n" + "160:" // Height 5: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "sub x23, x23, x19\n" + "b 163f\n" + "161:" // Height 5: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "ldr q24, [x23, #0x0]\n" + "ldr q25, [x23, #0x10]\n" + "ldr q26, [x23, #0x20]\n" + "ldr q27, [x23, #0x30]\n" + "b 163f\n" + "162:" // Height 5: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "163:" // Height 5: setup done + "mov x12, #0x0\n" + "164:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 165f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x12, 166f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "b 166f\n" + "165:" // Height 5: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "add x22, x24, x19, LSL #1\n" + "166:" // Height 5: input setup done + "cmp x11, #0x8\n" + "blt 169f\n" + "cmp x11, #0x10\n" + "blt 168f\n" + "167:" // Height 5: Multiply loop: Main loop head + 
"ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sub x11, x11, #0x8\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + "cmp x11, #0x10\n" + ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" + "ldr q6, [x15, #0x40]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" + "ldr q7, [x15, #0x50]\n" + ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" + ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n" + ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n" + "ldr q6, [x15, #0x60]\n" + ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" + ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n" + ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n" + "ldr q7, [x15, #0x70]\n" + ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" + ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n" + ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n" + "ldr q6, [x15, #0x80]\n" + ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" + ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n" + ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n" + "ldr q7, [x15, #0x90]\n" + ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" + ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n" + ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" + ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n" + ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + ".inst 
0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" + ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n" + ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" + ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n" + ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n" + ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n" + ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n" + ".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n" + ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n" + ".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n" + "bge 167b\n" + "168:" // Height 5: Multiply loop: Single iteration only + "sub x11, x11, #0x8\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" + "ldr q6, [x15, #0x40]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" + "ldr q7, 
[x15, #0x50]\n" + ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" + ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n" + ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n" + "ldr q6, [x15, #0x60]\n" + ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" + ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n" + ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n" + "ldr q7, [x15, #0x70]\n" + ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" + ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n" + ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n" + "ldr q6, [x15, #0x80]\n" + ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" + ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n" + ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n" + "ldr q7, [x15, #0x90]\n" + ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" + ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n" + ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" + ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n" + ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" + ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n" + ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" + ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n" + ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n" + ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n" + ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n" + ".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n" + ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n" + 
".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n" + "169:" // Height 5: Multiply loop: Main loop skip + "cbz x11, 174f\n" + "cmp x11, #0x2\n" + "blt 171f\n" + "170:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "sub x11, x11, #0x2\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "cmp x11, #0x2\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" + "bge 170b\n" + "cbz x11, 174f\n" + "171:" // Height 5: Multiply loop: Skip odd blocks + "tbz x11, #1, 172f\n" + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "tbz x11, #0, 173f\n" + "ld1 { v0.h }[2], [x10]\n" + "ld1 { v1.h }[2], [x28]\n" + "ld1 { v2.h }[2], [x26]\n" + "ld1 { v3.h }[2], [x24]\n" + "ld1 { v4.h }[2], [x22]\n" + "b 173f\n" + "172:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "ldr h0, [x10, #0x0]\n" + "ldr h1, [x28, #0x0]\n" + "ldr h2, [x26, #0x0]\n" + "ldr h3, [x24, #0x0]\n" + "ldr h4, [x22, #0x0]\n" + "173:" // Height 5: Multiply loop: Ragged operand read: Done + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" + "174:" // Height 
5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 164b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 175f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmin v24.4s, v24.4s, v0.4s\n" + "fmin v25.4s, v25.4s, v0.4s\n" + "fmin v26.4s, v26.4s, v0.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmin v27.4s, v27.4s, v0.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "175:" // Height 5: No activation + "cmp x16, #0x10\n" + "bge 184f\n" + "tbz x16, #3, 179f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v25.4s }, [x23], #0x10\n" + "tbz x16, #2, 177f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "st1 { v22.4s }, [x25], #0x10\n" + "st1 { v26.4s }, [x23], #0x10\n" + "tbz x16, #1, 176f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "str d27, [x23], #0x8\n" + "tbz x16, #0, 183f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "st1 { v23.s }[2], [x25]\n" + "st1 { v27.s }[2], [x23]\n" + "b 183f\n" + "176:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x16, #0, 183f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "str s23, [x25, #0x0]\n" + "str s27, [x23, #0x0]\n" + "b 183f\n" + "177:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x16, #1, 178f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "str d26, [x23], #0x8\n" + "tbz x16, #0, 183f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "st1 { v22.s }[2], [x25]\n" + "st1 { v26.s }[2], [x23]\n" + "b 183f\n" + "178:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x16, #0, 183f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, 
#0x0]\n" + "str s18, [x27, #0x0]\n" + "str s22, [x25, #0x0]\n" + "str s26, [x23, #0x0]\n" + "b 183f\n" + "179:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x16, #2, 181f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "tbz x16, #1, 180f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "str d25, [x23], #0x8\n" + "tbz x16, #0, 183f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "st1 { v21.s }[2], [x25]\n" + "st1 { v25.s }[2], [x23]\n" + "b 183f\n" + "180:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x16, #0, 183f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "str s21, [x25, #0x0]\n" + "str s25, [x23, #0x0]\n" + "b 183f\n" + "181:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x16, #1, 182f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "tbz x16, #0, 183f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "st1 { v20.s }[2], [x25]\n" + "st1 { v24.s }[2], [x23]\n" + "b 183f\n" + "182:" // Height 5: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "str s20, [x25, #0x0]\n" + "str s24, [x23, #0x0]\n" + "183:" // Height 5: Partial direct writeback: Done + "b 185f\n" + "184:" // Height 5: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "str q24, [x23, #0x0]\n" + "str q25, [x23, #0x10]\n" + "str q26, [x23, #0x20]\n" + "str q27, [x23, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "add x23, x23, #0x40\n" + "185:" // Height 5: Writeback done + "subs x16, x16, #0x10\n" + "bgt 151b\n" + "b 224f\n" + "186:" // Height 6 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 187f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "ldr x21, [%x[output_ptr], #0x28]\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 188f\n" + "187:" // Height 6: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "add x21, x23, x19, LSL #2\n" + "add %x[output_ptr], x21, x19, LSL #2\n" + "188:" // Height 6: Column loop + "cbz x14, 189f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "mov v16.16b, v8.16b\n" + "ldr q10, 
[x14, #0x20]\n" + "mov v20.16b, v8.16b\n" + "ldr q11, [x14, #0x30]\n" + "mov v24.16b, v8.16b\n" + "add x14, x14, #0x40\n" + "mov v28.16b, v8.16b\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" + "mov v25.16b, v9.16b\n" + "mov v26.16b, v10.16b\n" + "mov v27.16b, v11.16b\n" + "mov v29.16b, v9.16b\n" + "mov v30.16b, v10.16b\n" + "mov v31.16b, v11.16b\n" + "b 200f\n" + "189:" // Height 6: no bias + "tbz %x[flags], #0, 199f\n" + "cmp x16, #0x10\n" + "bge 198f\n" + "tbz x16, #3, 193f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "ld1 { v28.4s }, [x21], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "ld1 { v21.4s }, [x25], #0x10\n" + "ld1 { v25.4s }, [x23], #0x10\n" + "ld1 { v29.4s }, [x21], #0x10\n" + "tbz x16, #2, 191f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "ld1 { v22.4s }, [x25], #0x10\n" + "ld1 { v26.4s }, [x23], #0x10\n" + "ld1 { v30.4s }, [x21], #0x10\n" + "tbz x16, #1, 190f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "ldr d27, [x23], #0x8\n" + "ldr d31, [x21], #0x8\n" + "tbz x16, #0, 197f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "ld1 { v23.s }[2], [x25]\n" + "ld1 { v27.s }[2], [x23]\n" + "ld1 { v31.s }[2], [x21]\n" + "b 197f\n" + "190:" // Height 6: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 197f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "ldr s23, [x25, #0x0]\n" + "ldr s27, [x23, #0x0]\n" + "ldr s31, [x21, #0x0]\n" + "b 197f\n" + "191:" // Height 6: Partial accumulate: partial_2_8 + "tbz x16, #1, 192f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "ldr d26, [x23], #0x8\n" + "ldr d30, [x21], #0x8\n" + "mov x19, #0x28\n" + "tbz x16, #0, 197f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x25]\n" + "ld1 { v26.s }[2], [x23]\n" + "ld1 { v30.s }[2], [x21]\n" + "b 197f\n" + "192:" // Height 6: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 197f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "ldr s26, [x23, #0x0]\n" + "ldr s30, [x21, #0x0]\n" + "b 197f\n" + "193:" // Height 6: Partial accumulate: partial_4_0 + "tbz x16, #2, 195f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "ld1 { v28.4s }, [x21], #0x10\n" + "tbz x16, #1, 194f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "ldr d25, [x23], #0x8\n" + "ldr d29, [x21], #0x8\n" + "tbz x16, #0, 197f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "ld1 { v21.s }[2], [x25]\n" + "ld1 { v25.s }[2], [x23]\n" + "ld1 { v29.s }[2], [x21]\n" + "b 197f\n" + "194:" // Height 6: Partial accumulate: partial_1_4 + "mov x19, 
#0x10\n" + "tbz x16, #0, 197f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "ldr s25, [x23, #0x0]\n" + "ldr s29, [x21, #0x0]\n" + "b 197f\n" + "195:" // Height 6: Partial accumulate: partial_2_0 + "tbz x16, #1, 196f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d28, [x21], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 197f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "ld1 { v20.s }[2], [x25]\n" + "ld1 { v24.s }[2], [x23]\n" + "ld1 { v28.s }[2], [x21]\n" + "b 197f\n" + "196:" // Height 6: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "ldr s20, [x25, #0x0]\n" + "ldr s24, [x23, #0x0]\n" + "ldr s28, [x21, #0x0]\n" + "197:" // Height 6: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "sub x23, x23, x19\n" + "sub x21, x21, x19\n" + "b 200f\n" + "198:" // Height 6: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "ldr q24, [x23, #0x0]\n" + "ldr q25, [x23, #0x10]\n" + "ldr q26, [x23, #0x20]\n" + "ldr q27, [x23, #0x30]\n" + "ldr q28, [x21, #0x0]\n" + "ldr q29, [x21, #0x10]\n" + "ldr q30, [x21, #0x20]\n" + "ldr q31, [x21, #0x30]\n" + "b 200f\n" + "199:" // Height 6: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "200:" // Height 6: setup done + "mov x12, #0x0\n" + "201:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 202f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x12, 203f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "add x20, x20, x19, LSL #1\n" + "b 203f\n" + "202:" // Height 6: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "add x22, x24, x19, LSL #1\n" + "add x20, x22, x19, LSL #1\n" + "203:" // Height 6: input setup done + "cmp x11, #0x8\n" + "blt 
206f\n" + "cmp x11, #0x10\n" + "blt 205f\n" + "204:" // Height 6: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "add x22, x22, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "sub x11, x11, #0x8\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + "cmp x11, #0x10\n" + ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" + ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" + ".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n" + "ldr q6, [x15, #0x40]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" + ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n" + "ldr q7, [x15, #0x50]\n" + ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" + ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n" + ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n" + ".inst 0x4f65f0dc // bfdot v28.4s, v6.8h, v5.h[1]\n" + "ldr q6, [x15, #0x60]\n" + ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" + ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n" + ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n" + ".inst 0x4f65f0fd // bfdot v29.4s, v7.8h, v5.h[1]\n" + "ldr q7, [x15, #0x70]\n" + ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" + ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n" + ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n" + ".inst 0x4f65f0de // bfdot v30.4s, v6.8h, v5.h[1]\n" + "ldr q6, [x15, #0x80]\n" + ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" + ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n" + ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n" + ".inst 0x4f65f0ff // bfdot v31.4s, v7.8h, v5.h[1]\n" + "ldr q7, 
[x15, #0x90]\n" + ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" + ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n" + ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n" + ".inst 0x4f45f8dc // bfdot v28.4s, v6.8h, v5.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" + ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n" + ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n" + ".inst 0x4f45f8fd // bfdot v29.4s, v7.8h, v5.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" + ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n" + ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n" + ".inst 0x4f45f8de // bfdot v30.4s, v6.8h, v5.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" + ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n" + ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n" + ".inst 0x4f45f8ff // bfdot v31.4s, v7.8h, v5.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n" + ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n" + ".inst 0x4f65f8dc // bfdot v28.4s, v6.8h, v5.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n" + ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n" + ".inst 0x4f65f8fd // bfdot v29.4s, v7.8h, v5.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n" + ".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n" + ".inst 0x4f65f8de // bfdot v30.4s, v6.8h, v5.h[3]\n" + ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n" + ".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n" + ".inst 0x4f65f8ff // bfdot v31.4s, v7.8h, v5.h[3]\n" + "bge 204b\n" + "205:" // Height 6: Multiply loop: Single iteration only + "sub x11, x11, #0x8\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f44f0d8 // bfdot v24.4s, 
v6.8h, v4.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + "add x22, x22, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" + ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" + ".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n" + "ldr q6, [x15, #0x40]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" + ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n" + "ldr q7, [x15, #0x50]\n" + ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n" + ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n" + ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n" + ".inst 0x4f65f0dc // bfdot v28.4s, v6.8h, v5.h[1]\n" + "ldr q6, [x15, #0x60]\n" + ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n" + ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n" + ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n" + ".inst 0x4f65f0fd // bfdot v29.4s, v7.8h, v5.h[1]\n" + "ldr q7, [x15, #0x70]\n" + ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n" + ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n" + ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n" + ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n" + ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n" + ".inst 0x4f65f0de // bfdot v30.4s, v6.8h, v5.h[1]\n" + "ldr q6, [x15, #0x80]\n" + ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n" + ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n" + ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n" + ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n" + ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n" + ".inst 0x4f65f0ff // bfdot v31.4s, v7.8h, v5.h[1]\n" + "ldr q7, [x15, #0x90]\n" + ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n" + ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n" + ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n" + ".inst 0x4f45f8dc // bfdot v28.4s, v6.8h, v5.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n" + ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n" + ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n" + ".inst 0x4f45f8fd // bfdot v29.4s, v7.8h, v5.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + ".inst 0x4f40f8ca // bfdot 
v10.4s, v6.8h, v0.h[2]\n" + ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n" + ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n" + ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n" + ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n" + ".inst 0x4f45f8de // bfdot v30.4s, v6.8h, v5.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n" + ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n" + ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n" + ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n" + ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n" + ".inst 0x4f45f8ff // bfdot v31.4s, v7.8h, v5.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n" + ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n" + ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n" + ".inst 0x4f65f8dc // bfdot v28.4s, v6.8h, v5.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n" + ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n" + ".inst 0x4f65f8fd // bfdot v29.4s, v7.8h, v5.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n" + "add x15, x15, #0x100\n" + ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n" + ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n" + ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n" + ".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n" + ".inst 0x4f65f8de // bfdot v30.4s, v6.8h, v5.h[3]\n" + ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n" + ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n" + ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n" + ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n" + ".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n" + ".inst 0x4f65f8ff // bfdot v31.4s, v7.8h, v5.h[3]\n" + "206:" // Height 6: Multiply loop: Main loop skip + "cbz x11, 211f\n" + "cmp x11, #0x2\n" + "blt 208f\n" + "207:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x20], #0x4\n" + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + "sub x11, x11, #0x2\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + "cmp x11, #0x2\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" + ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" + ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" + ".inst 0x4f45f0de // bfdot v30.4s, v6.8h, 
v5.h[0]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" + ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n" + "bge 207b\n" + "cbz x11, 211f\n" + "208:" // Height 6: Multiply loop: Skip odd blocks + "tbz x11, #1, 209f\n" + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x20], #0x4\n" + "tbz x11, #0, 210f\n" + "ld1 { v0.h }[2], [x10]\n" + "ld1 { v1.h }[2], [x28]\n" + "ld1 { v2.h }[2], [x26]\n" + "ld1 { v3.h }[2], [x24]\n" + "ld1 { v4.h }[2], [x22]\n" + "ld1 { v5.h }[2], [x20]\n" + "b 210f\n" + "209:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "ldr h0, [x10, #0x0]\n" + "ldr h1, [x28, #0x0]\n" + "ldr h2, [x26, #0x0]\n" + "ldr h3, [x24, #0x0]\n" + "ldr h4, [x22, #0x0]\n" + "ldr h5, [x20, #0x0]\n" + "210:" // Height 6: Multiply loop: Ragged operand read: Done + "ldr q6, [x15, #0x0]\n" + ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n" + ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n" + "ldr q6, [x15, #0x20]\n" + ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n" + ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n" + "ldr q7, [x15, #0x30]\n" + ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n" + ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n" + ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n" + ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n" + ".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n" + ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n" + ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n" + ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n" + ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n" + ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n" + ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n" + "211:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 201b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 212f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, 
v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmin v24.4s, v24.4s, v0.4s\n" + "fmin v25.4s, v25.4s, v0.4s\n" + "fmin v26.4s, v26.4s, v0.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmin v27.4s, v27.4s, v0.4s\n" + "fmin v28.4s, v28.4s, v0.4s\n" + "fmin v29.4s, v29.4s, v0.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "fmax v28.4s, v28.4s, v1.4s\n" + "fmax v29.4s, v29.4s, v1.4s\n" + "fmin v30.4s, v30.4s, v0.4s\n" + "fmin v31.4s, v31.4s, v0.4s\n" + "fmax v30.4s, v30.4s, v1.4s\n" + "fmax v31.4s, v31.4s, v1.4s\n" + "212:" // Height 6: No activation + "cmp x16, #0x10\n" + "bge 221f\n" + "tbz x16, #3, 216f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v25.4s }, [x23], #0x10\n" + "st1 { v28.4s }, [x21], #0x10\n" + "st1 { v29.4s }, [x21], #0x10\n" + "tbz x16, #2, 214f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "st1 { v22.4s }, [x25], #0x10\n" + "st1 { v26.4s }, [x23], #0x10\n" + "st1 { v30.4s }, [x21], #0x10\n" + "tbz x16, #1, 213f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "str d27, [x23], #0x8\n" + "str d31, [x21], #0x8\n" + "tbz x16, #0, 220f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "st1 { v23.s }[2], [x25]\n" + "st1 { v27.s }[2], [x23]\n" + "st1 { v31.s }[2], [x21]\n" + "b 220f\n" + "213:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x16, #0, 220f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "str s23, [x25, #0x0]\n" + "str s27, [x23, #0x0]\n" + "str s31, [x21, #0x0]\n" + "b 220f\n" + "214:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x16, #1, 215f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "str d26, [x23], #0x8\n" + "str d30, [x21], #0x8\n" + "tbz x16, #0, 220f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "st1 { v22.s }[2], [x25]\n" + "st1 { v26.s }[2], [x23]\n" + "st1 { v30.s }[2], [x21]\n" + "b 220f\n" + "215:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x16, #0, 220f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "str s22, [x25, #0x0]\n" + "str s26, [x23, #0x0]\n" + "str s30, [x21, #0x0]\n" + "b 220f\n" + "216:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x16, #2, 218f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v28.4s }, [x21], #0x10\n" + "tbz 
x16, #1, 217f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "str d25, [x23], #0x8\n" + "str d29, [x21], #0x8\n" + "tbz x16, #0, 220f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "st1 { v21.s }[2], [x25]\n" + "st1 { v25.s }[2], [x23]\n" + "st1 { v29.s }[2], [x21]\n" + "b 220f\n" + "217:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x16, #0, 220f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "str s21, [x25, #0x0]\n" + "str s25, [x23, #0x0]\n" + "str s29, [x21, #0x0]\n" + "b 220f\n" + "218:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x16, #1, 219f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "str d28, [x21], #0x8\n" + "tbz x16, #0, 220f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "st1 { v20.s }[2], [x25]\n" + "st1 { v24.s }[2], [x23]\n" + "st1 { v28.s }[2], [x21]\n" + "b 220f\n" + "219:" // Height 6: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "str s20, [x25, #0x0]\n" + "str s24, [x23, #0x0]\n" + "str s28, [x21, #0x0]\n" + "220:" // Height 6: Partial direct writeback: Done + "b 222f\n" + "221:" // Height 6: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "str q24, [x23, #0x0]\n" + "str q25, [x23, #0x10]\n" + "str q26, [x23, #0x20]\n" + "str q27, [x23, #0x30]\n" + "str q28, [x21, #0x0]\n" + "str q29, [x21, #0x10]\n" + "str q30, [x21, #0x20]\n" + "str q31, [x21, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "add x23, x23, #0x40\n" + "add x21, x21, #0x40\n" + "222:" // Height 6: Writeback done + "subs x16, x16, #0x10\n" + "bgt 188b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 224f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 223f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "223:" // Update direct input + "mov x19, #0xc\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "224:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", 
"v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp new file mode 100644 index 0000000000..876b63c811 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#pragma once +#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)) + +#include "../std_transforms_fixed.hpp" + +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg<__fp16>, \ + size_t, size_t, \ + const __fp16 *, \ + IndirectOutputArg<__fp16>, \ + const __fp16 *, Activation, bool + +namespace arm_gemm +{ + +// Actual kernel implementations +void a64_hybrid_fp16_mla_6x32( ARGLIST ); + +class cls_a64_hybrid_fp16_mla_6x32 +{ +public: + typedef __fp16 operand_type; + typedef __fp16 result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 6; + } + + static unsigned int out_width() + { + return 32; + } + + static constexpr unsigned int k_unroll() + { + return 1; + } + + static constexpr bool supports_accumulate() + { + return true; + } + + StdTransformsFixed transforms = {}; + + static PerformanceParameters get_performance_parameters(const CPUInfo *ci) + { + switch (ci->get_cpu_model()) { + case CPUModel::A55r1: + return { 5.22 }; + + default: + return { 14.53 }; + } + } + + // Default to the generic kernel + kern_type kernel=a64_hybrid_fp16_mla_6x32; + + cls_a64_hybrid_fp16_mla_6x32(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp new file mode 100644 index 0000000000..ff6cbec200 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp @@ -0,0 +1,5400 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)) + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> + +namespace arm_gemm { + +void a64_hybrid_fp16_mla_6x32 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<__fp16> A_arg, + size_t M, size_t N, const __fp16 *B_ptr, IndirectOutputArg<__fp16> output_arg, + const __fp16 *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + __fp16 maxval = static_cast<__fp16>(std::numeric_limits<float>::infinity()); + __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const __fp16 *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast<__fp16>(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ".arch armv8.2-a+fp16\n" +#endif + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 251f\n" + "cmp %x[M], #0x4\n" + "bgt 201f\n" + "beq 151f\n" + "cmp %x[M], #0x2\n" + "bgt 101f\n" + "beq 51f\n" + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[bias]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #1\n" + "b 3f\n" +
"2:" // Height 1: setup direct output + "mov x13, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "cbz x14, 4f\n" + "ldr q8, [x14, #0x0]\n" + "ldr q9, [x14, #0x10]\n" + "ldr q10, [x14, #0x20]\n" + "ldr q11, [x14, #0x30]\n" + "add x14, x14, #0x40\n" + "b 23f\n" + "4:" // Height 1: no bias + "tbz %x[flags], #0, 22f\n" + "cmp x16, #0x20\n" + "bge 21f\n" + "tbz x16, #4, 12f\n" + "ld1 { v8.8h }, [x13], #0x10\n" + "ld1 { v9.8h }, [x13], #0x10\n" + "tbz x16, #3, 8f\n" + "ld1 { v10.8h }, [x13], #0x10\n" + "tbz x16, #2, 6f\n" + "ldr d11, [x13], #0x8\n" + "tbz x16, #1, 5f\n" + "mov x19, #0x3c\n" + "ld1 { v11.s }[2], [x13], #0x4\n" + "tbz x16, #0, 20f\n" + "ld1 { v11.h }[6], [x13]\n" + "b 20f\n" + "5:" // Height 1: Partial accumulate: partial_1_28 + "mov x19, #0x38\n" + "tbz x16, #0, 20f\n" + "ld1 { v11.h }[4], [x13]\n" + "b 20f\n" + "6:" // Height 1: Partial accumulate: partial_2_24 + "tbz x16, #1, 7f\n" + "ldr s11, [x13], #0x4\n" + "mov x19, #0x34\n" + "tbz x16, #0, 20f\n" + "ld1 { v11.h }[2], [x13]\n" + "b 20f\n" + "7:" // Height 1: Partial accumulate: partial_1_24 + "mov x19, #0x30\n" + "tbz x16, #0, 20f\n" + "ldr h11, [x13, #0x0]\n" + "b 20f\n" + "8:" // Height 1: Partial accumulate: partial_4_16 + "tbz x16, #2, 10f\n" + "ldr d10, [x13], #0x8\n" + "tbz x16, #1, 9f\n" + "mov x19, #0x2c\n" + "ld1 { v10.s }[2], [x13], #0x4\n" + "tbz x16, #0, 20f\n" + "ld1 { v10.h }[6], [x13]\n" + "b 20f\n" + "9:" // Height 1: Partial accumulate: partial_1_20 + "mov x19, #0x28\n" + "tbz x16, #0, 20f\n" + "ld1 { v10.h }[4], [x13]\n" + "b 20f\n" + "10:" // Height 1: Partial accumulate: partial_2_16 + "tbz x16, #1, 11f\n" + "ldr s10, [x13], #0x4\n" + "mov x19, #0x24\n" + "tbz x16, #0, 20f\n" + "ld1 { v10.h }[2], [x13]\n" + "b 20f\n" + "11:" // Height 1: Partial accumulate: partial_1_16 + "mov x19, #0x20\n" + "tbz x16, #0, 20f\n" + "ldr h10, [x13, #0x0]\n" + "b 20f\n" + "12:" // Height 1: Partial accumulate: partial_8_0 + "tbz x16, #3, 16f\n" + "ld1 { v8.8h }, [x13], #0x10\n" + "tbz x16, #2, 14f\n" + "ldr d9, [x13], #0x8\n" + "tbz x16, #1, 13f\n" + "mov x19, #0x1c\n" + "ld1 { v9.s }[2], [x13], #0x4\n" + "tbz x16, #0, 20f\n" + "ld1 { v9.h }[6], [x13]\n" + "b 20f\n" + "13:" // Height 1: Partial accumulate: partial_1_12 + "mov x19, #0x18\n" + "tbz x16, #0, 20f\n" + "ld1 { v9.h }[4], [x13]\n" + "b 20f\n" + "14:" // Height 1: Partial accumulate: partial_2_8 + "tbz x16, #1, 15f\n" + "ldr s9, [x13], #0x4\n" + "mov x19, #0x14\n" + "tbz x16, #0, 20f\n" + "ld1 { v9.h }[2], [x13]\n" + "b 20f\n" + "15:" // Height 1: Partial accumulate: partial_1_8 + "mov x19, #0x10\n" + "tbz x16, #0, 20f\n" + "ldr h9, [x13, #0x0]\n" + "b 20f\n" + "16:" // Height 1: Partial accumulate: partial_4_0 + "tbz x16, #2, 18f\n" + "ldr d8, [x13], #0x8\n" + "tbz x16, #1, 17f\n" + "mov x19, #0xc\n" + "ld1 { v8.s }[2], [x13], #0x4\n" + "tbz x16, #0, 20f\n" + "ld1 { v8.h }[6], [x13]\n" + "b 20f\n" + "17:" // Height 1: Partial accumulate: partial_1_4 + "mov x19, #0x8\n" + "tbz x16, #0, 20f\n" + "ld1 { v8.h }[4], [x13]\n" + "b 20f\n" + "18:" // Height 1: Partial accumulate: partial_2_0 + "tbz x16, #1, 19f\n" + "ldr s8, [x13], #0x4\n" + "mov x19, #0x4\n" + "tbz x16, #0, 20f\n" + "ld1 { v8.h }[2], [x13]\n" + "b 20f\n" + "19:" // Height 1: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr h8, [x13, #0x0]\n" + "20:" // Height 1: Partial accumulate: Done + "sub x13, x13, x19\n" + "b 23f\n" + "21:" // Height 1: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "b 23f\n" + "22:" // 
Height 1: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "23:" // Height 1: setup done + "mov x12, #0x0\n" + "24:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 25f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "cbnz x12, 26f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "b 26f\n" + "25:" // Height 1: setup direct input + "mov x10, %x[input_ptr]\n" + "26:" // Height 1: input setup done + "cmp x11, #0x8\n" + "blt 29f\n" + "cmp x11, #0x10\n" + "blt 28f\n" + "27:" // Height 1: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "add x10, x10, #0x10\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "sub x11, x11, #0x8\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "cmp x11, #0x10\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "ldr q6, [x15, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "ldr q7, [x15, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "ldr q6, [x15, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "ldr q7, [x15, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "ldr q6, [x15, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "ldr q7, [x15, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "ldr q6, [x15, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "ldr q7, [x15, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "ldr q6, [x15, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "ldr q7, [x15, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "ldr q6, [x15, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "ldr q7, [x15, #0x1f0]\n" + "add x15, x15, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "bge 27b\n" + "28:" // Height 1: Multiply loop: Single iteration only + "sub x11, x11, #0x8\n" + "ldr q0, [x10, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "add x10, x10, #0x10\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "ldr q7, [x15, #0x70]\n" + 
"fmla v10.8h, v6.8h, v0.h[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "ldr q6, [x15, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "ldr q7, [x15, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "ldr q6, [x15, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "ldr q7, [x15, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "ldr q6, [x15, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "ldr q7, [x15, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "ldr q6, [x15, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "ldr q7, [x15, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "ldr q6, [x15, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "ldr q7, [x15, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "ldr q6, [x15, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "ldr q7, [x15, #0x1f0]\n" + "add x15, x15, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "29:" // Height 1: Multiply loop: Main loop skip + "cbz x11, 31f\n" + "30:" // Height 1: Multiply loop: Odd block loop + "ldr h0, [x10], #0x2\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "sub x11, x11, #0x1\n" + "add x15, x15, #0x40\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "cbnz x11, 30b\n" + "31:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 24b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "tbz %x[flags], #1, 32f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.8h }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.8h }, [x19]\n" + "fmin v8.8h, v8.8h, v0.8h\n" + "fmin v9.8h, v9.8h, v0.8h\n" + "fmin v10.8h, v10.8h, v0.8h\n" + "fmin v11.8h, v11.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "32:" // Height 1: No activation + "cmp x16, #0x20\n" + "bge 49f\n" + "tbz x16, #4, 40f\n" + "st1 { v8.8h }, [x13], #0x10\n" + "st1 { v9.8h }, [x13], #0x10\n" + "tbz x16, #3, 36f\n" + "st1 { v10.8h }, [x13], #0x10\n" + "tbz x16, #2, 34f\n" + "str d11, [x13], #0x8\n" + "tbz x16, #1, 33f\n" + "st1 { v11.s }[2], [x13], #0x4\n" + "tbz x16, #0, 48f\n" + "st1 { v11.h }[6], [x13]\n" + "b 48f\n" + "33:" // Height 1: Partial direct writeback: partial_1_28 + "tbz x16, #0, 48f\n" + "st1 { v11.h }[4], [x13]\n" + "b 48f\n" + "34:" // Height 1: Partial direct writeback: partial_2_24 + "tbz x16, #1, 35f\n" + "str s11, [x13], #0x4\n" + "tbz x16, #0, 48f\n" + "st1 { v11.h }[2], [x13]\n" + "b 48f\n" + "35:" // Height 1: Partial direct writeback: partial_1_24 + "tbz x16, #0, 48f\n" + "str h11, [x13, #0x0]\n" + "b 48f\n" + "36:" // Height 1: Partial direct writeback: partial_4_16 + "tbz x16, #2, 38f\n" + "str 
d10, [x13], #0x8\n" + "tbz x16, #1, 37f\n" + "st1 { v10.s }[2], [x13], #0x4\n" + "tbz x16, #0, 48f\n" + "st1 { v10.h }[6], [x13]\n" + "b 48f\n" + "37:" // Height 1: Partial direct writeback: partial_1_20 + "tbz x16, #0, 48f\n" + "st1 { v10.h }[4], [x13]\n" + "b 48f\n" + "38:" // Height 1: Partial direct writeback: partial_2_16 + "tbz x16, #1, 39f\n" + "str s10, [x13], #0x4\n" + "tbz x16, #0, 48f\n" + "st1 { v10.h }[2], [x13]\n" + "b 48f\n" + "39:" // Height 1: Partial direct writeback: partial_1_16 + "tbz x16, #0, 48f\n" + "str h10, [x13, #0x0]\n" + "b 48f\n" + "40:" // Height 1: Partial direct writeback: partial_8_0 + "tbz x16, #3, 44f\n" + "st1 { v8.8h }, [x13], #0x10\n" + "tbz x16, #2, 42f\n" + "str d9, [x13], #0x8\n" + "tbz x16, #1, 41f\n" + "st1 { v9.s }[2], [x13], #0x4\n" + "tbz x16, #0, 48f\n" + "st1 { v9.h }[6], [x13]\n" + "b 48f\n" + "41:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x16, #0, 48f\n" + "st1 { v9.h }[4], [x13]\n" + "b 48f\n" + "42:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x16, #1, 43f\n" + "str s9, [x13], #0x4\n" + "tbz x16, #0, 48f\n" + "st1 { v9.h }[2], [x13]\n" + "b 48f\n" + "43:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x16, #0, 48f\n" + "str h9, [x13, #0x0]\n" + "b 48f\n" + "44:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x16, #2, 46f\n" + "str d8, [x13], #0x8\n" + "tbz x16, #1, 45f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "tbz x16, #0, 48f\n" + "st1 { v8.h }[6], [x13]\n" + "b 48f\n" + "45:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x16, #0, 48f\n" + "st1 { v8.h }[4], [x13]\n" + "b 48f\n" + "46:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x16, #1, 47f\n" + "str s8, [x13], #0x4\n" + "tbz x16, #0, 48f\n" + "st1 { v8.h }[2], [x13]\n" + "b 48f\n" + "47:" // Height 1: Partial direct writeback: partial_1_0 + "str h8, [x13, #0x0]\n" + "48:" // Height 1: Partial direct writeback: Done + "b 50f\n" + "49:" // Height 1: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + "50:" // Height 1: Writeback done + "subs x16, x16, #0x20\n" + "bgt 3b\n" + "b 302f\n" + "51:" // Height 2 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 52f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #1\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19, LSL #1\n" + "b 53f\n" + "52:" // Height 2: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #1\n" + "53:" // Height 2: Column loop + "cbz x14, 54f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "ldr q10, [x14, #0x20]\n" + "mov v13.16b, v9.16b\n" + "ldr q11, [x14, #0x30]\n" + "mov v14.16b, v10.16b\n" + "add x14, x14, #0x40\n" + "mov v15.16b, v11.16b\n" + "b 73f\n" + "54:" // Height 2: no bias + "tbz %x[flags], #0, 72f\n" + "cmp x16, #0x20\n" + "bge 71f\n" + "tbz x16, #4, 62f\n" + "ld1 { v8.8h }, [x13], #0x10\n" + "ld1 { v12.8h }, [x9], #0x10\n" + "ld1 { v9.8h }, [x13], #0x10\n" + "ld1 { v13.8h }, [x9], #0x10\n" + "tbz x16, #3, 58f\n" + "ld1 { v10.8h }, [x13], #0x10\n" + "ld1 { v14.8h }, [x9], #0x10\n" + "tbz x16, #2, 56f\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "tbz x16, #1, 55f\n" + "mov x19, #0x3c\n" + "ld1 { v11.s }[2], [x13], #0x4\n" + "ld1 { v15.s }[2], [x9], #0x4\n" + "tbz x16, #0, 70f\n" + "ld1 { 
v11.h }[6], [x13]\n" + "ld1 { v15.h }[6], [x9]\n" + "b 70f\n" + "55:" // Height 2: Partial accumulate: partial_1_28 + "mov x19, #0x38\n" + "tbz x16, #0, 70f\n" + "ld1 { v11.h }[4], [x13]\n" + "ld1 { v15.h }[4], [x9]\n" + "b 70f\n" + "56:" // Height 2: Partial accumulate: partial_2_24 + "tbz x16, #1, 57f\n" + "ldr s11, [x13], #0x4\n" + "ldr s15, [x9], #0x4\n" + "mov x19, #0x34\n" + "tbz x16, #0, 70f\n" + "ld1 { v11.h }[2], [x13]\n" + "ld1 { v15.h }[2], [x9]\n" + "b 70f\n" + "57:" // Height 2: Partial accumulate: partial_1_24 + "mov x19, #0x30\n" + "tbz x16, #0, 70f\n" + "ldr h11, [x13, #0x0]\n" + "ldr h15, [x9, #0x0]\n" + "b 70f\n" + "58:" // Height 2: Partial accumulate: partial_4_16 + "tbz x16, #2, 60f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "tbz x16, #1, 59f\n" + "mov x19, #0x2c\n" + "ld1 { v10.s }[2], [x13], #0x4\n" + "ld1 { v14.s }[2], [x9], #0x4\n" + "tbz x16, #0, 70f\n" + "ld1 { v10.h }[6], [x13]\n" + "ld1 { v14.h }[6], [x9]\n" + "b 70f\n" + "59:" // Height 2: Partial accumulate: partial_1_20 + "mov x19, #0x28\n" + "tbz x16, #0, 70f\n" + "ld1 { v10.h }[4], [x13]\n" + "ld1 { v14.h }[4], [x9]\n" + "b 70f\n" + "60:" // Height 2: Partial accumulate: partial_2_16 + "tbz x16, #1, 61f\n" + "ldr s10, [x13], #0x4\n" + "ldr s14, [x9], #0x4\n" + "mov x19, #0x24\n" + "tbz x16, #0, 70f\n" + "ld1 { v10.h }[2], [x13]\n" + "ld1 { v14.h }[2], [x9]\n" + "b 70f\n" + "61:" // Height 2: Partial accumulate: partial_1_16 + "mov x19, #0x20\n" + "tbz x16, #0, 70f\n" + "ldr h10, [x13, #0x0]\n" + "ldr h14, [x9, #0x0]\n" + "b 70f\n" + "62:" // Height 2: Partial accumulate: partial_8_0 + "tbz x16, #3, 66f\n" + "ld1 { v8.8h }, [x13], #0x10\n" + "ld1 { v12.8h }, [x9], #0x10\n" + "tbz x16, #2, 64f\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "tbz x16, #1, 63f\n" + "mov x19, #0x1c\n" + "ld1 { v9.s }[2], [x13], #0x4\n" + "ld1 { v13.s }[2], [x9], #0x4\n" + "tbz x16, #0, 70f\n" + "ld1 { v9.h }[6], [x13]\n" + "ld1 { v13.h }[6], [x9]\n" + "b 70f\n" + "63:" // Height 2: Partial accumulate: partial_1_12 + "mov x19, #0x18\n" + "tbz x16, #0, 70f\n" + "ld1 { v9.h }[4], [x13]\n" + "ld1 { v13.h }[4], [x9]\n" + "b 70f\n" + "64:" // Height 2: Partial accumulate: partial_2_8 + "tbz x16, #1, 65f\n" + "ldr s9, [x13], #0x4\n" + "ldr s13, [x9], #0x4\n" + "mov x19, #0x14\n" + "tbz x16, #0, 70f\n" + "ld1 { v9.h }[2], [x13]\n" + "ld1 { v13.h }[2], [x9]\n" + "b 70f\n" + "65:" // Height 2: Partial accumulate: partial_1_8 + "mov x19, #0x10\n" + "tbz x16, #0, 70f\n" + "ldr h9, [x13, #0x0]\n" + "ldr h13, [x9, #0x0]\n" + "b 70f\n" + "66:" // Height 2: Partial accumulate: partial_4_0 + "tbz x16, #2, 68f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "tbz x16, #1, 67f\n" + "mov x19, #0xc\n" + "ld1 { v8.s }[2], [x13], #0x4\n" + "ld1 { v12.s }[2], [x9], #0x4\n" + "tbz x16, #0, 70f\n" + "ld1 { v8.h }[6], [x13]\n" + "ld1 { v12.h }[6], [x9]\n" + "b 70f\n" + "67:" // Height 2: Partial accumulate: partial_1_4 + "mov x19, #0x8\n" + "tbz x16, #0, 70f\n" + "ld1 { v8.h }[4], [x13]\n" + "ld1 { v12.h }[4], [x9]\n" + "b 70f\n" + "68:" // Height 2: Partial accumulate: partial_2_0 + "tbz x16, #1, 69f\n" + "ldr s8, [x13], #0x4\n" + "ldr s12, [x9], #0x4\n" + "mov x19, #0x4\n" + "tbz x16, #0, 70f\n" + "ld1 { v8.h }[2], [x13]\n" + "ld1 { v12.h }[2], [x9]\n" + "b 70f\n" + "69:" // Height 2: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr h8, [x13, #0x0]\n" + "ldr h12, [x9, #0x0]\n" + "70:" // Height 2: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "b 73f\n" + "71:" // Height 2: full 
accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "b 73f\n" + "72:" // Height 2: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "73:" // Height 2: setup done + "mov x12, #0x0\n" + "74:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 75f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x12, 76f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "b 76f\n" + "75:" // Height 2: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "76:" // Height 2: input setup done + "cmp x11, #0x8\n" + "blt 79f\n" + "cmp x11, #0x10\n" + "blt 78f\n" + "77:" // Height 2: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "add x10, x10, #0x10\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "sub x11, x11, #0x8\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "cmp x11, #0x10\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "ldr q6, [x15, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "ldr q7, [x15, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "ldr q6, [x15, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "ldr q7, [x15, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "ldr q6, [x15, #0x160]\n" + 
"fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "ldr q6, [x15, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "ldr q7, [x15, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "ldr q6, [x15, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "ldr q7, [x15, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "ldr q6, [x15, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "ldr q7, [x15, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "ldr q6, [x15, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "ldr q7, [x15, #0x1f0]\n" + "add x15, x15, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "bge 77b\n" + "78:" // Height 2: Multiply loop: Single iteration only + "sub x11, x11, #0x8\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "add x10, x10, #0x10\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "ldr q6, [x15, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "ldr q7, [x15, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "ldr q6, [x15, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "ldr q7, [x15, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "ldr q6, [x15, #0x180]\n" + "fmla v11.8h, v7.8h, 
v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "ldr q7, [x15, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "ldr q6, [x15, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "ldr q7, [x15, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "ldr q6, [x15, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "ldr q7, [x15, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "ldr q6, [x15, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "ldr q7, [x15, #0x1f0]\n" + "add x15, x15, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "79:" // Height 2: Multiply loop: Main loop skip + "cbz x11, 81f\n" + "80:" // Height 2: Multiply loop: Odd block loop + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "sub x11, x11, #0x1\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "cbnz x11, 80b\n" + "81:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 74b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "tbz %x[flags], #1, 82f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.8h }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.8h }, [x19]\n" + "fmin v8.8h, v8.8h, v0.8h\n" + "fmin v9.8h, v9.8h, v0.8h\n" + "fmin v10.8h, v10.8h, v0.8h\n" + "fmin v11.8h, v11.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v0.8h\n" + "fmin v13.8h, v13.8h, v0.8h\n" + "fmin v14.8h, v14.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "82:" // Height 2: No activation + "cmp x16, #0x20\n" + "bge 99f\n" + "tbz x16, #4, 90f\n" + "st1 { v8.8h }, [x13], #0x10\n" + "st1 { v9.8h }, [x13], #0x10\n" + "st1 { v12.8h }, [x9], #0x10\n" + "st1 { v13.8h }, [x9], #0x10\n" + "tbz x16, #3, 86f\n" + "st1 { v10.8h }, [x13], #0x10\n" + "st1 { v14.8h }, [x9], #0x10\n" + "tbz x16, #2, 84f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "tbz x16, #1, 83f\n" + "st1 { v11.s }[2], [x13], #0x4\n" + "st1 { v15.s }[2], [x9], #0x4\n" + "tbz x16, #0, 98f\n" + "st1 { v11.h }[6], [x13]\n" + "st1 { v15.h }[6], [x9]\n" + "b 98f\n" + "83:" // Height 2: Partial direct writeback: partial_1_28 + "tbz x16, #0, 98f\n" + "st1 { v11.h }[4], [x13]\n" + "st1 { v15.h }[4], [x9]\n" + "b 98f\n" + "84:" // Height 2: Partial direct writeback: partial_2_24 + "tbz x16, #1, 85f\n" + "str s11, [x13], #0x4\n" + "str s15, [x9], #0x4\n" + "tbz x16, #0, 98f\n" + "st1 { v11.h }[2], [x13]\n" + "st1 { v15.h }[2], [x9]\n" + "b 98f\n" + "85:" // Height 2: Partial direct writeback: partial_1_24 + "tbz x16, #0, 98f\n" + "str h11, [x13, #0x0]\n" + "str h15, [x9, #0x0]\n" + "b 98f\n" + "86:" // Height 2: Partial direct writeback: partial_4_16 + "tbz x16, #2, 88f\n" 
+ "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "tbz x16, #1, 87f\n" + "st1 { v10.s }[2], [x13], #0x4\n" + "st1 { v14.s }[2], [x9], #0x4\n" + "tbz x16, #0, 98f\n" + "st1 { v10.h }[6], [x13]\n" + "st1 { v14.h }[6], [x9]\n" + "b 98f\n" + "87:" // Height 2: Partial direct writeback: partial_1_20 + "tbz x16, #0, 98f\n" + "st1 { v10.h }[4], [x13]\n" + "st1 { v14.h }[4], [x9]\n" + "b 98f\n" + "88:" // Height 2: Partial direct writeback: partial_2_16 + "tbz x16, #1, 89f\n" + "str s10, [x13], #0x4\n" + "str s14, [x9], #0x4\n" + "tbz x16, #0, 98f\n" + "st1 { v10.h }[2], [x13]\n" + "st1 { v14.h }[2], [x9]\n" + "b 98f\n" + "89:" // Height 2: Partial direct writeback: partial_1_16 + "tbz x16, #0, 98f\n" + "str h10, [x13, #0x0]\n" + "str h14, [x9, #0x0]\n" + "b 98f\n" + "90:" // Height 2: Partial direct writeback: partial_8_0 + "tbz x16, #3, 94f\n" + "st1 { v8.8h }, [x13], #0x10\n" + "st1 { v12.8h }, [x9], #0x10\n" + "tbz x16, #2, 92f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "tbz x16, #1, 91f\n" + "st1 { v9.s }[2], [x13], #0x4\n" + "st1 { v13.s }[2], [x9], #0x4\n" + "tbz x16, #0, 98f\n" + "st1 { v9.h }[6], [x13]\n" + "st1 { v13.h }[6], [x9]\n" + "b 98f\n" + "91:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x16, #0, 98f\n" + "st1 { v9.h }[4], [x13]\n" + "st1 { v13.h }[4], [x9]\n" + "b 98f\n" + "92:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x16, #1, 93f\n" + "str s9, [x13], #0x4\n" + "str s13, [x9], #0x4\n" + "tbz x16, #0, 98f\n" + "st1 { v9.h }[2], [x13]\n" + "st1 { v13.h }[2], [x9]\n" + "b 98f\n" + "93:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x16, #0, 98f\n" + "str h9, [x13, #0x0]\n" + "str h13, [x9, #0x0]\n" + "b 98f\n" + "94:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x16, #2, 96f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "tbz x16, #1, 95f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], [x9], #0x4\n" + "tbz x16, #0, 98f\n" + "st1 { v8.h }[6], [x13]\n" + "st1 { v12.h }[6], [x9]\n" + "b 98f\n" + "95:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x16, #0, 98f\n" + "st1 { v8.h }[4], [x13]\n" + "st1 { v12.h }[4], [x9]\n" + "b 98f\n" + "96:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x16, #1, 97f\n" + "str s8, [x13], #0x4\n" + "str s12, [x9], #0x4\n" + "tbz x16, #0, 98f\n" + "st1 { v8.h }[2], [x13]\n" + "st1 { v12.h }[2], [x9]\n" + "b 98f\n" + "97:" // Height 2: Partial direct writeback: partial_1_0 + "str h8, [x13, #0x0]\n" + "str h12, [x9, #0x0]\n" + "98:" // Height 2: Partial direct writeback: Done + "b 100f\n" + "99:" // Height 2: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "100:" // Height 2: Writeback done + "subs x16, x16, #0x20\n" + "bgt 53b\n" + "b 302f\n" + "101:" // Height 3 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 102f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #1\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #1\n" + "add x27, x27, x19, LSL #1\n" + "b 103f\n" + "102:" // Height 3: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #1\n" + "add x27, x9, x19, LSL 
#1\n" + "103:" // Height 3: Column loop + "cbz x14, 104f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "mov v16.16b, v8.16b\n" + "ldr q10, [x14, #0x20]\n" + "ldr q11, [x14, #0x30]\n" + "mov v13.16b, v9.16b\n" + "add x14, x14, #0x40\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "b 123f\n" + "104:" // Height 3: no bias + "tbz %x[flags], #0, 122f\n" + "cmp x16, #0x20\n" + "bge 121f\n" + "tbz x16, #4, 112f\n" + "ld1 { v8.8h }, [x13], #0x10\n" + "ld1 { v12.8h }, [x9], #0x10\n" + "ld1 { v16.8h }, [x27], #0x10\n" + "ld1 { v9.8h }, [x13], #0x10\n" + "ld1 { v13.8h }, [x9], #0x10\n" + "ld1 { v17.8h }, [x27], #0x10\n" + "tbz x16, #3, 108f\n" + "ld1 { v10.8h }, [x13], #0x10\n" + "ld1 { v14.8h }, [x9], #0x10\n" + "ld1 { v18.8h }, [x27], #0x10\n" + "tbz x16, #2, 106f\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "tbz x16, #1, 105f\n" + "mov x19, #0x3c\n" + "ld1 { v11.s }[2], [x13], #0x4\n" + "ld1 { v15.s }[2], [x9], #0x4\n" + "ld1 { v19.s }[2], [x27], #0x4\n" + "tbz x16, #0, 120f\n" + "ld1 { v11.h }[6], [x13]\n" + "ld1 { v15.h }[6], [x9]\n" + "ld1 { v19.h }[6], [x27]\n" + "b 120f\n" + "105:" // Height 3: Partial accumulate: partial_1_28 + "mov x19, #0x38\n" + "tbz x16, #0, 120f\n" + "ld1 { v11.h }[4], [x13]\n" + "ld1 { v15.h }[4], [x9]\n" + "ld1 { v19.h }[4], [x27]\n" + "b 120f\n" + "106:" // Height 3: Partial accumulate: partial_2_24 + "tbz x16, #1, 107f\n" + "ldr s11, [x13], #0x4\n" + "ldr s15, [x9], #0x4\n" + "ldr s19, [x27], #0x4\n" + "mov x19, #0x34\n" + "tbz x16, #0, 120f\n" + "ld1 { v11.h }[2], [x13]\n" + "ld1 { v15.h }[2], [x9]\n" + "ld1 { v19.h }[2], [x27]\n" + "b 120f\n" + "107:" // Height 3: Partial accumulate: partial_1_24 + "mov x19, #0x30\n" + "tbz x16, #0, 120f\n" + "ldr h11, [x13, #0x0]\n" + "ldr h15, [x9, #0x0]\n" + "ldr h19, [x27, #0x0]\n" + "b 120f\n" + "108:" // Height 3: Partial accumulate: partial_4_16 + "tbz x16, #2, 110f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "tbz x16, #1, 109f\n" + "mov x19, #0x2c\n" + "ld1 { v10.s }[2], [x13], #0x4\n" + "ld1 { v14.s }[2], [x9], #0x4\n" + "ld1 { v18.s }[2], [x27], #0x4\n" + "tbz x16, #0, 120f\n" + "ld1 { v10.h }[6], [x13]\n" + "ld1 { v14.h }[6], [x9]\n" + "ld1 { v18.h }[6], [x27]\n" + "b 120f\n" + "109:" // Height 3: Partial accumulate: partial_1_20 + "mov x19, #0x28\n" + "tbz x16, #0, 120f\n" + "ld1 { v10.h }[4], [x13]\n" + "ld1 { v14.h }[4], [x9]\n" + "ld1 { v18.h }[4], [x27]\n" + "b 120f\n" + "110:" // Height 3: Partial accumulate: partial_2_16 + "tbz x16, #1, 111f\n" + "ldr s10, [x13], #0x4\n" + "ldr s14, [x9], #0x4\n" + "ldr s18, [x27], #0x4\n" + "mov x19, #0x24\n" + "tbz x16, #0, 120f\n" + "ld1 { v10.h }[2], [x13]\n" + "ld1 { v14.h }[2], [x9]\n" + "ld1 { v18.h }[2], [x27]\n" + "b 120f\n" + "111:" // Height 3: Partial accumulate: partial_1_16 + "mov x19, #0x20\n" + "tbz x16, #0, 120f\n" + "ldr h10, [x13, #0x0]\n" + "ldr h14, [x9, #0x0]\n" + "ldr h18, [x27, #0x0]\n" + "b 120f\n" + "112:" // Height 3: Partial accumulate: partial_8_0 + "tbz x16, #3, 116f\n" + "ld1 { v8.8h }, [x13], #0x10\n" + "ld1 { v12.8h }, [x9], #0x10\n" + "ld1 { v16.8h }, [x27], #0x10\n" + "tbz x16, #2, 114f\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "tbz x16, #1, 113f\n" + "mov x19, #0x1c\n" + "ld1 { v9.s }[2], [x13], #0x4\n" + "ld1 { v13.s }[2], [x9], #0x4\n" + "ld1 { v17.s }[2], [x27], #0x4\n" + "tbz x16, #0, 120f\n" 
+ "ld1 { v9.h }[6], [x13]\n" + "ld1 { v13.h }[6], [x9]\n" + "ld1 { v17.h }[6], [x27]\n" + "b 120f\n" + "113:" // Height 3: Partial accumulate: partial_1_12 + "mov x19, #0x18\n" + "tbz x16, #0, 120f\n" + "ld1 { v9.h }[4], [x13]\n" + "ld1 { v13.h }[4], [x9]\n" + "ld1 { v17.h }[4], [x27]\n" + "b 120f\n" + "114:" // Height 3: Partial accumulate: partial_2_8 + "tbz x16, #1, 115f\n" + "ldr s9, [x13], #0x4\n" + "ldr s13, [x9], #0x4\n" + "ldr s17, [x27], #0x4\n" + "mov x19, #0x14\n" + "tbz x16, #0, 120f\n" + "ld1 { v9.h }[2], [x13]\n" + "ld1 { v13.h }[2], [x9]\n" + "ld1 { v17.h }[2], [x27]\n" + "b 120f\n" + "115:" // Height 3: Partial accumulate: partial_1_8 + "mov x19, #0x10\n" + "tbz x16, #0, 120f\n" + "ldr h9, [x13, #0x0]\n" + "ldr h13, [x9, #0x0]\n" + "ldr h17, [x27, #0x0]\n" + "b 120f\n" + "116:" // Height 3: Partial accumulate: partial_4_0 + "tbz x16, #2, 118f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "tbz x16, #1, 117f\n" + "mov x19, #0xc\n" + "ld1 { v8.s }[2], [x13], #0x4\n" + "ld1 { v12.s }[2], [x9], #0x4\n" + "ld1 { v16.s }[2], [x27], #0x4\n" + "tbz x16, #0, 120f\n" + "ld1 { v8.h }[6], [x13]\n" + "ld1 { v12.h }[6], [x9]\n" + "ld1 { v16.h }[6], [x27]\n" + "b 120f\n" + "117:" // Height 3: Partial accumulate: partial_1_4 + "mov x19, #0x8\n" + "tbz x16, #0, 120f\n" + "ld1 { v8.h }[4], [x13]\n" + "ld1 { v12.h }[4], [x9]\n" + "ld1 { v16.h }[4], [x27]\n" + "b 120f\n" + "118:" // Height 3: Partial accumulate: partial_2_0 + "tbz x16, #1, 119f\n" + "ldr s8, [x13], #0x4\n" + "ldr s12, [x9], #0x4\n" + "ldr s16, [x27], #0x4\n" + "mov x19, #0x4\n" + "tbz x16, #0, 120f\n" + "ld1 { v8.h }[2], [x13]\n" + "ld1 { v12.h }[2], [x9]\n" + "ld1 { v16.h }[2], [x27]\n" + "b 120f\n" + "119:" // Height 3: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr h8, [x13, #0x0]\n" + "ldr h12, [x9, #0x0]\n" + "ldr h16, [x27, #0x0]\n" + "120:" // Height 3: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "b 123f\n" + "121:" // Height 3: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "b 123f\n" + "122:" // Height 3: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "123:" // Height 3: setup done + "mov x12, #0x0\n" + "124:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 125f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x12, 126f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "b 126f\n" + "125:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "126:" // Height 3: input setup done + "cmp x11, #0x8\n" + "blt 129f\n" + 
"cmp x11, #0x10\n" + "blt 128f\n" + "127:" // Height 3: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x28, x28, #0x10\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "add x26, x26, #0x10\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "sub x11, x11, #0x8\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "cmp x11, #0x10\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "ldr q6, [x15, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "ldr q7, [x15, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "ldr q6, [x15, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "ldr q7, [x15, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "ldr q6, [x15, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "ldr q7, [x15, #0x190]\n" + "fmla v8.8h, 
v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "ldr q6, [x15, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "ldr q7, [x15, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "ldr q6, [x15, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "ldr q7, [x15, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "ldr q6, [x15, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "ldr q7, [x15, #0x1f0]\n" + "add x15, x15, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "bge 127b\n" + "128:" // Height 3: Multiply loop: Single iteration only + "sub x11, x11, #0x8\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x28, x28, #0x10\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "add x26, x26, #0x10\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "ldr q6, [x15, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "ldr q7, [x15, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, 
v2.h[4]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "ldr q6, [x15, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "ldr q7, [x15, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "ldr q6, [x15, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "ldr q7, [x15, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "ldr q6, [x15, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "ldr q7, [x15, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "ldr q6, [x15, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "ldr q7, [x15, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "ldr q6, [x15, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "ldr q7, [x15, #0x1f0]\n" + "add x15, x15, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "129:" // Height 3: Multiply loop: Main loop skip + "cbz x11, 131f\n" + "130:" // Height 3: Multiply loop: Odd block loop + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "sub x11, x11, #0x1\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "cbnz x11, 130b\n" + "131:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 124b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "tbz %x[flags], #1, 132f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.8h }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.8h }, [x19]\n" + "fmin v8.8h, v8.8h, v0.8h\n" + "fmin v9.8h, v9.8h, v0.8h\n" + "fmin v10.8h, v10.8h, v0.8h\n" + "fmin v11.8h, v11.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v0.8h\n" + "fmin v13.8h, v13.8h, v0.8h\n" + "fmin v14.8h, v14.8h, v0.8h\n" + "fmax v12.8h, 
v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v0.8h\n" + "fmin v16.8h, v16.8h, v0.8h\n" + "fmin v17.8h, v17.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "fmax v16.8h, v16.8h, v1.8h\n" + "fmax v17.8h, v17.8h, v1.8h\n" + "fmin v18.8h, v18.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v0.8h\n" + "fmax v18.8h, v18.8h, v1.8h\n" + "fmax v19.8h, v19.8h, v1.8h\n" + "132:" // Height 3: No activation + "cmp x16, #0x20\n" + "bge 149f\n" + "tbz x16, #4, 140f\n" + "st1 { v8.8h }, [x13], #0x10\n" + "st1 { v9.8h }, [x13], #0x10\n" + "st1 { v12.8h }, [x9], #0x10\n" + "st1 { v13.8h }, [x9], #0x10\n" + "st1 { v16.8h }, [x27], #0x10\n" + "st1 { v17.8h }, [x27], #0x10\n" + "tbz x16, #3, 136f\n" + "st1 { v10.8h }, [x13], #0x10\n" + "st1 { v14.8h }, [x9], #0x10\n" + "st1 { v18.8h }, [x27], #0x10\n" + "tbz x16, #2, 134f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "tbz x16, #1, 133f\n" + "st1 { v11.s }[2], [x13], #0x4\n" + "st1 { v15.s }[2], [x9], #0x4\n" + "st1 { v19.s }[2], [x27], #0x4\n" + "tbz x16, #0, 148f\n" + "st1 { v11.h }[6], [x13]\n" + "st1 { v15.h }[6], [x9]\n" + "st1 { v19.h }[6], [x27]\n" + "b 148f\n" + "133:" // Height 3: Partial direct writeback: partial_1_28 + "tbz x16, #0, 148f\n" + "st1 { v11.h }[4], [x13]\n" + "st1 { v15.h }[4], [x9]\n" + "st1 { v19.h }[4], [x27]\n" + "b 148f\n" + "134:" // Height 3: Partial direct writeback: partial_2_24 + "tbz x16, #1, 135f\n" + "str s11, [x13], #0x4\n" + "str s15, [x9], #0x4\n" + "str s19, [x27], #0x4\n" + "tbz x16, #0, 148f\n" + "st1 { v11.h }[2], [x13]\n" + "st1 { v15.h }[2], [x9]\n" + "st1 { v19.h }[2], [x27]\n" + "b 148f\n" + "135:" // Height 3: Partial direct writeback: partial_1_24 + "tbz x16, #0, 148f\n" + "str h11, [x13, #0x0]\n" + "str h15, [x9, #0x0]\n" + "str h19, [x27, #0x0]\n" + "b 148f\n" + "136:" // Height 3: Partial direct writeback: partial_4_16 + "tbz x16, #2, 138f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "tbz x16, #1, 137f\n" + "st1 { v10.s }[2], [x13], #0x4\n" + "st1 { v14.s }[2], [x9], #0x4\n" + "st1 { v18.s }[2], [x27], #0x4\n" + "tbz x16, #0, 148f\n" + "st1 { v10.h }[6], [x13]\n" + "st1 { v14.h }[6], [x9]\n" + "st1 { v18.h }[6], [x27]\n" + "b 148f\n" + "137:" // Height 3: Partial direct writeback: partial_1_20 + "tbz x16, #0, 148f\n" + "st1 { v10.h }[4], [x13]\n" + "st1 { v14.h }[4], [x9]\n" + "st1 { v18.h }[4], [x27]\n" + "b 148f\n" + "138:" // Height 3: Partial direct writeback: partial_2_16 + "tbz x16, #1, 139f\n" + "str s10, [x13], #0x4\n" + "str s14, [x9], #0x4\n" + "str s18, [x27], #0x4\n" + "tbz x16, #0, 148f\n" + "st1 { v10.h }[2], [x13]\n" + "st1 { v14.h }[2], [x9]\n" + "st1 { v18.h }[2], [x27]\n" + "b 148f\n" + "139:" // Height 3: Partial direct writeback: partial_1_16 + "tbz x16, #0, 148f\n" + "str h10, [x13, #0x0]\n" + "str h14, [x9, #0x0]\n" + "str h18, [x27, #0x0]\n" + "b 148f\n" + "140:" // Height 3: Partial direct writeback: partial_8_0 + "tbz x16, #3, 144f\n" + "st1 { v8.8h }, [x13], #0x10\n" + "st1 { v12.8h }, [x9], #0x10\n" + "st1 { v16.8h }, [x27], #0x10\n" + "tbz x16, #2, 142f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "tbz x16, #1, 141f\n" + "st1 { v9.s }[2], [x13], #0x4\n" + "st1 { v13.s }[2], [x9], #0x4\n" + "st1 { v17.s }[2], [x27], #0x4\n" + "tbz x16, #0, 148f\n" + "st1 { v9.h }[6], [x13]\n" + "st1 { v13.h }[6], [x9]\n" + "st1 { v17.h }[6], [x27]\n" + "b 148f\n" + "141:" // Height 3: Partial direct writeback: partial_1_12 + "tbz 
x16, #0, 148f\n" + "st1 { v9.h }[4], [x13]\n" + "st1 { v13.h }[4], [x9]\n" + "st1 { v17.h }[4], [x27]\n" + "b 148f\n" + "142:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x16, #1, 143f\n" + "str s9, [x13], #0x4\n" + "str s13, [x9], #0x4\n" + "str s17, [x27], #0x4\n" + "tbz x16, #0, 148f\n" + "st1 { v9.h }[2], [x13]\n" + "st1 { v13.h }[2], [x9]\n" + "st1 { v17.h }[2], [x27]\n" + "b 148f\n" + "143:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x16, #0, 148f\n" + "str h9, [x13, #0x0]\n" + "str h13, [x9, #0x0]\n" + "str h17, [x27, #0x0]\n" + "b 148f\n" + "144:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x16, #2, 146f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "tbz x16, #1, 145f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], [x9], #0x4\n" + "st1 { v16.s }[2], [x27], #0x4\n" + "tbz x16, #0, 148f\n" + "st1 { v8.h }[6], [x13]\n" + "st1 { v12.h }[6], [x9]\n" + "st1 { v16.h }[6], [x27]\n" + "b 148f\n" + "145:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x16, #0, 148f\n" + "st1 { v8.h }[4], [x13]\n" + "st1 { v12.h }[4], [x9]\n" + "st1 { v16.h }[4], [x27]\n" + "b 148f\n" + "146:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x16, #1, 147f\n" + "str s8, [x13], #0x4\n" + "str s12, [x9], #0x4\n" + "str s16, [x27], #0x4\n" + "tbz x16, #0, 148f\n" + "st1 { v8.h }[2], [x13]\n" + "st1 { v12.h }[2], [x9]\n" + "st1 { v16.h }[2], [x27]\n" + "b 148f\n" + "147:" // Height 3: Partial direct writeback: partial_1_0 + "str h8, [x13, #0x0]\n" + "str h12, [x9, #0x0]\n" + "str h16, [x27, #0x0]\n" + "148:" // Height 3: Partial direct writeback: Done + "b 150f\n" + "149:" // Height 3: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "150:" // Height 3: Writeback done + "subs x16, x16, #0x20\n" + "bgt 103b\n" + "b 302f\n" + "151:" // Height 4 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 152f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #1\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #1\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "add x27, x27, x19, LSL #1\n" + "add x25, x25, x19, LSL #1\n" + "b 153f\n" + "152:" // Height 4: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #1\n" + "add x27, x9, x19, LSL #1\n" + "add x25, x27, x19, LSL #1\n" + "153:" // Height 4: Column loop + "cbz x14, 154f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "mov v16.16b, v8.16b\n" + "ldr q10, [x14, #0x20]\n" + "mov v20.16b, v8.16b\n" + "ldr q11, [x14, #0x30]\n" + "add x14, x14, #0x40\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" + "b 173f\n" + "154:" // Height 4: no bias + "tbz %x[flags], #0, 172f\n" + "cmp x16, #0x20\n" + "bge 171f\n" + "tbz x16, #4, 162f\n" + "ld1 { v8.8h }, 
[x13], #0x10\n" + "ld1 { v12.8h }, [x9], #0x10\n" + "ld1 { v16.8h }, [x27], #0x10\n" + "ld1 { v20.8h }, [x25], #0x10\n" + "ld1 { v9.8h }, [x13], #0x10\n" + "ld1 { v13.8h }, [x9], #0x10\n" + "ld1 { v17.8h }, [x27], #0x10\n" + "ld1 { v21.8h }, [x25], #0x10\n" + "tbz x16, #3, 158f\n" + "ld1 { v10.8h }, [x13], #0x10\n" + "ld1 { v14.8h }, [x9], #0x10\n" + "ld1 { v18.8h }, [x27], #0x10\n" + "ld1 { v22.8h }, [x25], #0x10\n" + "tbz x16, #2, 156f\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "tbz x16, #1, 155f\n" + "mov x19, #0x3c\n" + "ld1 { v11.s }[2], [x13], #0x4\n" + "ld1 { v15.s }[2], [x9], #0x4\n" + "ld1 { v19.s }[2], [x27], #0x4\n" + "ld1 { v23.s }[2], [x25], #0x4\n" + "tbz x16, #0, 170f\n" + "ld1 { v11.h }[6], [x13]\n" + "ld1 { v15.h }[6], [x9]\n" + "ld1 { v19.h }[6], [x27]\n" + "ld1 { v23.h }[6], [x25]\n" + "b 170f\n" + "155:" // Height 4: Partial accumulate: partial_1_28 + "mov x19, #0x38\n" + "tbz x16, #0, 170f\n" + "ld1 { v11.h }[4], [x13]\n" + "ld1 { v15.h }[4], [x9]\n" + "ld1 { v19.h }[4], [x27]\n" + "ld1 { v23.h }[4], [x25]\n" + "b 170f\n" + "156:" // Height 4: Partial accumulate: partial_2_24 + "tbz x16, #1, 157f\n" + "ldr s11, [x13], #0x4\n" + "ldr s15, [x9], #0x4\n" + "ldr s19, [x27], #0x4\n" + "ldr s23, [x25], #0x4\n" + "mov x19, #0x34\n" + "tbz x16, #0, 170f\n" + "ld1 { v11.h }[2], [x13]\n" + "ld1 { v15.h }[2], [x9]\n" + "ld1 { v19.h }[2], [x27]\n" + "ld1 { v23.h }[2], [x25]\n" + "b 170f\n" + "157:" // Height 4: Partial accumulate: partial_1_24 + "mov x19, #0x30\n" + "tbz x16, #0, 170f\n" + "ldr h11, [x13, #0x0]\n" + "ldr h15, [x9, #0x0]\n" + "ldr h19, [x27, #0x0]\n" + "ldr h23, [x25, #0x0]\n" + "b 170f\n" + "158:" // Height 4: Partial accumulate: partial_4_16 + "tbz x16, #2, 160f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "tbz x16, #1, 159f\n" + "ld1 { v10.s }[2], [x13], #0x4\n" + "ld1 { v14.s }[2], [x9], #0x4\n" + "ld1 { v18.s }[2], [x27], #0x4\n" + "ld1 { v22.s }[2], [x25], #0x4\n" + "mov x19, #0x2c\n" + "tbz x16, #0, 170f\n" + "ld1 { v10.h }[6], [x13]\n" + "ld1 { v14.h }[6], [x9]\n" + "ld1 { v18.h }[6], [x27]\n" + "ld1 { v22.h }[6], [x25]\n" + "b 170f\n" + "159:" // Height 4: Partial accumulate: partial_1_20 + "mov x19, #0x28\n" + "tbz x16, #0, 170f\n" + "ld1 { v10.h }[4], [x13]\n" + "ld1 { v14.h }[4], [x9]\n" + "ld1 { v18.h }[4], [x27]\n" + "ld1 { v22.h }[4], [x25]\n" + "b 170f\n" + "160:" // Height 4: Partial accumulate: partial_2_16 + "tbz x16, #1, 161f\n" + "ldr s10, [x13], #0x4\n" + "ldr s14, [x9], #0x4\n" + "ldr s18, [x27], #0x4\n" + "ldr s22, [x25], #0x4\n" + "mov x19, #0x24\n" + "tbz x16, #0, 170f\n" + "ld1 { v10.h }[2], [x13]\n" + "ld1 { v14.h }[2], [x9]\n" + "ld1 { v18.h }[2], [x27]\n" + "ld1 { v22.h }[2], [x25]\n" + "b 170f\n" + "161:" // Height 4: Partial accumulate: partial_1_16 + "mov x19, #0x20\n" + "tbz x16, #0, 170f\n" + "ldr h10, [x13, #0x0]\n" + "ldr h14, [x9, #0x0]\n" + "ldr h18, [x27, #0x0]\n" + "ldr h22, [x25, #0x0]\n" + "b 170f\n" + "162:" // Height 4: Partial accumulate: partial_8_0 + "tbz x16, #3, 166f\n" + "ld1 { v8.8h }, [x13], #0x10\n" + "ld1 { v12.8h }, [x9], #0x10\n" + "ld1 { v16.8h }, [x27], #0x10\n" + "ld1 { v20.8h }, [x25], #0x10\n" + "tbz x16, #2, 164f\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "tbz x16, #1, 163f\n" + "mov x19, #0x1c\n" + "ld1 { v9.s }[2], [x13], #0x4\n" + "ld1 { v13.s }[2], [x9], #0x4\n" + "ld1 { v17.s }[2], [x27], #0x4\n" + 
"ld1 { v21.s }[2], [x25], #0x4\n" + "tbz x16, #0, 170f\n" + "ld1 { v9.h }[6], [x13]\n" + "ld1 { v13.h }[6], [x9]\n" + "ld1 { v17.h }[6], [x27]\n" + "ld1 { v21.h }[6], [x25]\n" + "b 170f\n" + "163:" // Height 4: Partial accumulate: partial_1_12 + "mov x19, #0x18\n" + "tbz x16, #0, 170f\n" + "ld1 { v9.h }[4], [x13]\n" + "ld1 { v13.h }[4], [x9]\n" + "ld1 { v17.h }[4], [x27]\n" + "ld1 { v21.h }[4], [x25]\n" + "b 170f\n" + "164:" // Height 4: Partial accumulate: partial_2_8 + "tbz x16, #1, 165f\n" + "ldr s9, [x13], #0x4\n" + "ldr s13, [x9], #0x4\n" + "ldr s17, [x27], #0x4\n" + "ldr s21, [x25], #0x4\n" + "mov x19, #0x14\n" + "tbz x16, #0, 170f\n" + "ld1 { v9.h }[2], [x13]\n" + "ld1 { v13.h }[2], [x9]\n" + "ld1 { v17.h }[2], [x27]\n" + "ld1 { v21.h }[2], [x25]\n" + "b 170f\n" + "165:" // Height 4: Partial accumulate: partial_1_8 + "mov x19, #0x10\n" + "tbz x16, #0, 170f\n" + "ldr h9, [x13, #0x0]\n" + "ldr h13, [x9, #0x0]\n" + "ldr h17, [x27, #0x0]\n" + "ldr h21, [x25, #0x0]\n" + "b 170f\n" + "166:" // Height 4: Partial accumulate: partial_4_0 + "tbz x16, #2, 168f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "tbz x16, #1, 167f\n" + "ld1 { v8.s }[2], [x13], #0x4\n" + "ld1 { v12.s }[2], [x9], #0x4\n" + "ld1 { v16.s }[2], [x27], #0x4\n" + "ld1 { v20.s }[2], [x25], #0x4\n" + "mov x19, #0xc\n" + "tbz x16, #0, 170f\n" + "ld1 { v8.h }[6], [x13]\n" + "ld1 { v12.h }[6], [x9]\n" + "ld1 { v16.h }[6], [x27]\n" + "ld1 { v20.h }[6], [x25]\n" + "b 170f\n" + "167:" // Height 4: Partial accumulate: partial_1_4 + "mov x19, #0x8\n" + "tbz x16, #0, 170f\n" + "ld1 { v8.h }[4], [x13]\n" + "ld1 { v12.h }[4], [x9]\n" + "ld1 { v16.h }[4], [x27]\n" + "ld1 { v20.h }[4], [x25]\n" + "b 170f\n" + "168:" // Height 4: Partial accumulate: partial_2_0 + "tbz x16, #1, 169f\n" + "ldr s8, [x13], #0x4\n" + "ldr s12, [x9], #0x4\n" + "ldr s16, [x27], #0x4\n" + "ldr s20, [x25], #0x4\n" + "mov x19, #0x4\n" + "tbz x16, #0, 170f\n" + "ld1 { v8.h }[2], [x13]\n" + "ld1 { v12.h }[2], [x9]\n" + "ld1 { v16.h }[2], [x27]\n" + "ld1 { v20.h }[2], [x25]\n" + "b 170f\n" + "169:" // Height 4: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr h8, [x13, #0x0]\n" + "ldr h12, [x9, #0x0]\n" + "ldr h16, [x27, #0x0]\n" + "ldr h20, [x25, #0x0]\n" + "170:" // Height 4: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "b 173f\n" + "171:" // Height 4: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "b 173f\n" + "172:" // Height 4: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "173:" // Height 4: setup done + "mov x12, #0x0\n" + "174:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], 
%[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 175f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x12, 176f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "b 176f\n" + "175:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "176:" // Height 4: input setup done + "cmp x11, #0x8\n" + "blt 179f\n" + "cmp x11, #0x10\n" + "blt 178f\n" + "177:" // Height 4: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "add x28, x28, #0x10\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "add x24, x24, #0x10\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sub x11, x11, #0x8\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "cmp x11, #0x10\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "fmla v20.8h, v6.8h, v3.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "fmla v21.8h, v7.8h, v3.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "fmla v22.8h, v6.8h, v3.h[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "fmla v23.8h, v7.8h, v3.h[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "fmla v20.8h, v6.8h, v3.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "fmla v21.8h, v7.8h, v3.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "fmla v22.8h, v6.8h, v3.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "fmla v23.8h, v7.8h, v3.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "fmla v20.8h, v6.8h, v3.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "fmla v21.8h, v7.8h, v3.h[3]\n" + "ldr q7, [x15, #0xf0]\n" 
+ "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "fmla v22.8h, v6.8h, v3.h[3]\n" + "ldr q6, [x15, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "fmla v23.8h, v7.8h, v3.h[3]\n" + "ldr q7, [x15, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "fmla v20.8h, v6.8h, v3.h[4]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "fmla v21.8h, v7.8h, v3.h[4]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "fmla v22.8h, v6.8h, v3.h[4]\n" + "ldr q6, [x15, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "fmla v23.8h, v7.8h, v3.h[4]\n" + "ldr q7, [x15, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "fmla v20.8h, v6.8h, v3.h[5]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "fmla v21.8h, v7.8h, v3.h[5]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "fmla v22.8h, v6.8h, v3.h[5]\n" + "ldr q6, [x15, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "fmla v23.8h, v7.8h, v3.h[5]\n" + "ldr q7, [x15, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "fmla v20.8h, v6.8h, v3.h[6]\n" + "ldr q6, [x15, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "fmla v21.8h, v7.8h, v3.h[6]\n" + "ldr q7, [x15, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "fmla v22.8h, v6.8h, v3.h[6]\n" + "ldr q6, [x15, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "fmla v23.8h, v7.8h, v3.h[6]\n" + "ldr q7, [x15, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "fmla v20.8h, v6.8h, v3.h[7]\n" + "ldr q6, [x15, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "fmla v21.8h, v7.8h, v3.h[7]\n" + "ldr q7, [x15, #0x1f0]\n" + "add x15, x15, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "fmla v23.8h, v7.8h, v3.h[7]\n" + "bge 177b\n" + "178:" // Height 4: Multiply loop: Single iteration only + "sub x11, x11, #0x8\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "add x28, x28, #0x10\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + 
"fmla v13.8h, v7.8h, v1.h[0]\n" + "add x24, x24, #0x10\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "fmla v20.8h, v6.8h, v3.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "fmla v21.8h, v7.8h, v3.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "fmla v22.8h, v6.8h, v3.h[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "fmla v23.8h, v7.8h, v3.h[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "fmla v20.8h, v6.8h, v3.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "fmla v21.8h, v7.8h, v3.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "fmla v22.8h, v6.8h, v3.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "fmla v23.8h, v7.8h, v3.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "fmla v20.8h, v6.8h, v3.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "fmla v21.8h, v7.8h, v3.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "fmla v22.8h, v6.8h, v3.h[3]\n" + "ldr q6, [x15, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "fmla v23.8h, v7.8h, v3.h[3]\n" + "ldr q7, [x15, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "fmla v20.8h, v6.8h, v3.h[4]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "fmla v21.8h, v7.8h, v3.h[4]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "fmla v22.8h, v6.8h, v3.h[4]\n" + "ldr q6, [x15, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "fmla v23.8h, v7.8h, v3.h[4]\n" + "ldr q7, [x15, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "fmla v20.8h, v6.8h, v3.h[5]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "fmla v21.8h, v7.8h, v3.h[5]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + 
"fmla v22.8h, v6.8h, v3.h[5]\n" + "ldr q6, [x15, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "fmla v23.8h, v7.8h, v3.h[5]\n" + "ldr q7, [x15, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "fmla v20.8h, v6.8h, v3.h[6]\n" + "ldr q6, [x15, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "fmla v21.8h, v7.8h, v3.h[6]\n" + "ldr q7, [x15, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "fmla v22.8h, v6.8h, v3.h[6]\n" + "ldr q6, [x15, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "fmla v23.8h, v7.8h, v3.h[6]\n" + "ldr q7, [x15, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "fmla v20.8h, v6.8h, v3.h[7]\n" + "ldr q6, [x15, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "fmla v21.8h, v7.8h, v3.h[7]\n" + "ldr q7, [x15, #0x1f0]\n" + "add x15, x15, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "fmla v23.8h, v7.8h, v3.h[7]\n" + "179:" // Height 4: Multiply loop: Main loop skip + "cbz x11, 181f\n" + "180:" // Height 4: Multiply loop: Odd block loop + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "sub x11, x11, #0x1\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "cbnz x11, 180b\n" + "181:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 174b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbz %x[flags], #1, 182f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.8h }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.8h }, [x19]\n" + "fmin v8.8h, v8.8h, v0.8h\n" + "fmin v9.8h, v9.8h, v0.8h\n" + "fmin v10.8h, v10.8h, v0.8h\n" + "fmin v11.8h, v11.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v0.8h\n" + "fmin v13.8h, v13.8h, v0.8h\n" + "fmin v14.8h, v14.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v0.8h\n" + "fmin v16.8h, v16.8h, v0.8h\n" + "fmin v17.8h, v17.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "fmax v16.8h, v16.8h, v1.8h\n" + "fmax 
v17.8h, v17.8h, v1.8h\n" + "fmin v18.8h, v18.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v0.8h\n" + "fmin v20.8h, v20.8h, v0.8h\n" + "fmax v18.8h, v18.8h, v1.8h\n" + "fmax v19.8h, v19.8h, v1.8h\n" + "fmax v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v0.8h\n" + "fmin v22.8h, v22.8h, v0.8h\n" + "fmin v23.8h, v23.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v1.8h\n" + "fmax v22.8h, v22.8h, v1.8h\n" + "fmax v23.8h, v23.8h, v1.8h\n" + "182:" // Height 4: No activation + "cmp x16, #0x20\n" + "bge 199f\n" + "tbz x16, #4, 190f\n" + "st1 { v8.8h }, [x13], #0x10\n" + "st1 { v9.8h }, [x13], #0x10\n" + "st1 { v12.8h }, [x9], #0x10\n" + "st1 { v13.8h }, [x9], #0x10\n" + "st1 { v16.8h }, [x27], #0x10\n" + "st1 { v17.8h }, [x27], #0x10\n" + "st1 { v20.8h }, [x25], #0x10\n" + "st1 { v21.8h }, [x25], #0x10\n" + "tbz x16, #3, 186f\n" + "st1 { v10.8h }, [x13], #0x10\n" + "st1 { v14.8h }, [x9], #0x10\n" + "st1 { v18.8h }, [x27], #0x10\n" + "st1 { v22.8h }, [x25], #0x10\n" + "tbz x16, #2, 184f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "tbz x16, #1, 183f\n" + "st1 { v11.s }[2], [x13], #0x4\n" + "st1 { v15.s }[2], [x9], #0x4\n" + "st1 { v19.s }[2], [x27], #0x4\n" + "st1 { v23.s }[2], [x25], #0x4\n" + "tbz x16, #0, 198f\n" + "st1 { v11.h }[6], [x13]\n" + "st1 { v15.h }[6], [x9]\n" + "st1 { v19.h }[6], [x27]\n" + "st1 { v23.h }[6], [x25]\n" + "b 198f\n" + "183:" // Height 4: Partial direct writeback: partial_1_28 + "tbz x16, #0, 198f\n" + "st1 { v11.h }[4], [x13]\n" + "st1 { v15.h }[4], [x9]\n" + "st1 { v19.h }[4], [x27]\n" + "st1 { v23.h }[4], [x25]\n" + "b 198f\n" + "184:" // Height 4: Partial direct writeback: partial_2_24 + "tbz x16, #1, 185f\n" + "str s11, [x13], #0x4\n" + "str s15, [x9], #0x4\n" + "str s19, [x27], #0x4\n" + "str s23, [x25], #0x4\n" + "tbz x16, #0, 198f\n" + "st1 { v11.h }[2], [x13]\n" + "st1 { v15.h }[2], [x9]\n" + "st1 { v19.h }[2], [x27]\n" + "st1 { v23.h }[2], [x25]\n" + "b 198f\n" + "185:" // Height 4: Partial direct writeback: partial_1_24 + "tbz x16, #0, 198f\n" + "str h11, [x13, #0x0]\n" + "str h15, [x9, #0x0]\n" + "str h19, [x27, #0x0]\n" + "str h23, [x25, #0x0]\n" + "b 198f\n" + "186:" // Height 4: Partial direct writeback: partial_4_16 + "tbz x16, #2, 188f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "tbz x16, #1, 187f\n" + "st1 { v10.s }[2], [x13], #0x4\n" + "st1 { v14.s }[2], [x9], #0x4\n" + "st1 { v18.s }[2], [x27], #0x4\n" + "st1 { v22.s }[2], [x25], #0x4\n" + "tbz x16, #0, 198f\n" + "st1 { v10.h }[6], [x13]\n" + "st1 { v14.h }[6], [x9]\n" + "st1 { v18.h }[6], [x27]\n" + "st1 { v22.h }[6], [x25]\n" + "b 198f\n" + "187:" // Height 4: Partial direct writeback: partial_1_20 + "tbz x16, #0, 198f\n" + "st1 { v10.h }[4], [x13]\n" + "st1 { v14.h }[4], [x9]\n" + "st1 { v18.h }[4], [x27]\n" + "st1 { v22.h }[4], [x25]\n" + "b 198f\n" + "188:" // Height 4: Partial direct writeback: partial_2_16 + "tbz x16, #1, 189f\n" + "str s10, [x13], #0x4\n" + "str s14, [x9], #0x4\n" + "str s18, [x27], #0x4\n" + "str s22, [x25], #0x4\n" + "tbz x16, #0, 198f\n" + "st1 { v10.h }[2], [x13]\n" + "st1 { v14.h }[2], [x9]\n" + "st1 { v18.h }[2], [x27]\n" + "st1 { v22.h }[2], [x25]\n" + "b 198f\n" + "189:" // Height 4: Partial direct writeback: partial_1_16 + "tbz x16, #0, 198f\n" + "str h10, [x13, #0x0]\n" + "str h14, [x9, #0x0]\n" + "str h18, [x27, #0x0]\n" + "str h22, [x25, #0x0]\n" + "b 198f\n" + "190:" // Height 4: Partial direct writeback: partial_8_0 + "tbz x16, #3, 194f\n" + 
"st1 { v8.8h }, [x13], #0x10\n" + "st1 { v12.8h }, [x9], #0x10\n" + "st1 { v16.8h }, [x27], #0x10\n" + "st1 { v20.8h }, [x25], #0x10\n" + "tbz x16, #2, 192f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "tbz x16, #1, 191f\n" + "st1 { v9.s }[2], [x13], #0x4\n" + "st1 { v13.s }[2], [x9], #0x4\n" + "st1 { v17.s }[2], [x27], #0x4\n" + "st1 { v21.s }[2], [x25], #0x4\n" + "tbz x16, #0, 198f\n" + "st1 { v9.h }[6], [x13]\n" + "st1 { v13.h }[6], [x9]\n" + "st1 { v17.h }[6], [x27]\n" + "st1 { v21.h }[6], [x25]\n" + "b 198f\n" + "191:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x16, #0, 198f\n" + "st1 { v9.h }[4], [x13]\n" + "st1 { v13.h }[4], [x9]\n" + "st1 { v17.h }[4], [x27]\n" + "st1 { v21.h }[4], [x25]\n" + "b 198f\n" + "192:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x16, #1, 193f\n" + "str s9, [x13], #0x4\n" + "str s13, [x9], #0x4\n" + "str s17, [x27], #0x4\n" + "str s21, [x25], #0x4\n" + "tbz x16, #0, 198f\n" + "st1 { v9.h }[2], [x13]\n" + "st1 { v13.h }[2], [x9]\n" + "st1 { v17.h }[2], [x27]\n" + "st1 { v21.h }[2], [x25]\n" + "b 198f\n" + "193:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x16, #0, 198f\n" + "str h9, [x13, #0x0]\n" + "str h13, [x9, #0x0]\n" + "str h17, [x27, #0x0]\n" + "str h21, [x25, #0x0]\n" + "b 198f\n" + "194:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x16, #2, 196f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "tbz x16, #1, 195f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], [x9], #0x4\n" + "st1 { v16.s }[2], [x27], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "tbz x16, #0, 198f\n" + "st1 { v8.h }[6], [x13]\n" + "st1 { v12.h }[6], [x9]\n" + "st1 { v16.h }[6], [x27]\n" + "st1 { v20.h }[6], [x25]\n" + "b 198f\n" + "195:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x16, #0, 198f\n" + "st1 { v8.h }[4], [x13]\n" + "st1 { v12.h }[4], [x9]\n" + "st1 { v16.h }[4], [x27]\n" + "st1 { v20.h }[4], [x25]\n" + "b 198f\n" + "196:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x16, #1, 197f\n" + "str s8, [x13], #0x4\n" + "str s12, [x9], #0x4\n" + "str s16, [x27], #0x4\n" + "str s20, [x25], #0x4\n" + "tbz x16, #0, 198f\n" + "st1 { v8.h }[2], [x13]\n" + "st1 { v12.h }[2], [x9]\n" + "st1 { v16.h }[2], [x27]\n" + "st1 { v20.h }[2], [x25]\n" + "b 198f\n" + "197:" // Height 4: Partial direct writeback: partial_1_0 + "str h8, [x13, #0x0]\n" + "str h12, [x9, #0x0]\n" + "str h16, [x27, #0x0]\n" + "str h20, [x25, #0x0]\n" + "198:" // Height 4: Partial direct writeback: Done + "b 200f\n" + "199:" // Height 4: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "200:" // Height 4: Writeback done + "subs x16, x16, #0x20\n" + "bgt 153b\n" + "b 302f\n" + "201:" // Height 5 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 202f\n" + 
"ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #1\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #1\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #1\n" + "add x25, x25, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" + "b 203f\n" + "202:" // Height 5: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #1\n" + "add x27, x9, x19, LSL #1\n" + "add x25, x27, x19, LSL #1\n" + "add x23, x25, x19, LSL #1\n" + "203:" // Height 5: Column loop + "cbz x14, 204f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "mov v16.16b, v8.16b\n" + "ldr q10, [x14, #0x20]\n" + "mov v20.16b, v8.16b\n" + "ldr q11, [x14, #0x30]\n" + "mov v24.16b, v8.16b\n" + "add x14, x14, #0x40\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" + "mov v25.16b, v9.16b\n" + "mov v26.16b, v10.16b\n" + "mov v27.16b, v11.16b\n" + "b 223f\n" + "204:" // Height 5: no bias + "tbz %x[flags], #0, 222f\n" + "cmp x16, #0x20\n" + "bge 221f\n" + "tbz x16, #4, 212f\n" + "ld1 { v8.8h }, [x13], #0x10\n" + "ld1 { v12.8h }, [x9], #0x10\n" + "ld1 { v16.8h }, [x27], #0x10\n" + "ld1 { v20.8h }, [x25], #0x10\n" + "ld1 { v24.8h }, [x23], #0x10\n" + "ld1 { v9.8h }, [x13], #0x10\n" + "ld1 { v13.8h }, [x9], #0x10\n" + "ld1 { v17.8h }, [x27], #0x10\n" + "ld1 { v21.8h }, [x25], #0x10\n" + "ld1 { v25.8h }, [x23], #0x10\n" + "tbz x16, #3, 208f\n" + "ld1 { v10.8h }, [x13], #0x10\n" + "ld1 { v14.8h }, [x9], #0x10\n" + "ld1 { v18.8h }, [x27], #0x10\n" + "ld1 { v22.8h }, [x25], #0x10\n" + "ld1 { v26.8h }, [x23], #0x10\n" + "tbz x16, #2, 206f\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "ldr d27, [x23], #0x8\n" + "tbz x16, #1, 205f\n" + "ld1 { v11.s }[2], [x13], #0x4\n" + "ld1 { v15.s }[2], [x9], #0x4\n" + "ld1 { v19.s }[2], [x27], #0x4\n" + "ld1 { v23.s }[2], [x25], #0x4\n" + "ld1 { v27.s }[2], [x23], #0x4\n" + "mov x19, #0x3c\n" + "tbz x16, #0, 220f\n" + "ld1 { v11.h }[6], [x13]\n" + "ld1 { v15.h }[6], [x9]\n" + "ld1 { v19.h }[6], [x27]\n" + "ld1 { v23.h }[6], [x25]\n" + "ld1 { v27.h }[6], [x23]\n" + "b 220f\n" + "205:" // Height 5: Partial accumulate: partial_1_28 + "mov x19, #0x38\n" + "tbz x16, #0, 220f\n" + "ld1 { v11.h }[4], [x13]\n" + "ld1 { v15.h }[4], [x9]\n" + "ld1 { v19.h }[4], [x27]\n" + "ld1 { v23.h }[4], [x25]\n" + "ld1 { v27.h }[4], [x23]\n" + "b 220f\n" + "206:" // Height 5: Partial accumulate: partial_2_24 + "tbz x16, #1, 207f\n" + "ldr s11, [x13], #0x4\n" + "ldr s15, [x9], #0x4\n" + "ldr s19, [x27], #0x4\n" + "ldr s23, [x25], #0x4\n" + "ldr s27, [x23], #0x4\n" + "mov x19, #0x34\n" + "tbz x16, #0, 220f\n" + "ld1 { v11.h }[2], [x13]\n" + "ld1 { v15.h }[2], [x9]\n" + "ld1 { v19.h }[2], [x27]\n" + "ld1 { v23.h }[2], [x25]\n" + "ld1 { v27.h }[2], [x23]\n" + "b 220f\n" + "207:" // Height 5: Partial accumulate: partial_1_24 + "mov x19, #0x30\n" + "tbz x16, #0, 220f\n" + "ldr h11, [x13, #0x0]\n" + "ldr h15, [x9, #0x0]\n" + "ldr h19, [x27, #0x0]\n" + "ldr h23, [x25, #0x0]\n" + "ldr h27, [x23, #0x0]\n" + "b 220f\n" + "208:" // Height 5: Partial accumulate: partial_4_16 + "tbz x16, #2, 210f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "ldr d26, [x23], 
#0x8\n" + "tbz x16, #1, 209f\n" + "ld1 { v10.s }[2], [x13], #0x4\n" + "ld1 { v14.s }[2], [x9], #0x4\n" + "ld1 { v18.s }[2], [x27], #0x4\n" + "ld1 { v22.s }[2], [x25], #0x4\n" + "ld1 { v26.s }[2], [x23], #0x4\n" + "mov x19, #0x2c\n" + "tbz x16, #0, 220f\n" + "ld1 { v10.h }[6], [x13]\n" + "ld1 { v14.h }[6], [x9]\n" + "ld1 { v18.h }[6], [x27]\n" + "ld1 { v22.h }[6], [x25]\n" + "ld1 { v26.h }[6], [x23]\n" + "b 220f\n" + "209:" // Height 5: Partial accumulate: partial_1_20 + "mov x19, #0x28\n" + "tbz x16, #0, 220f\n" + "ld1 { v10.h }[4], [x13]\n" + "ld1 { v14.h }[4], [x9]\n" + "ld1 { v18.h }[4], [x27]\n" + "ld1 { v22.h }[4], [x25]\n" + "ld1 { v26.h }[4], [x23]\n" + "b 220f\n" + "210:" // Height 5: Partial accumulate: partial_2_16 + "tbz x16, #1, 211f\n" + "ldr s10, [x13], #0x4\n" + "ldr s14, [x9], #0x4\n" + "ldr s18, [x27], #0x4\n" + "ldr s22, [x25], #0x4\n" + "ldr s26, [x23], #0x4\n" + "mov x19, #0x24\n" + "tbz x16, #0, 220f\n" + "ld1 { v10.h }[2], [x13]\n" + "ld1 { v14.h }[2], [x9]\n" + "ld1 { v18.h }[2], [x27]\n" + "ld1 { v22.h }[2], [x25]\n" + "ld1 { v26.h }[2], [x23]\n" + "b 220f\n" + "211:" // Height 5: Partial accumulate: partial_1_16 + "mov x19, #0x20\n" + "tbz x16, #0, 220f\n" + "ldr h10, [x13, #0x0]\n" + "ldr h14, [x9, #0x0]\n" + "ldr h18, [x27, #0x0]\n" + "ldr h22, [x25, #0x0]\n" + "ldr h26, [x23, #0x0]\n" + "b 220f\n" + "212:" // Height 5: Partial accumulate: partial_8_0 + "tbz x16, #3, 216f\n" + "ld1 { v8.8h }, [x13], #0x10\n" + "ld1 { v12.8h }, [x9], #0x10\n" + "ld1 { v16.8h }, [x27], #0x10\n" + "ld1 { v20.8h }, [x25], #0x10\n" + "ld1 { v24.8h }, [x23], #0x10\n" + "tbz x16, #2, 214f\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "ldr d25, [x23], #0x8\n" + "tbz x16, #1, 213f\n" + "ld1 { v9.s }[2], [x13], #0x4\n" + "ld1 { v13.s }[2], [x9], #0x4\n" + "ld1 { v17.s }[2], [x27], #0x4\n" + "ld1 { v21.s }[2], [x25], #0x4\n" + "ld1 { v25.s }[2], [x23], #0x4\n" + "mov x19, #0x1c\n" + "tbz x16, #0, 220f\n" + "ld1 { v9.h }[6], [x13]\n" + "ld1 { v13.h }[6], [x9]\n" + "ld1 { v17.h }[6], [x27]\n" + "ld1 { v21.h }[6], [x25]\n" + "ld1 { v25.h }[6], [x23]\n" + "b 220f\n" + "213:" // Height 5: Partial accumulate: partial_1_12 + "mov x19, #0x18\n" + "tbz x16, #0, 220f\n" + "ld1 { v9.h }[4], [x13]\n" + "ld1 { v13.h }[4], [x9]\n" + "ld1 { v17.h }[4], [x27]\n" + "ld1 { v21.h }[4], [x25]\n" + "ld1 { v25.h }[4], [x23]\n" + "b 220f\n" + "214:" // Height 5: Partial accumulate: partial_2_8 + "tbz x16, #1, 215f\n" + "ldr s9, [x13], #0x4\n" + "ldr s13, [x9], #0x4\n" + "ldr s17, [x27], #0x4\n" + "ldr s21, [x25], #0x4\n" + "ldr s25, [x23], #0x4\n" + "mov x19, #0x14\n" + "tbz x16, #0, 220f\n" + "ld1 { v9.h }[2], [x13]\n" + "ld1 { v13.h }[2], [x9]\n" + "ld1 { v17.h }[2], [x27]\n" + "ld1 { v21.h }[2], [x25]\n" + "ld1 { v25.h }[2], [x23]\n" + "b 220f\n" + "215:" // Height 5: Partial accumulate: partial_1_8 + "mov x19, #0x10\n" + "tbz x16, #0, 220f\n" + "ldr h9, [x13, #0x0]\n" + "ldr h13, [x9, #0x0]\n" + "ldr h17, [x27, #0x0]\n" + "ldr h21, [x25, #0x0]\n" + "ldr h25, [x23, #0x0]\n" + "b 220f\n" + "216:" // Height 5: Partial accumulate: partial_4_0 + "tbz x16, #2, 218f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "ldr d24, [x23], #0x8\n" + "tbz x16, #1, 217f\n" + "ld1 { v8.s }[2], [x13], #0x4\n" + "ld1 { v12.s }[2], [x9], #0x4\n" + "ld1 { v16.s }[2], [x27], #0x4\n" + "ld1 { v20.s }[2], [x25], #0x4\n" + "ld1 { v24.s }[2], [x23], #0x4\n" + "mov x19, #0xc\n" + "tbz x16, #0, 220f\n" + "ld1 { 
v8.h }[6], [x13]\n" + "ld1 { v12.h }[6], [x9]\n" + "ld1 { v16.h }[6], [x27]\n" + "ld1 { v20.h }[6], [x25]\n" + "ld1 { v24.h }[6], [x23]\n" + "b 220f\n" + "217:" // Height 5: Partial accumulate: partial_1_4 + "mov x19, #0x8\n" + "tbz x16, #0, 220f\n" + "ld1 { v8.h }[4], [x13]\n" + "ld1 { v12.h }[4], [x9]\n" + "ld1 { v16.h }[4], [x27]\n" + "ld1 { v20.h }[4], [x25]\n" + "ld1 { v24.h }[4], [x23]\n" + "b 220f\n" + "218:" // Height 5: Partial accumulate: partial_2_0 + "tbz x16, #1, 219f\n" + "ldr s8, [x13], #0x4\n" + "ldr s12, [x9], #0x4\n" + "ldr s16, [x27], #0x4\n" + "ldr s20, [x25], #0x4\n" + "ldr s24, [x23], #0x4\n" + "mov x19, #0x4\n" + "tbz x16, #0, 220f\n" + "ld1 { v8.h }[2], [x13]\n" + "ld1 { v12.h }[2], [x9]\n" + "ld1 { v16.h }[2], [x27]\n" + "ld1 { v20.h }[2], [x25]\n" + "ld1 { v24.h }[2], [x23]\n" + "b 220f\n" + "219:" // Height 5: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr h8, [x13, #0x0]\n" + "ldr h12, [x9, #0x0]\n" + "ldr h16, [x27, #0x0]\n" + "ldr h20, [x25, #0x0]\n" + "ldr h24, [x23, #0x0]\n" + "220:" // Height 5: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "sub x23, x23, x19\n" + "b 223f\n" + "221:" // Height 5: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "ldr q24, [x23, #0x0]\n" + "ldr q25, [x23, #0x10]\n" + "ldr q26, [x23, #0x20]\n" + "ldr q27, [x23, #0x30]\n" + "b 223f\n" + "222:" // Height 5: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "223:" // Height 5: setup done + "mov x12, #0x0\n" + "224:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 225f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x12, 226f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "b 226f\n" + "225:" // Height 5: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "add x22, x24, x19, LSL #1\n" + "226:" // Height 5: input setup done + "cmp x11, #0x8\n" + "blt 229f\n" + "cmp x11, #0x10\n" + "blt 228f\n" + "227:" // Height 5: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr 
q4, [x22, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "add x28, x28, #0x10\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "add x22, x22, #0x10\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sub x11, x11, #0x8\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "cmp x11, #0x10\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "fmla v26.8h, v6.8h, v4.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v27.8h, v7.8h, v4.h[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "fmla v20.8h, v6.8h, v3.h[1]\n" + "fmla v24.8h, v6.8h, v4.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "fmla v21.8h, v7.8h, v3.h[1]\n" + "fmla v25.8h, v7.8h, v4.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "fmla v22.8h, v6.8h, v3.h[1]\n" + "fmla v26.8h, v6.8h, v4.h[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "fmla v23.8h, v7.8h, v3.h[1]\n" + "fmla v27.8h, v7.8h, v4.h[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "fmla v20.8h, v6.8h, v3.h[2]\n" + "fmla v24.8h, v6.8h, v4.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "fmla v21.8h, v7.8h, v3.h[2]\n" + "fmla v25.8h, v7.8h, v4.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "fmla v22.8h, v6.8h, v3.h[2]\n" + "fmla v26.8h, v6.8h, v4.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "fmla v23.8h, v7.8h, v3.h[2]\n" + "fmla v27.8h, v7.8h, v4.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "fmla v20.8h, v6.8h, v3.h[3]\n" + "fmla v24.8h, v6.8h, v4.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "fmla v21.8h, v7.8h, v3.h[3]\n" + "fmla v25.8h, v7.8h, v4.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "fmla v22.8h, v6.8h, v3.h[3]\n" + "fmla v26.8h, v6.8h, v4.h[3]\n" + "ldr q6, [x15, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "fmla v23.8h, v7.8h, v3.h[3]\n" + "fmla v27.8h, v7.8h, v4.h[3]\n" + "ldr q7, 
[x15, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "fmla v20.8h, v6.8h, v3.h[4]\n" + "fmla v24.8h, v6.8h, v4.h[4]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "fmla v21.8h, v7.8h, v3.h[4]\n" + "fmla v25.8h, v7.8h, v4.h[4]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "fmla v22.8h, v6.8h, v3.h[4]\n" + "fmla v26.8h, v6.8h, v4.h[4]\n" + "ldr q6, [x15, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "fmla v23.8h, v7.8h, v3.h[4]\n" + "fmla v27.8h, v7.8h, v4.h[4]\n" + "ldr q7, [x15, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "fmla v20.8h, v6.8h, v3.h[5]\n" + "fmla v24.8h, v6.8h, v4.h[5]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "fmla v21.8h, v7.8h, v3.h[5]\n" + "fmla v25.8h, v7.8h, v4.h[5]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "fmla v22.8h, v6.8h, v3.h[5]\n" + "fmla v26.8h, v6.8h, v4.h[5]\n" + "ldr q6, [x15, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "fmla v23.8h, v7.8h, v3.h[5]\n" + "fmla v27.8h, v7.8h, v4.h[5]\n" + "ldr q7, [x15, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "fmla v20.8h, v6.8h, v3.h[6]\n" + "fmla v24.8h, v6.8h, v4.h[6]\n" + "ldr q6, [x15, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "fmla v21.8h, v7.8h, v3.h[6]\n" + "fmla v25.8h, v7.8h, v4.h[6]\n" + "ldr q7, [x15, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "fmla v22.8h, v6.8h, v3.h[6]\n" + "fmla v26.8h, v6.8h, v4.h[6]\n" + "ldr q6, [x15, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "fmla v23.8h, v7.8h, v3.h[6]\n" + "fmla v27.8h, v7.8h, v4.h[6]\n" + "ldr q7, [x15, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "fmla v20.8h, v6.8h, v3.h[7]\n" + "fmla v24.8h, v6.8h, v4.h[7]\n" + "ldr q6, [x15, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "fmla v21.8h, v7.8h, v3.h[7]\n" + "fmla v25.8h, v7.8h, v4.h[7]\n" + "ldr q7, [x15, #0x1f0]\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "add x15, x15, #0x200\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v26.8h, v6.8h, v4.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "fmla v23.8h, v7.8h, v3.h[7]\n" + "fmla v27.8h, v7.8h, v4.h[7]\n" + "bge 227b\n" + "228:" // Height 5: Multiply loop: Single iteration only + "sub x11, x11, #0x8\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, 
#0x80]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "add x28, x28, #0x10\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "add x22, x22, #0x10\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "fmla v26.8h, v6.8h, v4.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v27.8h, v7.8h, v4.h[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "fmla v20.8h, v6.8h, v3.h[1]\n" + "fmla v24.8h, v6.8h, v4.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "fmla v21.8h, v7.8h, v3.h[1]\n" + "fmla v25.8h, v7.8h, v4.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "fmla v22.8h, v6.8h, v3.h[1]\n" + "fmla v26.8h, v6.8h, v4.h[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "fmla v23.8h, v7.8h, v3.h[1]\n" + "fmla v27.8h, v7.8h, v4.h[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "fmla v20.8h, v6.8h, v3.h[2]\n" + "fmla v24.8h, v6.8h, v4.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "fmla v21.8h, v7.8h, v3.h[2]\n" + "fmla v25.8h, v7.8h, v4.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "fmla v22.8h, v6.8h, v3.h[2]\n" + "fmla v26.8h, v6.8h, v4.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "fmla v23.8h, v7.8h, v3.h[2]\n" + "fmla v27.8h, v7.8h, v4.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "fmla v20.8h, v6.8h, v3.h[3]\n" + "fmla v24.8h, v6.8h, v4.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "fmla v21.8h, v7.8h, v3.h[3]\n" + "fmla v25.8h, v7.8h, v4.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "fmla v22.8h, v6.8h, v3.h[3]\n" + "fmla v26.8h, v6.8h, v4.h[3]\n" + "ldr q6, [x15, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "fmla v23.8h, v7.8h, v3.h[3]\n" + "fmla v27.8h, v7.8h, v4.h[3]\n" + "ldr q7, [x15, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "fmla v20.8h, v6.8h, v3.h[4]\n" + "fmla v24.8h, v6.8h, v4.h[4]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v9.8h, v7.8h, 
v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "fmla v21.8h, v7.8h, v3.h[4]\n" + "fmla v25.8h, v7.8h, v4.h[4]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "fmla v22.8h, v6.8h, v3.h[4]\n" + "fmla v26.8h, v6.8h, v4.h[4]\n" + "ldr q6, [x15, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "fmla v23.8h, v7.8h, v3.h[4]\n" + "fmla v27.8h, v7.8h, v4.h[4]\n" + "ldr q7, [x15, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "fmla v20.8h, v6.8h, v3.h[5]\n" + "fmla v24.8h, v6.8h, v4.h[5]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "fmla v21.8h, v7.8h, v3.h[5]\n" + "fmla v25.8h, v7.8h, v4.h[5]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "fmla v22.8h, v6.8h, v3.h[5]\n" + "fmla v26.8h, v6.8h, v4.h[5]\n" + "ldr q6, [x15, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "fmla v23.8h, v7.8h, v3.h[5]\n" + "fmla v27.8h, v7.8h, v4.h[5]\n" + "ldr q7, [x15, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "fmla v20.8h, v6.8h, v3.h[6]\n" + "fmla v24.8h, v6.8h, v4.h[6]\n" + "ldr q6, [x15, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "fmla v21.8h, v7.8h, v3.h[6]\n" + "fmla v25.8h, v7.8h, v4.h[6]\n" + "ldr q7, [x15, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "fmla v22.8h, v6.8h, v3.h[6]\n" + "fmla v26.8h, v6.8h, v4.h[6]\n" + "ldr q6, [x15, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "fmla v23.8h, v7.8h, v3.h[6]\n" + "fmla v27.8h, v7.8h, v4.h[6]\n" + "ldr q7, [x15, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "fmla v20.8h, v6.8h, v3.h[7]\n" + "fmla v24.8h, v6.8h, v4.h[7]\n" + "ldr q6, [x15, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "fmla v21.8h, v7.8h, v3.h[7]\n" + "fmla v25.8h, v7.8h, v4.h[7]\n" + "ldr q7, [x15, #0x1f0]\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "add x15, x15, #0x200\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v26.8h, v6.8h, v4.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "fmla v23.8h, v7.8h, v3.h[7]\n" + "fmla v27.8h, v7.8h, v4.h[7]\n" + "229:" // Height 5: Multiply loop: Main loop skip + "cbz x11, 231f\n" + "230:" // Height 5: Multiply loop: Odd block loop + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "sub x11, x11, #0x1\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "fmla 
v21.8h, v7.8h, v3.h[0]\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "fmla v26.8h, v6.8h, v4.h[0]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v27.8h, v7.8h, v4.h[0]\n" + "cbnz x11, 230b\n" + "231:" // Height 5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 224b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 232f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.8h }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.8h }, [x19]\n" + "fmin v8.8h, v8.8h, v0.8h\n" + "fmin v9.8h, v9.8h, v0.8h\n" + "fmin v10.8h, v10.8h, v0.8h\n" + "fmin v11.8h, v11.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v0.8h\n" + "fmin v13.8h, v13.8h, v0.8h\n" + "fmin v14.8h, v14.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v0.8h\n" + "fmin v16.8h, v16.8h, v0.8h\n" + "fmin v17.8h, v17.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "fmax v16.8h, v16.8h, v1.8h\n" + "fmax v17.8h, v17.8h, v1.8h\n" + "fmin v18.8h, v18.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v0.8h\n" + "fmin v20.8h, v20.8h, v0.8h\n" + "fmax v18.8h, v18.8h, v1.8h\n" + "fmax v19.8h, v19.8h, v1.8h\n" + "fmax v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v0.8h\n" + "fmin v22.8h, v22.8h, v0.8h\n" + "fmin v23.8h, v23.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v1.8h\n" + "fmax v22.8h, v22.8h, v1.8h\n" + "fmax v23.8h, v23.8h, v1.8h\n" + "fmin v24.8h, v24.8h, v0.8h\n" + "fmin v25.8h, v25.8h, v0.8h\n" + "fmin v26.8h, v26.8h, v0.8h\n" + "fmax v24.8h, v24.8h, v1.8h\n" + "fmax v25.8h, v25.8h, v1.8h\n" + "fmax v26.8h, v26.8h, v1.8h\n" + "fmin v27.8h, v27.8h, v0.8h\n" + "fmax v27.8h, v27.8h, v1.8h\n" + "232:" // Height 5: No activation + "cmp x16, #0x20\n" + "bge 249f\n" + "tbz x16, #4, 240f\n" + "st1 { v8.8h }, [x13], #0x10\n" + "st1 { v9.8h }, [x13], #0x10\n" + "st1 { v12.8h }, [x9], #0x10\n" + "st1 { v13.8h }, [x9], #0x10\n" + "st1 { v16.8h }, [x27], #0x10\n" + "st1 { v17.8h }, [x27], #0x10\n" + "st1 { v20.8h }, [x25], #0x10\n" + "st1 { v21.8h }, [x25], #0x10\n" + "st1 { v24.8h }, [x23], #0x10\n" + "st1 { v25.8h }, [x23], #0x10\n" + "tbz x16, #3, 236f\n" + "st1 { v10.8h }, [x13], #0x10\n" + "st1 { v14.8h }, [x9], #0x10\n" + "st1 { v18.8h }, [x27], #0x10\n" + "st1 { v22.8h }, [x25], #0x10\n" + "st1 { v26.8h }, [x23], #0x10\n" + "tbz x16, #2, 234f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "str d27, [x23], #0x8\n" + "tbz x16, #1, 233f\n" + "st1 { v11.s }[2], [x13], #0x4\n" + "st1 { v15.s }[2], [x9], #0x4\n" + "st1 { v19.s }[2], [x27], #0x4\n" + "st1 { v23.s }[2], [x25], #0x4\n" + "st1 { v27.s }[2], [x23], #0x4\n" + "tbz x16, #0, 248f\n" + "st1 { v11.h }[6], [x13]\n" + "st1 { v15.h }[6], [x9]\n" + "st1 { v19.h }[6], [x27]\n" + "st1 { v23.h }[6], [x25]\n" + "st1 { v27.h }[6], [x23]\n" + "b 248f\n" + "233:" // Height 5: Partial direct writeback: partial_1_28 + "tbz 
x16, #0, 248f\n" + "st1 { v11.h }[4], [x13]\n" + "st1 { v15.h }[4], [x9]\n" + "st1 { v19.h }[4], [x27]\n" + "st1 { v23.h }[4], [x25]\n" + "st1 { v27.h }[4], [x23]\n" + "b 248f\n" + "234:" // Height 5: Partial direct writeback: partial_2_24 + "tbz x16, #1, 235f\n" + "str s11, [x13], #0x4\n" + "str s15, [x9], #0x4\n" + "str s19, [x27], #0x4\n" + "str s23, [x25], #0x4\n" + "str s27, [x23], #0x4\n" + "tbz x16, #0, 248f\n" + "st1 { v11.h }[2], [x13]\n" + "st1 { v15.h }[2], [x9]\n" + "st1 { v19.h }[2], [x27]\n" + "st1 { v23.h }[2], [x25]\n" + "st1 { v27.h }[2], [x23]\n" + "b 248f\n" + "235:" // Height 5: Partial direct writeback: partial_1_24 + "tbz x16, #0, 248f\n" + "str h11, [x13, #0x0]\n" + "str h15, [x9, #0x0]\n" + "str h19, [x27, #0x0]\n" + "str h23, [x25, #0x0]\n" + "str h27, [x23, #0x0]\n" + "b 248f\n" + "236:" // Height 5: Partial direct writeback: partial_4_16 + "tbz x16, #2, 238f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "str d26, [x23], #0x8\n" + "tbz x16, #1, 237f\n" + "st1 { v10.s }[2], [x13], #0x4\n" + "st1 { v14.s }[2], [x9], #0x4\n" + "st1 { v18.s }[2], [x27], #0x4\n" + "st1 { v22.s }[2], [x25], #0x4\n" + "st1 { v26.s }[2], [x23], #0x4\n" + "tbz x16, #0, 248f\n" + "st1 { v10.h }[6], [x13]\n" + "st1 { v14.h }[6], [x9]\n" + "st1 { v18.h }[6], [x27]\n" + "st1 { v22.h }[6], [x25]\n" + "st1 { v26.h }[6], [x23]\n" + "b 248f\n" + "237:" // Height 5: Partial direct writeback: partial_1_20 + "tbz x16, #0, 248f\n" + "st1 { v10.h }[4], [x13]\n" + "st1 { v14.h }[4], [x9]\n" + "st1 { v18.h }[4], [x27]\n" + "st1 { v22.h }[4], [x25]\n" + "st1 { v26.h }[4], [x23]\n" + "b 248f\n" + "238:" // Height 5: Partial direct writeback: partial_2_16 + "tbz x16, #1, 239f\n" + "str s10, [x13], #0x4\n" + "str s14, [x9], #0x4\n" + "str s18, [x27], #0x4\n" + "str s22, [x25], #0x4\n" + "str s26, [x23], #0x4\n" + "tbz x16, #0, 248f\n" + "st1 { v10.h }[2], [x13]\n" + "st1 { v14.h }[2], [x9]\n" + "st1 { v18.h }[2], [x27]\n" + "st1 { v22.h }[2], [x25]\n" + "st1 { v26.h }[2], [x23]\n" + "b 248f\n" + "239:" // Height 5: Partial direct writeback: partial_1_16 + "tbz x16, #0, 248f\n" + "str h10, [x13, #0x0]\n" + "str h14, [x9, #0x0]\n" + "str h18, [x27, #0x0]\n" + "str h22, [x25, #0x0]\n" + "str h26, [x23, #0x0]\n" + "b 248f\n" + "240:" // Height 5: Partial direct writeback: partial_8_0 + "tbz x16, #3, 244f\n" + "st1 { v8.8h }, [x13], #0x10\n" + "st1 { v12.8h }, [x9], #0x10\n" + "st1 { v16.8h }, [x27], #0x10\n" + "st1 { v20.8h }, [x25], #0x10\n" + "st1 { v24.8h }, [x23], #0x10\n" + "tbz x16, #2, 242f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "str d25, [x23], #0x8\n" + "tbz x16, #1, 241f\n" + "st1 { v9.s }[2], [x13], #0x4\n" + "st1 { v13.s }[2], [x9], #0x4\n" + "st1 { v17.s }[2], [x27], #0x4\n" + "st1 { v21.s }[2], [x25], #0x4\n" + "st1 { v25.s }[2], [x23], #0x4\n" + "tbz x16, #0, 248f\n" + "st1 { v9.h }[6], [x13]\n" + "st1 { v13.h }[6], [x9]\n" + "st1 { v17.h }[6], [x27]\n" + "st1 { v21.h }[6], [x25]\n" + "st1 { v25.h }[6], [x23]\n" + "b 248f\n" + "241:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x16, #0, 248f\n" + "st1 { v9.h }[4], [x13]\n" + "st1 { v13.h }[4], [x9]\n" + "st1 { v17.h }[4], [x27]\n" + "st1 { v21.h }[4], [x25]\n" + "st1 { v25.h }[4], [x23]\n" + "b 248f\n" + "242:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x16, #1, 243f\n" + "str s9, [x13], #0x4\n" + "str s13, [x9], #0x4\n" + "str s17, [x27], #0x4\n" + "str s21, [x25], #0x4\n" + "str 
s25, [x23], #0x4\n" + "tbz x16, #0, 248f\n" + "st1 { v9.h }[2], [x13]\n" + "st1 { v13.h }[2], [x9]\n" + "st1 { v17.h }[2], [x27]\n" + "st1 { v21.h }[2], [x25]\n" + "st1 { v25.h }[2], [x23]\n" + "b 248f\n" + "243:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x16, #0, 248f\n" + "str h9, [x13, #0x0]\n" + "str h13, [x9, #0x0]\n" + "str h17, [x27, #0x0]\n" + "str h21, [x25, #0x0]\n" + "str h25, [x23, #0x0]\n" + "b 248f\n" + "244:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x16, #2, 246f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "tbz x16, #1, 245f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], [x9], #0x4\n" + "st1 { v16.s }[2], [x27], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "st1 { v24.s }[2], [x23], #0x4\n" + "tbz x16, #0, 248f\n" + "st1 { v8.h }[6], [x13]\n" + "st1 { v12.h }[6], [x9]\n" + "st1 { v16.h }[6], [x27]\n" + "st1 { v20.h }[6], [x25]\n" + "st1 { v24.h }[6], [x23]\n" + "b 248f\n" + "245:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x16, #0, 248f\n" + "st1 { v8.h }[4], [x13]\n" + "st1 { v12.h }[4], [x9]\n" + "st1 { v16.h }[4], [x27]\n" + "st1 { v20.h }[4], [x25]\n" + "st1 { v24.h }[4], [x23]\n" + "b 248f\n" + "246:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x16, #1, 247f\n" + "str s8, [x13], #0x4\n" + "str s12, [x9], #0x4\n" + "str s16, [x27], #0x4\n" + "str s20, [x25], #0x4\n" + "str s24, [x23], #0x4\n" + "tbz x16, #0, 248f\n" + "st1 { v8.h }[2], [x13]\n" + "st1 { v12.h }[2], [x9]\n" + "st1 { v16.h }[2], [x27]\n" + "st1 { v20.h }[2], [x25]\n" + "st1 { v24.h }[2], [x23]\n" + "b 248f\n" + "247:" // Height 5: Partial direct writeback: partial_1_0 + "str h8, [x13, #0x0]\n" + "str h12, [x9, #0x0]\n" + "str h16, [x27, #0x0]\n" + "str h20, [x25, #0x0]\n" + "str h24, [x23, #0x0]\n" + "248:" // Height 5: Partial direct writeback: Done + "b 250f\n" + "249:" // Height 5: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "str q24, [x23, #0x0]\n" + "str q25, [x23, #0x10]\n" + "str q26, [x23, #0x20]\n" + "str q27, [x23, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "add x23, x23, #0x40\n" + "250:" // Height 5: Writeback done + "subs x16, x16, #0x20\n" + "bgt 203b\n" + "b 302f\n" + "251:" // Height 6 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 252f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #1\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #1\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #1\n" + "ldr x21, [%x[output_ptr], #0x28]\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "add x25, x25, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" + "add x21, x21, x19, LSL #1\n" + "b 253f\n" + "252:" // Height 6: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, 
LSL #1\n" + "add x27, x9, x19, LSL #1\n" + "add x25, x27, x19, LSL #1\n" + "add x23, x25, x19, LSL #1\n" + "add x21, x23, x19, LSL #1\n" + "add %x[output_ptr], x21, x19, LSL #1\n" + "253:" // Height 6: Column loop + "cbz x14, 254f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "mov v16.16b, v8.16b\n" + "ldr q10, [x14, #0x20]\n" + "mov v20.16b, v8.16b\n" + "ldr q11, [x14, #0x30]\n" + "mov v24.16b, v8.16b\n" + "add x14, x14, #0x40\n" + "mov v28.16b, v8.16b\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" + "mov v25.16b, v9.16b\n" + "mov v26.16b, v10.16b\n" + "mov v27.16b, v11.16b\n" + "mov v29.16b, v9.16b\n" + "mov v30.16b, v10.16b\n" + "mov v31.16b, v11.16b\n" + "b 273f\n" + "254:" // Height 6: no bias + "tbz %x[flags], #0, 272f\n" + "cmp x16, #0x20\n" + "bge 271f\n" + "tbz x16, #4, 262f\n" + "ld1 { v8.8h }, [x13], #0x10\n" + "ld1 { v12.8h }, [x9], #0x10\n" + "ld1 { v16.8h }, [x27], #0x10\n" + "ld1 { v20.8h }, [x25], #0x10\n" + "ld1 { v24.8h }, [x23], #0x10\n" + "ld1 { v28.8h }, [x21], #0x10\n" + "ld1 { v9.8h }, [x13], #0x10\n" + "ld1 { v13.8h }, [x9], #0x10\n" + "ld1 { v17.8h }, [x27], #0x10\n" + "ld1 { v21.8h }, [x25], #0x10\n" + "ld1 { v25.8h }, [x23], #0x10\n" + "ld1 { v29.8h }, [x21], #0x10\n" + "tbz x16, #3, 258f\n" + "ld1 { v10.8h }, [x13], #0x10\n" + "ld1 { v14.8h }, [x9], #0x10\n" + "ld1 { v18.8h }, [x27], #0x10\n" + "ld1 { v22.8h }, [x25], #0x10\n" + "ld1 { v26.8h }, [x23], #0x10\n" + "ld1 { v30.8h }, [x21], #0x10\n" + "tbz x16, #2, 256f\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "ldr d27, [x23], #0x8\n" + "ldr d31, [x21], #0x8\n" + "tbz x16, #1, 255f\n" + "ld1 { v11.s }[2], [x13], #0x4\n" + "ld1 { v15.s }[2], [x9], #0x4\n" + "ld1 { v19.s }[2], [x27], #0x4\n" + "ld1 { v23.s }[2], [x25], #0x4\n" + "ld1 { v27.s }[2], [x23], #0x4\n" + "ld1 { v31.s }[2], [x21], #0x4\n" + "mov x19, #0x3c\n" + "tbz x16, #0, 270f\n" + "ld1 { v11.h }[6], [x13]\n" + "ld1 { v15.h }[6], [x9]\n" + "ld1 { v19.h }[6], [x27]\n" + "ld1 { v23.h }[6], [x25]\n" + "ld1 { v27.h }[6], [x23]\n" + "ld1 { v31.h }[6], [x21]\n" + "b 270f\n" + "255:" // Height 6: Partial accumulate: partial_1_28 + "mov x19, #0x38\n" + "tbz x16, #0, 270f\n" + "ld1 { v11.h }[4], [x13]\n" + "ld1 { v15.h }[4], [x9]\n" + "ld1 { v19.h }[4], [x27]\n" + "ld1 { v23.h }[4], [x25]\n" + "ld1 { v27.h }[4], [x23]\n" + "ld1 { v31.h }[4], [x21]\n" + "b 270f\n" + "256:" // Height 6: Partial accumulate: partial_2_24 + "tbz x16, #1, 257f\n" + "ldr s11, [x13], #0x4\n" + "ldr s15, [x9], #0x4\n" + "ldr s19, [x27], #0x4\n" + "ldr s23, [x25], #0x4\n" + "ldr s27, [x23], #0x4\n" + "ldr s31, [x21], #0x4\n" + "mov x19, #0x34\n" + "tbz x16, #0, 270f\n" + "ld1 { v11.h }[2], [x13]\n" + "ld1 { v15.h }[2], [x9]\n" + "ld1 { v19.h }[2], [x27]\n" + "ld1 { v23.h }[2], [x25]\n" + "ld1 { v27.h }[2], [x23]\n" + "ld1 { v31.h }[2], [x21]\n" + "b 270f\n" + "257:" // Height 6: Partial accumulate: partial_1_24 + "mov x19, #0x30\n" + "tbz x16, #0, 270f\n" + "ldr h11, [x13, #0x0]\n" + "ldr h15, [x9, #0x0]\n" + "ldr h19, [x27, #0x0]\n" + "ldr h23, [x25, #0x0]\n" + "ldr h27, [x23, #0x0]\n" + "ldr h31, [x21, #0x0]\n" + "b 270f\n" + "258:" // Height 6: Partial accumulate: partial_4_16 + "tbz x16, #2, 260f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr 
d22, [x25], #0x8\n" + "ldr d26, [x23], #0x8\n" + "ldr d30, [x21], #0x8\n" + "tbz x16, #1, 259f\n" + "ld1 { v10.s }[2], [x13], #0x4\n" + "ld1 { v14.s }[2], [x9], #0x4\n" + "ld1 { v18.s }[2], [x27], #0x4\n" + "ld1 { v22.s }[2], [x25], #0x4\n" + "ld1 { v26.s }[2], [x23], #0x4\n" + "ld1 { v30.s }[2], [x21], #0x4\n" + "mov x19, #0x2c\n" + "tbz x16, #0, 270f\n" + "ld1 { v10.h }[6], [x13]\n" + "ld1 { v14.h }[6], [x9]\n" + "ld1 { v18.h }[6], [x27]\n" + "ld1 { v22.h }[6], [x25]\n" + "ld1 { v26.h }[6], [x23]\n" + "ld1 { v30.h }[6], [x21]\n" + "b 270f\n" + "259:" // Height 6: Partial accumulate: partial_1_20 + "mov x19, #0x28\n" + "tbz x16, #0, 270f\n" + "ld1 { v10.h }[4], [x13]\n" + "ld1 { v14.h }[4], [x9]\n" + "ld1 { v18.h }[4], [x27]\n" + "ld1 { v22.h }[4], [x25]\n" + "ld1 { v26.h }[4], [x23]\n" + "ld1 { v30.h }[4], [x21]\n" + "b 270f\n" + "260:" // Height 6: Partial accumulate: partial_2_16 + "tbz x16, #1, 261f\n" + "ldr s10, [x13], #0x4\n" + "ldr s14, [x9], #0x4\n" + "ldr s18, [x27], #0x4\n" + "ldr s22, [x25], #0x4\n" + "ldr s26, [x23], #0x4\n" + "ldr s30, [x21], #0x4\n" + "mov x19, #0x24\n" + "tbz x16, #0, 270f\n" + "ld1 { v10.h }[2], [x13]\n" + "ld1 { v14.h }[2], [x9]\n" + "ld1 { v18.h }[2], [x27]\n" + "ld1 { v22.h }[2], [x25]\n" + "ld1 { v26.h }[2], [x23]\n" + "ld1 { v30.h }[2], [x21]\n" + "b 270f\n" + "261:" // Height 6: Partial accumulate: partial_1_16 + "mov x19, #0x20\n" + "tbz x16, #0, 270f\n" + "ldr h10, [x13, #0x0]\n" + "ldr h14, [x9, #0x0]\n" + "ldr h18, [x27, #0x0]\n" + "ldr h22, [x25, #0x0]\n" + "ldr h26, [x23, #0x0]\n" + "ldr h30, [x21, #0x0]\n" + "b 270f\n" + "262:" // Height 6: Partial accumulate: partial_8_0 + "tbz x16, #3, 266f\n" + "ld1 { v8.8h }, [x13], #0x10\n" + "ld1 { v12.8h }, [x9], #0x10\n" + "ld1 { v16.8h }, [x27], #0x10\n" + "ld1 { v20.8h }, [x25], #0x10\n" + "ld1 { v24.8h }, [x23], #0x10\n" + "ld1 { v28.8h }, [x21], #0x10\n" + "tbz x16, #2, 264f\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "ldr d25, [x23], #0x8\n" + "ldr d29, [x21], #0x8\n" + "tbz x16, #1, 263f\n" + "ld1 { v9.s }[2], [x13], #0x4\n" + "ld1 { v13.s }[2], [x9], #0x4\n" + "ld1 { v17.s }[2], [x27], #0x4\n" + "ld1 { v21.s }[2], [x25], #0x4\n" + "ld1 { v25.s }[2], [x23], #0x4\n" + "ld1 { v29.s }[2], [x21], #0x4\n" + "mov x19, #0x1c\n" + "tbz x16, #0, 270f\n" + "ld1 { v9.h }[6], [x13]\n" + "ld1 { v13.h }[6], [x9]\n" + "ld1 { v17.h }[6], [x27]\n" + "ld1 { v21.h }[6], [x25]\n" + "ld1 { v25.h }[6], [x23]\n" + "ld1 { v29.h }[6], [x21]\n" + "b 270f\n" + "263:" // Height 6: Partial accumulate: partial_1_12 + "mov x19, #0x18\n" + "tbz x16, #0, 270f\n" + "ld1 { v9.h }[4], [x13]\n" + "ld1 { v13.h }[4], [x9]\n" + "ld1 { v17.h }[4], [x27]\n" + "ld1 { v21.h }[4], [x25]\n" + "ld1 { v25.h }[4], [x23]\n" + "ld1 { v29.h }[4], [x21]\n" + "b 270f\n" + "264:" // Height 6: Partial accumulate: partial_2_8 + "tbz x16, #1, 265f\n" + "ldr s9, [x13], #0x4\n" + "ldr s13, [x9], #0x4\n" + "ldr s17, [x27], #0x4\n" + "ldr s21, [x25], #0x4\n" + "ldr s25, [x23], #0x4\n" + "ldr s29, [x21], #0x4\n" + "mov x19, #0x14\n" + "tbz x16, #0, 270f\n" + "ld1 { v9.h }[2], [x13]\n" + "ld1 { v13.h }[2], [x9]\n" + "ld1 { v17.h }[2], [x27]\n" + "ld1 { v21.h }[2], [x25]\n" + "ld1 { v25.h }[2], [x23]\n" + "ld1 { v29.h }[2], [x21]\n" + "b 270f\n" + "265:" // Height 6: Partial accumulate: partial_1_8 + "mov x19, #0x10\n" + "tbz x16, #0, 270f\n" + "ldr h9, [x13, #0x0]\n" + "ldr h13, [x9, #0x0]\n" + "ldr h17, [x27, #0x0]\n" + "ldr h21, [x25, #0x0]\n" + "ldr h25, [x23, #0x0]\n" + "ldr h29, [x21, 
#0x0]\n" + "b 270f\n" + "266:" // Height 6: Partial accumulate: partial_4_0 + "tbz x16, #2, 268f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d28, [x21], #0x8\n" + "tbz x16, #1, 267f\n" + "ld1 { v8.s }[2], [x13], #0x4\n" + "ld1 { v12.s }[2], [x9], #0x4\n" + "ld1 { v16.s }[2], [x27], #0x4\n" + "ld1 { v20.s }[2], [x25], #0x4\n" + "ld1 { v24.s }[2], [x23], #0x4\n" + "ld1 { v28.s }[2], [x21], #0x4\n" + "mov x19, #0xc\n" + "tbz x16, #0, 270f\n" + "ld1 { v8.h }[6], [x13]\n" + "ld1 { v12.h }[6], [x9]\n" + "ld1 { v16.h }[6], [x27]\n" + "ld1 { v20.h }[6], [x25]\n" + "ld1 { v24.h }[6], [x23]\n" + "ld1 { v28.h }[6], [x21]\n" + "b 270f\n" + "267:" // Height 6: Partial accumulate: partial_1_4 + "mov x19, #0x8\n" + "tbz x16, #0, 270f\n" + "ld1 { v8.h }[4], [x13]\n" + "ld1 { v12.h }[4], [x9]\n" + "ld1 { v16.h }[4], [x27]\n" + "ld1 { v20.h }[4], [x25]\n" + "ld1 { v24.h }[4], [x23]\n" + "ld1 { v28.h }[4], [x21]\n" + "b 270f\n" + "268:" // Height 6: Partial accumulate: partial_2_0 + "tbz x16, #1, 269f\n" + "ldr s8, [x13], #0x4\n" + "ldr s12, [x9], #0x4\n" + "ldr s16, [x27], #0x4\n" + "ldr s20, [x25], #0x4\n" + "ldr s24, [x23], #0x4\n" + "ldr s28, [x21], #0x4\n" + "mov x19, #0x4\n" + "tbz x16, #0, 270f\n" + "ld1 { v8.h }[2], [x13]\n" + "ld1 { v12.h }[2], [x9]\n" + "ld1 { v16.h }[2], [x27]\n" + "ld1 { v20.h }[2], [x25]\n" + "ld1 { v24.h }[2], [x23]\n" + "ld1 { v28.h }[2], [x21]\n" + "b 270f\n" + "269:" // Height 6: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr h8, [x13, #0x0]\n" + "ldr h12, [x9, #0x0]\n" + "ldr h16, [x27, #0x0]\n" + "ldr h20, [x25, #0x0]\n" + "ldr h24, [x23, #0x0]\n" + "ldr h28, [x21, #0x0]\n" + "270:" // Height 6: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "sub x23, x23, x19\n" + "sub x21, x21, x19\n" + "b 273f\n" + "271:" // Height 6: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "ldr q24, [x23, #0x0]\n" + "ldr q25, [x23, #0x10]\n" + "ldr q26, [x23, #0x20]\n" + "ldr q27, [x23, #0x30]\n" + "ldr q28, [x21, #0x0]\n" + "ldr q29, [x21, #0x10]\n" + "ldr q30, [x21, #0x20]\n" + "ldr q31, [x21, #0x30]\n" + "b 273f\n" + "272:" // Height 6: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "273:" // Height 6: setup done + "mov x12, #0x0\n" + "274:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 275f\n" + "ldr x20, 
[%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x12, 276f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "add x20, x20, x19, LSL #1\n" + "b 276f\n" + "275:" // Height 6: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "add x22, x24, x19, LSL #1\n" + "add x20, x22, x19, LSL #1\n" + "276:" // Height 6: input setup done + "cmp x11, #0x8\n" + "blt 279f\n" + "cmp x11, #0x10\n" + "blt 278f\n" + "277:" // Height 6: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "add x28, x28, #0x10\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla v28.8h, v6.8h, v5.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x22, x22, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "add x20, x20, #0x10\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "sub x11, x11, #0x8\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "cmp x11, #0x10\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "fmla v29.8h, v7.8h, v5.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "fmla v26.8h, v6.8h, v4.h[0]\n" + "fmla v30.8h, v6.8h, v5.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v27.8h, v7.8h, v4.h[0]\n" + "fmla v31.8h, v7.8h, v5.h[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "fmla v20.8h, v6.8h, v3.h[1]\n" + "fmla v24.8h, v6.8h, v4.h[1]\n" + "fmla v28.8h, v6.8h, v5.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "fmla v21.8h, v7.8h, v3.h[1]\n" + "fmla v25.8h, v7.8h, v4.h[1]\n" + "fmla v29.8h, v7.8h, v5.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "fmla v22.8h, v6.8h, v3.h[1]\n" + "fmla v26.8h, v6.8h, v4.h[1]\n" + "fmla v30.8h, v6.8h, v5.h[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "fmla v23.8h, v7.8h, v3.h[1]\n" + "fmla v27.8h, v7.8h, v4.h[1]\n" + "fmla v31.8h, v7.8h, v5.h[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "fmla v20.8h, v6.8h, v3.h[2]\n" + "fmla v24.8h, v6.8h, v4.h[2]\n" + "fmla 
v28.8h, v6.8h, v5.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "fmla v21.8h, v7.8h, v3.h[2]\n" + "fmla v25.8h, v7.8h, v4.h[2]\n" + "fmla v29.8h, v7.8h, v5.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "fmla v22.8h, v6.8h, v3.h[2]\n" + "fmla v26.8h, v6.8h, v4.h[2]\n" + "fmla v30.8h, v6.8h, v5.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "fmla v23.8h, v7.8h, v3.h[2]\n" + "fmla v27.8h, v7.8h, v4.h[2]\n" + "fmla v31.8h, v7.8h, v5.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "fmla v20.8h, v6.8h, v3.h[3]\n" + "fmla v24.8h, v6.8h, v4.h[3]\n" + "fmla v28.8h, v6.8h, v5.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "fmla v21.8h, v7.8h, v3.h[3]\n" + "fmla v25.8h, v7.8h, v4.h[3]\n" + "fmla v29.8h, v7.8h, v5.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "fmla v22.8h, v6.8h, v3.h[3]\n" + "fmla v26.8h, v6.8h, v4.h[3]\n" + "fmla v30.8h, v6.8h, v5.h[3]\n" + "ldr q6, [x15, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "fmla v23.8h, v7.8h, v3.h[3]\n" + "fmla v27.8h, v7.8h, v4.h[3]\n" + "fmla v31.8h, v7.8h, v5.h[3]\n" + "ldr q7, [x15, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "fmla v20.8h, v6.8h, v3.h[4]\n" + "fmla v24.8h, v6.8h, v4.h[4]\n" + "fmla v28.8h, v6.8h, v5.h[4]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "fmla v21.8h, v7.8h, v3.h[4]\n" + "fmla v25.8h, v7.8h, v4.h[4]\n" + "fmla v29.8h, v7.8h, v5.h[4]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "fmla v22.8h, v6.8h, v3.h[4]\n" + "fmla v26.8h, v6.8h, v4.h[4]\n" + "fmla v30.8h, v6.8h, v5.h[4]\n" + "ldr q6, [x15, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "fmla v23.8h, v7.8h, v3.h[4]\n" + "fmla v27.8h, v7.8h, v4.h[4]\n" + "fmla v31.8h, v7.8h, v5.h[4]\n" + "ldr q7, [x15, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "fmla v20.8h, v6.8h, v3.h[5]\n" + "fmla v24.8h, v6.8h, v4.h[5]\n" + "fmla v28.8h, v6.8h, v5.h[5]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "fmla v21.8h, v7.8h, v3.h[5]\n" + "fmla v25.8h, v7.8h, v4.h[5]\n" + "fmla v29.8h, v7.8h, v5.h[5]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "fmla v22.8h, v6.8h, v3.h[5]\n" + "fmla v26.8h, v6.8h, v4.h[5]\n" + "fmla v30.8h, v6.8h, v5.h[5]\n" + "ldr q6, [x15, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "fmla v23.8h, v7.8h, v3.h[5]\n" + "fmla v27.8h, v7.8h, v4.h[5]\n" + "fmla v31.8h, v7.8h, v5.h[5]\n" + "ldr q7, [x15, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" 
+ "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "fmla v20.8h, v6.8h, v3.h[6]\n" + "fmla v24.8h, v6.8h, v4.h[6]\n" + "fmla v28.8h, v6.8h, v5.h[6]\n" + "ldr q6, [x15, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "fmla v21.8h, v7.8h, v3.h[6]\n" + "fmla v25.8h, v7.8h, v4.h[6]\n" + "fmla v29.8h, v7.8h, v5.h[6]\n" + "ldr q7, [x15, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "fmla v22.8h, v6.8h, v3.h[6]\n" + "fmla v26.8h, v6.8h, v4.h[6]\n" + "fmla v30.8h, v6.8h, v5.h[6]\n" + "ldr q6, [x15, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "fmla v23.8h, v7.8h, v3.h[6]\n" + "fmla v27.8h, v7.8h, v4.h[6]\n" + "fmla v31.8h, v7.8h, v5.h[6]\n" + "ldr q7, [x15, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "fmla v20.8h, v6.8h, v3.h[7]\n" + "fmla v24.8h, v6.8h, v4.h[7]\n" + "fmla v28.8h, v6.8h, v5.h[7]\n" + "ldr q6, [x15, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "fmla v21.8h, v7.8h, v3.h[7]\n" + "fmla v25.8h, v7.8h, v4.h[7]\n" + "fmla v29.8h, v7.8h, v5.h[7]\n" + "ldr q7, [x15, #0x1f0]\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "add x15, x15, #0x200\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v26.8h, v6.8h, v4.h[7]\n" + "fmla v30.8h, v6.8h, v5.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "fmla v23.8h, v7.8h, v3.h[7]\n" + "fmla v27.8h, v7.8h, v4.h[7]\n" + "fmla v31.8h, v7.8h, v5.h[7]\n" + "bge 277b\n" + "278:" // Height 6: Multiply loop: Single iteration only + "sub x11, x11, #0x8\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "add x28, x28, #0x10\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla v28.8h, v6.8h, v5.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x22, x22, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "add x20, x20, #0x10\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "fmla v29.8h, v7.8h, v5.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "fmla v26.8h, v6.8h, v4.h[0]\n" + "fmla v30.8h, v6.8h, v5.h[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v27.8h, v7.8h, v4.h[0]\n" + "fmla v31.8h, v7.8h, v5.h[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "fmla v20.8h, v6.8h, v3.h[1]\n" + "fmla v24.8h, v6.8h, v4.h[1]\n" 
+ "fmla v28.8h, v6.8h, v5.h[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "fmla v21.8h, v7.8h, v3.h[1]\n" + "fmla v25.8h, v7.8h, v4.h[1]\n" + "fmla v29.8h, v7.8h, v5.h[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "fmla v22.8h, v6.8h, v3.h[1]\n" + "fmla v26.8h, v6.8h, v4.h[1]\n" + "fmla v30.8h, v6.8h, v5.h[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "fmla v23.8h, v7.8h, v3.h[1]\n" + "fmla v27.8h, v7.8h, v4.h[1]\n" + "fmla v31.8h, v7.8h, v5.h[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "fmla v20.8h, v6.8h, v3.h[2]\n" + "fmla v24.8h, v6.8h, v4.h[2]\n" + "fmla v28.8h, v6.8h, v5.h[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "fmla v21.8h, v7.8h, v3.h[2]\n" + "fmla v25.8h, v7.8h, v4.h[2]\n" + "fmla v29.8h, v7.8h, v5.h[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "fmla v22.8h, v6.8h, v3.h[2]\n" + "fmla v26.8h, v6.8h, v4.h[2]\n" + "fmla v30.8h, v6.8h, v5.h[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "fmla v23.8h, v7.8h, v3.h[2]\n" + "fmla v27.8h, v7.8h, v4.h[2]\n" + "fmla v31.8h, v7.8h, v5.h[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "fmla v20.8h, v6.8h, v3.h[3]\n" + "fmla v24.8h, v6.8h, v4.h[3]\n" + "fmla v28.8h, v6.8h, v5.h[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "fmla v21.8h, v7.8h, v3.h[3]\n" + "fmla v25.8h, v7.8h, v4.h[3]\n" + "fmla v29.8h, v7.8h, v5.h[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "fmla v22.8h, v6.8h, v3.h[3]\n" + "fmla v26.8h, v6.8h, v4.h[3]\n" + "fmla v30.8h, v6.8h, v5.h[3]\n" + "ldr q6, [x15, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "fmla v23.8h, v7.8h, v3.h[3]\n" + "fmla v27.8h, v7.8h, v4.h[3]\n" + "fmla v31.8h, v7.8h, v5.h[3]\n" + "ldr q7, [x15, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "fmla v20.8h, v6.8h, v3.h[4]\n" + "fmla v24.8h, v6.8h, v4.h[4]\n" + "fmla v28.8h, v6.8h, v5.h[4]\n" + "ldr q6, [x15, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "fmla v21.8h, v7.8h, v3.h[4]\n" + "fmla v25.8h, v7.8h, v4.h[4]\n" + "fmla v29.8h, v7.8h, v5.h[4]\n" + "ldr q7, [x15, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "fmla v22.8h, v6.8h, v3.h[4]\n" + "fmla v26.8h, v6.8h, v4.h[4]\n" + "fmla v30.8h, v6.8h, v5.h[4]\n" + "ldr q6, [x15, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "fmla v23.8h, v7.8h, v3.h[4]\n" + "fmla v27.8h, v7.8h, v4.h[4]\n" + "fmla v31.8h, v7.8h, v5.h[4]\n" + "ldr q7, [x15, #0x150]\n" + "fmla v8.8h, v6.8h, 
v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "fmla v20.8h, v6.8h, v3.h[5]\n" + "fmla v24.8h, v6.8h, v4.h[5]\n" + "fmla v28.8h, v6.8h, v5.h[5]\n" + "ldr q6, [x15, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "fmla v21.8h, v7.8h, v3.h[5]\n" + "fmla v25.8h, v7.8h, v4.h[5]\n" + "fmla v29.8h, v7.8h, v5.h[5]\n" + "ldr q7, [x15, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "fmla v22.8h, v6.8h, v3.h[5]\n" + "fmla v26.8h, v6.8h, v4.h[5]\n" + "fmla v30.8h, v6.8h, v5.h[5]\n" + "ldr q6, [x15, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "fmla v23.8h, v7.8h, v3.h[5]\n" + "fmla v27.8h, v7.8h, v4.h[5]\n" + "fmla v31.8h, v7.8h, v5.h[5]\n" + "ldr q7, [x15, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "fmla v20.8h, v6.8h, v3.h[6]\n" + "fmla v24.8h, v6.8h, v4.h[6]\n" + "fmla v28.8h, v6.8h, v5.h[6]\n" + "ldr q6, [x15, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "fmla v21.8h, v7.8h, v3.h[6]\n" + "fmla v25.8h, v7.8h, v4.h[6]\n" + "fmla v29.8h, v7.8h, v5.h[6]\n" + "ldr q7, [x15, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "fmla v22.8h, v6.8h, v3.h[6]\n" + "fmla v26.8h, v6.8h, v4.h[6]\n" + "fmla v30.8h, v6.8h, v5.h[6]\n" + "ldr q6, [x15, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "fmla v23.8h, v7.8h, v3.h[6]\n" + "fmla v27.8h, v7.8h, v4.h[6]\n" + "fmla v31.8h, v7.8h, v5.h[6]\n" + "ldr q7, [x15, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "fmla v20.8h, v6.8h, v3.h[7]\n" + "fmla v24.8h, v6.8h, v4.h[7]\n" + "fmla v28.8h, v6.8h, v5.h[7]\n" + "ldr q6, [x15, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "fmla v21.8h, v7.8h, v3.h[7]\n" + "fmla v25.8h, v7.8h, v4.h[7]\n" + "fmla v29.8h, v7.8h, v5.h[7]\n" + "ldr q7, [x15, #0x1f0]\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "add x15, x15, #0x200\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v26.8h, v6.8h, v4.h[7]\n" + "fmla v30.8h, v6.8h, v5.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "fmla v23.8h, v7.8h, v3.h[7]\n" + "fmla v27.8h, v7.8h, v4.h[7]\n" + "fmla v31.8h, v7.8h, v5.h[7]\n" + "279:" // Height 6: Multiply loop: Main loop skip + "cbz x11, 281f\n" + "280:" // Height 6: Multiply loop: Odd block loop + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr h5, [x20], #0x2\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "sub x11, x11, #0x1\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "fmla v28.8h, v6.8h, v5.h[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "fmla v29.8h, v7.8h, v5.h[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla 
v10.8h, v6.8h, v0.h[0]\n" + "add x15, x15, #0x40\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "fmla v26.8h, v6.8h, v4.h[0]\n" + "fmla v30.8h, v6.8h, v5.h[0]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v27.8h, v7.8h, v4.h[0]\n" + "fmla v31.8h, v7.8h, v5.h[0]\n" + "cbnz x11, 280b\n" + "281:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 274b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 282f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.8h }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.8h }, [x19]\n" + "fmin v8.8h, v8.8h, v0.8h\n" + "fmin v9.8h, v9.8h, v0.8h\n" + "fmin v10.8h, v10.8h, v0.8h\n" + "fmin v11.8h, v11.8h, v0.8h\n" + "fmax v8.8h, v8.8h, v1.8h\n" + "fmax v9.8h, v9.8h, v1.8h\n" + "fmax v10.8h, v10.8h, v1.8h\n" + "fmax v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v0.8h\n" + "fmin v13.8h, v13.8h, v0.8h\n" + "fmin v14.8h, v14.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v1.8h\n" + "fmax v13.8h, v13.8h, v1.8h\n" + "fmax v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v0.8h\n" + "fmin v16.8h, v16.8h, v0.8h\n" + "fmin v17.8h, v17.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v1.8h\n" + "fmax v16.8h, v16.8h, v1.8h\n" + "fmax v17.8h, v17.8h, v1.8h\n" + "fmin v18.8h, v18.8h, v0.8h\n" + "fmin v19.8h, v19.8h, v0.8h\n" + "fmin v20.8h, v20.8h, v0.8h\n" + "fmax v18.8h, v18.8h, v1.8h\n" + "fmax v19.8h, v19.8h, v1.8h\n" + "fmax v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v0.8h\n" + "fmin v22.8h, v22.8h, v0.8h\n" + "fmin v23.8h, v23.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v1.8h\n" + "fmax v22.8h, v22.8h, v1.8h\n" + "fmax v23.8h, v23.8h, v1.8h\n" + "fmin v24.8h, v24.8h, v0.8h\n" + "fmin v25.8h, v25.8h, v0.8h\n" + "fmin v26.8h, v26.8h, v0.8h\n" + "fmax v24.8h, v24.8h, v1.8h\n" + "fmax v25.8h, v25.8h, v1.8h\n" + "fmax v26.8h, v26.8h, v1.8h\n" + "fmin v27.8h, v27.8h, v0.8h\n" + "fmin v28.8h, v28.8h, v0.8h\n" + "fmin v29.8h, v29.8h, v0.8h\n" + "fmax v27.8h, v27.8h, v1.8h\n" + "fmax v28.8h, v28.8h, v1.8h\n" + "fmax v29.8h, v29.8h, v1.8h\n" + "fmin v30.8h, v30.8h, v0.8h\n" + "fmin v31.8h, v31.8h, v0.8h\n" + "fmax v30.8h, v30.8h, v1.8h\n" + "fmax v31.8h, v31.8h, v1.8h\n" + "282:" // Height 6: No activation + "cmp x16, #0x20\n" + "bge 299f\n" + "tbz x16, #4, 290f\n" + "st1 { v8.8h }, [x13], #0x10\n" + "st1 { v9.8h }, [x13], #0x10\n" + "st1 { v12.8h }, [x9], #0x10\n" + "st1 { v13.8h }, [x9], #0x10\n" + "st1 { v16.8h }, [x27], #0x10\n" + "st1 { v17.8h }, [x27], #0x10\n" + "st1 { v20.8h }, [x25], #0x10\n" + "st1 { v21.8h }, [x25], #0x10\n" + "st1 { v24.8h }, [x23], #0x10\n" + "st1 { v25.8h }, [x23], #0x10\n" + "st1 { v28.8h }, [x21], #0x10\n" + "st1 { v29.8h }, [x21], #0x10\n" + "tbz x16, #3, 286f\n" + "st1 { v10.8h }, [x13], #0x10\n" + "st1 { v14.8h }, [x9], #0x10\n" + "st1 { v18.8h }, [x27], #0x10\n" + "st1 { v22.8h }, [x25], #0x10\n" + "st1 { v26.8h }, [x23], #0x10\n" + "st1 { v30.8h }, [x21], #0x10\n" + "tbz x16, #2, 284f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "str d27, [x23], #0x8\n" + "str d31, [x21], #0x8\n" + "tbz x16, #1, 283f\n" + "st1 { v11.s }[2], [x13], 
#0x4\n" + "st1 { v15.s }[2], [x9], #0x4\n" + "st1 { v19.s }[2], [x27], #0x4\n" + "st1 { v23.s }[2], [x25], #0x4\n" + "st1 { v27.s }[2], [x23], #0x4\n" + "st1 { v31.s }[2], [x21], #0x4\n" + "tbz x16, #0, 298f\n" + "st1 { v11.h }[6], [x13]\n" + "st1 { v15.h }[6], [x9]\n" + "st1 { v19.h }[6], [x27]\n" + "st1 { v23.h }[6], [x25]\n" + "st1 { v27.h }[6], [x23]\n" + "st1 { v31.h }[6], [x21]\n" + "b 298f\n" + "283:" // Height 6: Partial direct writeback: partial_1_28 + "tbz x16, #0, 298f\n" + "st1 { v11.h }[4], [x13]\n" + "st1 { v15.h }[4], [x9]\n" + "st1 { v19.h }[4], [x27]\n" + "st1 { v23.h }[4], [x25]\n" + "st1 { v27.h }[4], [x23]\n" + "st1 { v31.h }[4], [x21]\n" + "b 298f\n" + "284:" // Height 6: Partial direct writeback: partial_2_24 + "tbz x16, #1, 285f\n" + "str s11, [x13], #0x4\n" + "str s15, [x9], #0x4\n" + "str s19, [x27], #0x4\n" + "str s23, [x25], #0x4\n" + "str s27, [x23], #0x4\n" + "str s31, [x21], #0x4\n" + "tbz x16, #0, 298f\n" + "st1 { v11.h }[2], [x13]\n" + "st1 { v15.h }[2], [x9]\n" + "st1 { v19.h }[2], [x27]\n" + "st1 { v23.h }[2], [x25]\n" + "st1 { v27.h }[2], [x23]\n" + "st1 { v31.h }[2], [x21]\n" + "b 298f\n" + "285:" // Height 6: Partial direct writeback: partial_1_24 + "tbz x16, #0, 298f\n" + "str h11, [x13, #0x0]\n" + "str h15, [x9, #0x0]\n" + "str h19, [x27, #0x0]\n" + "str h23, [x25, #0x0]\n" + "str h27, [x23, #0x0]\n" + "str h31, [x21, #0x0]\n" + "b 298f\n" + "286:" // Height 6: Partial direct writeback: partial_4_16 + "tbz x16, #2, 288f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "str d26, [x23], #0x8\n" + "str d30, [x21], #0x8\n" + "tbz x16, #1, 287f\n" + "st1 { v10.s }[2], [x13], #0x4\n" + "st1 { v14.s }[2], [x9], #0x4\n" + "st1 { v18.s }[2], [x27], #0x4\n" + "st1 { v22.s }[2], [x25], #0x4\n" + "st1 { v26.s }[2], [x23], #0x4\n" + "st1 { v30.s }[2], [x21], #0x4\n" + "tbz x16, #0, 298f\n" + "st1 { v10.h }[6], [x13]\n" + "st1 { v14.h }[6], [x9]\n" + "st1 { v18.h }[6], [x27]\n" + "st1 { v22.h }[6], [x25]\n" + "st1 { v26.h }[6], [x23]\n" + "st1 { v30.h }[6], [x21]\n" + "b 298f\n" + "287:" // Height 6: Partial direct writeback: partial_1_20 + "tbz x16, #0, 298f\n" + "st1 { v10.h }[4], [x13]\n" + "st1 { v14.h }[4], [x9]\n" + "st1 { v18.h }[4], [x27]\n" + "st1 { v22.h }[4], [x25]\n" + "st1 { v26.h }[4], [x23]\n" + "st1 { v30.h }[4], [x21]\n" + "b 298f\n" + "288:" // Height 6: Partial direct writeback: partial_2_16 + "tbz x16, #1, 289f\n" + "str s10, [x13], #0x4\n" + "str s14, [x9], #0x4\n" + "str s18, [x27], #0x4\n" + "str s22, [x25], #0x4\n" + "str s26, [x23], #0x4\n" + "str s30, [x21], #0x4\n" + "tbz x16, #0, 298f\n" + "st1 { v10.h }[2], [x13]\n" + "st1 { v14.h }[2], [x9]\n" + "st1 { v18.h }[2], [x27]\n" + "st1 { v22.h }[2], [x25]\n" + "st1 { v26.h }[2], [x23]\n" + "st1 { v30.h }[2], [x21]\n" + "b 298f\n" + "289:" // Height 6: Partial direct writeback: partial_1_16 + "tbz x16, #0, 298f\n" + "str h10, [x13, #0x0]\n" + "str h14, [x9, #0x0]\n" + "str h18, [x27, #0x0]\n" + "str h22, [x25, #0x0]\n" + "str h26, [x23, #0x0]\n" + "str h30, [x21, #0x0]\n" + "b 298f\n" + "290:" // Height 6: Partial direct writeback: partial_8_0 + "tbz x16, #3, 294f\n" + "st1 { v8.8h }, [x13], #0x10\n" + "st1 { v12.8h }, [x9], #0x10\n" + "st1 { v16.8h }, [x27], #0x10\n" + "st1 { v20.8h }, [x25], #0x10\n" + "st1 { v24.8h }, [x23], #0x10\n" + "st1 { v28.8h }, [x21], #0x10\n" + "tbz x16, #2, 292f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "str d25, [x23], #0x8\n" + 
"str d29, [x21], #0x8\n" + "tbz x16, #1, 291f\n" + "st1 { v9.s }[2], [x13], #0x4\n" + "st1 { v13.s }[2], [x9], #0x4\n" + "st1 { v17.s }[2], [x27], #0x4\n" + "st1 { v21.s }[2], [x25], #0x4\n" + "st1 { v25.s }[2], [x23], #0x4\n" + "st1 { v29.s }[2], [x21], #0x4\n" + "tbz x16, #0, 298f\n" + "st1 { v9.h }[6], [x13]\n" + "st1 { v13.h }[6], [x9]\n" + "st1 { v17.h }[6], [x27]\n" + "st1 { v21.h }[6], [x25]\n" + "st1 { v25.h }[6], [x23]\n" + "st1 { v29.h }[6], [x21]\n" + "b 298f\n" + "291:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x16, #0, 298f\n" + "st1 { v9.h }[4], [x13]\n" + "st1 { v13.h }[4], [x9]\n" + "st1 { v17.h }[4], [x27]\n" + "st1 { v21.h }[4], [x25]\n" + "st1 { v25.h }[4], [x23]\n" + "st1 { v29.h }[4], [x21]\n" + "b 298f\n" + "292:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x16, #1, 293f\n" + "str s9, [x13], #0x4\n" + "str s13, [x9], #0x4\n" + "str s17, [x27], #0x4\n" + "str s21, [x25], #0x4\n" + "str s25, [x23], #0x4\n" + "str s29, [x21], #0x4\n" + "tbz x16, #0, 298f\n" + "st1 { v9.h }[2], [x13]\n" + "st1 { v13.h }[2], [x9]\n" + "st1 { v17.h }[2], [x27]\n" + "st1 { v21.h }[2], [x25]\n" + "st1 { v25.h }[2], [x23]\n" + "st1 { v29.h }[2], [x21]\n" + "b 298f\n" + "293:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x16, #0, 298f\n" + "str h9, [x13, #0x0]\n" + "str h13, [x9, #0x0]\n" + "str h17, [x27, #0x0]\n" + "str h21, [x25, #0x0]\n" + "str h25, [x23, #0x0]\n" + "str h29, [x21, #0x0]\n" + "b 298f\n" + "294:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x16, #2, 296f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "str d28, [x21], #0x8\n" + "tbz x16, #1, 295f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], [x9], #0x4\n" + "st1 { v16.s }[2], [x27], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "st1 { v24.s }[2], [x23], #0x4\n" + "st1 { v28.s }[2], [x21], #0x4\n" + "tbz x16, #0, 298f\n" + "st1 { v8.h }[6], [x13]\n" + "st1 { v12.h }[6], [x9]\n" + "st1 { v16.h }[6], [x27]\n" + "st1 { v20.h }[6], [x25]\n" + "st1 { v24.h }[6], [x23]\n" + "st1 { v28.h }[6], [x21]\n" + "b 298f\n" + "295:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x16, #0, 298f\n" + "st1 { v8.h }[4], [x13]\n" + "st1 { v12.h }[4], [x9]\n" + "st1 { v16.h }[4], [x27]\n" + "st1 { v20.h }[4], [x25]\n" + "st1 { v24.h }[4], [x23]\n" + "st1 { v28.h }[4], [x21]\n" + "b 298f\n" + "296:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x16, #1, 297f\n" + "str s8, [x13], #0x4\n" + "str s12, [x9], #0x4\n" + "str s16, [x27], #0x4\n" + "str s20, [x25], #0x4\n" + "str s24, [x23], #0x4\n" + "str s28, [x21], #0x4\n" + "tbz x16, #0, 298f\n" + "st1 { v8.h }[2], [x13]\n" + "st1 { v12.h }[2], [x9]\n" + "st1 { v16.h }[2], [x27]\n" + "st1 { v20.h }[2], [x25]\n" + "st1 { v24.h }[2], [x23]\n" + "st1 { v28.h }[2], [x21]\n" + "b 298f\n" + "297:" // Height 6: Partial direct writeback: partial_1_0 + "str h8, [x13, #0x0]\n" + "str h12, [x9, #0x0]\n" + "str h16, [x27, #0x0]\n" + "str h20, [x25, #0x0]\n" + "str h24, [x23, #0x0]\n" + "str h28, [x21, #0x0]\n" + "298:" // Height 6: Partial direct writeback: Done + "b 300f\n" + "299:" // Height 6: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + 
"str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "str q24, [x23, #0x0]\n" + "str q25, [x23, #0x10]\n" + "str q26, [x23, #0x20]\n" + "str q27, [x23, #0x30]\n" + "str q28, [x21, #0x0]\n" + "str q29, [x21, #0x10]\n" + "str q30, [x21, #0x20]\n" + "str q31, [x21, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "add x23, x23, #0x40\n" + "add x21, x21, #0x40\n" + "300:" // Height 6: Writeback done + "subs x16, x16, #0x20\n" + "bgt 253b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 302f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 301f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "301:" // Update direct input + "mov x19, #0xc\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "302:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp deleted file mode 100644 index 94fcd1064e..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp +++ /dev/null @@ -1,2427 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __aarch64__ - -#include <algorithm> - -#include "arm_gemm.hpp" - -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_hybrid_fp32_mla_16x4_a55(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) { - const int K_stride = K; - const long loops_count = ((K + 4) / 8) - 1; - K -= loops_count * 8; - const long regs_count = (K / 4) - 1; - K -= (regs_count + 1) * 4; - const long blocks_count = K / 1; - float nullbias[16]; - if (!accumulate && !bias) { - memset(nullbias, 0, (16 * sizeof(float))); - } - float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); - float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); - const float * const minptr = &minval; - const float * const maxptr = &maxval; - - switch(act.type) - { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - maxval = static_cast<float>(act.param1); - /* fall through */ - case Activation::Type::ReLU: - minval = 0.0f; - break; - } - - int rows_to_compute; - - for (int y=0; y<M; y+=rows_to_compute) { - const float * const a_ptr0_base = A + (y * lda); - const unsigned long ldab = lda * sizeof(float); - - float *c_ptr0 = C + (y * ldc); - - rows_to_compute = M-y; - if (rows_to_compute > 4) { - if (rows_to_compute % 4) { - rows_to_compute = 4 - 1; - } else { - rows_to_compute = 4; - } - } - - for (int x0=0; x0<N; x0+=16) { [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "temploadreg0 .req X2\n" - "temploadreg1 .req X3\n" - "temploadreg2 .req X4\n" - "temploadreg3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "ldr q16, [%[biasptr]]\n" - "ldr q17, [%[biasptr], #0x10]\n" - "ldr q18, [%[biasptr], #0x20]\n" - "ldr q19, [%[biasptr], #0x30]\n" - "mov v20.16b, v16.16b\n" - "ldr q0, [%[a_ptr0]]\n" - "mov v21.16b, v17.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v22.16b, v18.16b\n" - "ldr q8, [%[b_ptr0]]\n" - "mov v23.16b, v19.16b\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "fmla v21.4s, 
v9.4s, v1.s[0]\n" - "ldr d4, [%[a_ptr0]]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ldr d5, [a_ptr1]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "ins v4.d[1], temploadreg0\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "ins v5.d[1], temploadreg1\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "ins v15.d[1], temploadreg3\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "ins v8.d[1], temploadreg0\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "ins v14.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "ldr d0, [%[a_ptr0], #-0x10]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "ldr d1, [a_ptr1, #-0x10]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v16.4s, v12.4s, 
v4.s[1]\n" - "ldr temploadreg1, [a_ptr1, #-0x8]\n" - "fmla v20.4s, v12.4s, v5.s[1]\n" - "ldr d8, [%[b_ptr0]]\n" - "ins v0.d[1], temploadreg0\n" - "fmla v17.4s, v13.4s, v4.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v21.4s, v13.4s, v5.s[1]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "ins v1.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "ins v14.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v18.4s, v14.4s, v4.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v22.4s, v14.4s, v5.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v19.4s, v15.4s, v4.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "fmla v23.4s, v15.4s, v5.s[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - "fmla v16.4s, v12.4s, v4.s[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - "fmla v20.4s, v12.4s, v5.s[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - "fmla v17.4s, v13.4s, v4.s[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - "fmla v21.4s, v13.4s, v5.s[3]\n" - "ins v14.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v18.4s, v14.4s, v4.s[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - "fmla v22.4s, v14.4s, v5.s[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v19.4s, v15.4s, v4.s[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - "fmla v23.4s, v15.4s, v5.s[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "ins v11.d[1], temploadreg3\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "b.ne 3b\n" - "2:\n" - "ins v14.d[1], temploadreg2\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "ins v15.d[1], temploadreg3\n" - "cbz %[regs], 4f\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr d4, [%[a_ptr0]]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr d5, [a_ptr1]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ins v4.d[1], temploadreg0\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "ins v5.d[1], temploadreg1\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "ldr temploadreg1, 
[%[b_ptr0], #0x18]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "ins v8.d[1], temploadreg0\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "ins v14.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v16.4s, v12.4s, v4.s[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v20.4s, v12.4s, v5.s[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v17.4s, v13.4s, v4.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v21.4s, v13.4s, v5.s[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v18.4s, v14.4s, v4.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v22.4s, v14.4s, v5.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v19.4s, v15.4s, v4.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "fmla v23.4s, v15.4s, v5.s[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "ins v9.d[1], temploadreg1\n" - "fmla 
v16.4s, v8.4s, v4.s[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v16.4s, v12.4s, v4.s[3]\n" - "fmla v20.4s, v12.4s, v5.s[3]\n" - "fmla v17.4s, v13.4s, v4.s[3]\n" - "fmla v21.4s, v13.4s, v5.s[3]\n" - "fmla v18.4s, v14.4s, v4.s[3]\n" - "fmla v22.4s, v14.4s, v5.s[3]\n" - "fmla v19.4s, v15.4s, v4.s[3]\n" - "fmla v23.4s, v15.4s, v5.s[3]\n" - "b 5f\n" - "4:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr s1, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "b.ne 7b\n" - "6:\n" - "ld1r {v14.4s}, [%[minptr]]\n" - "ld1r {v15.4s}, [%[maxptr]]\n" - "fmax v16.4s, v16.4s, v14.4s\n" - "fmax v17.4s, v17.4s, v14.4s\n" - "fmax v18.4s, v18.4s, v14.4s\n" - "fmax v19.4s, v19.4s, v14.4s\n" - "fmin v16.4s, v16.4s, v15.4s\n" - "fmin v17.4s, v17.4s, v15.4s\n" - "fmin v18.4s, v18.4s, v15.4s\n" - "fmin v19.4s, v19.4s, v15.4s\n" - "str q16, [%[c_ptr0]]\n" - "fmax v20.4s, v20.4s, v14.4s\n" - "fmax v21.4s, v21.4s, v14.4s\n" - "fmax v22.4s, v22.4s, v14.4s\n" - "str q17, [%[c_ptr0], #0x10]\n" - "fmax v23.4s, v23.4s, v14.4s\n" - "fmin v20.4s, v20.4s, v15.4s\n" - "fmin v21.4s, v21.4s, v15.4s\n" - "str q18, [%[c_ptr0], #0x20]\n" - "fmin v22.4s, v22.4s, v15.4s\n" - "fmin v23.4s, v23.4s, v15.4s\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - ".unreq temploadreg0\n" - ".unreq temploadreg1\n" - ".unreq temploadreg2\n" - ".unreq temploadreg3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "temploadreg0 .req X4\n" - "temploadreg1 .req X5\n" - "temploadreg2 .req X6\n" - "temploadreg3 .req X7\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "ldr q16, [%[biasptr]]\n" - "ldr q17, [%[biasptr], #0x10]\n" - "ldr q18, [%[biasptr], #0x20]\n" - "ldr q19, [%[biasptr], #0x30]\n" - "mov v20.16b, v16.16b\n" - "ldr q0, [%[a_ptr0]]\n" - "mov v21.16b, v17.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v22.16b, v18.16b\n" - "ldr q2, [a_ptr2]\n" - "mov v23.16b, v19.16b\n" - "ldr q8, [%[b_ptr0]]\n" - "mov v24.16b, v16.16b\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "mov v25.16b, v17.16b\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "mov v26.16b, v18.16b\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "mov v27.16b, v19.16b\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, 
a_ptr2, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr d4, [%[a_ptr0]]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "ldr d5, [a_ptr1]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ldr d6, [a_ptr2]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "ldr temploadreg2, [a_ptr2, #0x8]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ins v4.d[1], temploadreg0\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "ins v5.d[1], temploadreg1\n" - "fmla v24.4s, v12.4s, v2.s[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "ins v6.d[1], temploadreg2\n" - "fmla v25.4s, v13.4s, v2.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v26.4s, v14.4s, v2.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "fmla v27.4s, v15.4s, v2.s[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "ins v9.d[1], temploadreg1\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "fmla v22.4s, 
v14.4s, v1.s[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - "fmla v26.4s, v14.4s, v2.s[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - "fmla v27.4s, v15.4s, v2.s[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "fmla v24.4s, v8.4s, v6.s[0]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "fmla v25.4s, v9.4s, v6.s[0]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "ldr d0, [%[a_ptr0], #-0x10]\n" - "fmla v26.4s, v10.4s, v6.s[0]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v27.4s, v11.4s, v6.s[0]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v16.4s, v12.4s, v4.s[1]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v20.4s, v12.4s, v5.s[1]\n" - "ins v0.d[1], temploadreg0\n" - "fmla v24.4s, v12.4s, v6.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v17.4s, v13.4s, v4.s[1]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v21.4s, v13.4s, v5.s[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "fmla v25.4s, v13.4s, v6.s[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "fmla v18.4s, v14.4s, v4.s[1]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v22.4s, v14.4s, v5.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v26.4s, v14.4s, v6.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "ins v8.d[1], temploadreg0\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v19.4s, v15.4s, v4.s[1]\n" - "ldr d1, [a_ptr1, #-0x10]\n" - "fmla v23.4s, v15.4s, v5.s[1]\n" - "ldr temploadreg1, [a_ptr1, #-0x8]\n" - "fmla v27.4s, v15.4s, v6.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "ins v1.d[1], temploadreg1\n" - "fmla v24.4s, v8.4s, v6.s[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "ldr d2, [a_ptr2, #-0x10]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - "ldr temploadreg2, [a_ptr2, #-0x8]\n" - "ins v9.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "ins v11.d[1], temploadreg3\n" - "ins v2.d[1], temploadreg2\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v25.4s, v9.4s, v6.s[2]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v27.4s, v11.4s, v6.s[2]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v16.4s, v12.4s, v4.s[3]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v26.4s, v10.4s, v6.s[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - "fmla v20.4s, v12.4s, v5.s[3]\n" - "ldr temploadreg0, 
[%[b_ptr0], #-0x78]\n" - "fmla v24.4s, v12.4s, v6.s[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - "fmla v17.4s, v13.4s, v4.s[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - "fmla v21.4s, v13.4s, v5.s[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - "fmla v25.4s, v13.4s, v6.s[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "fmla v18.4s, v14.4s, v4.s[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "fmla v22.4s, v14.4s, v5.s[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - "fmla v26.4s, v14.4s, v6.s[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "fmla v19.4s, v15.4s, v4.s[3]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v23.4s, v15.4s, v5.s[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - "fmla v27.4s, v15.4s, v6.s[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "ins v11.d[1], temploadreg3\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "b.ne 3b\n" - "2:\n" - "ins v14.d[1], temploadreg2\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "ins v15.d[1], temploadreg3\n" - "cbz %[regs], 4f\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr d4, [%[a_ptr0]]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr d5, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr d6, [a_ptr2]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "ldr temploadreg2, [a_ptr2, #0x8]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ins v4.d[1], temploadreg0\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ins v5.d[1], temploadreg1\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "ins v6.d[1], temploadreg2\n" - "fmla v24.4s, v12.4s, v2.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v25.4s, v13.4s, v2.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "fmla v26.4s, v14.4s, v2.s[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "fmla v27.4s, v15.4s, v2.s[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "ins v14.d[1], 
temploadreg2\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v26.4s, v14.4s, v2.s[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v27.4s, v15.4s, v2.s[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v24.4s, v8.4s, v6.s[0]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v25.4s, v9.4s, v6.s[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v26.4s, v10.4s, v6.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v27.4s, v11.4s, v6.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v16.4s, v12.4s, v4.s[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "fmla v20.4s, v12.4s, v5.s[1]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v24.4s, v12.4s, v6.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v17.4s, v13.4s, v4.s[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "fmla v21.4s, v13.4s, v5.s[1]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v25.4s, v13.4s, v6.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v18.4s, v14.4s, v4.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "fmla v22.4s, v14.4s, v5.s[1]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v26.4s, v14.4s, v6.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "fmla v19.4s, v15.4s, v4.s[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "fmla v23.4s, v15.4s, v5.s[1]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v27.4s, v15.4s, v6.s[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v24.4s, v8.4s, v6.s[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v25.4s, v9.4s, v6.s[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v26.4s, v10.4s, v6.s[2]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - 
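/*
 * Note on the load pattern above (explanatory comment, not part of the
 * deleted source): this is the Cortex-A55 variant of the kernel. The A55
 * load pipe is 64 bits wide, so a 128-bit "ldr q" would block dual-issue.
 * Each 128-bit B vector is therefore fetched as two 64-bit halves -- an
 * SIMD "ldr d" for the low half and a general-purpose "ldr x"
 * (temploadreg0..3) for the high half -- and stitched back together with
 * "ins vN.d[1], tempreg", all interleaved between fmla operations so the
 * loads hide behind the arithmetic.
 */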
"ins v15.d[1], temploadreg3\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v27.4s, v11.4s, v6.s[2]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v16.4s, v12.4s, v4.s[3]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v20.4s, v12.4s, v5.s[3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "fmla v24.4s, v12.4s, v6.s[3]\n" - "fmla v17.4s, v13.4s, v4.s[3]\n" - "fmla v21.4s, v13.4s, v5.s[3]\n" - "fmla v25.4s, v13.4s, v6.s[3]\n" - "fmla v18.4s, v14.4s, v4.s[3]\n" - "fmla v22.4s, v14.4s, v5.s[3]\n" - "fmla v26.4s, v14.4s, v6.s[3]\n" - "fmla v19.4s, v15.4s, v4.s[3]\n" - "fmla v23.4s, v15.4s, v5.s[3]\n" - "fmla v27.4s, v15.4s, v6.s[3]\n" - "b 5f\n" - "4:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v24.4s, v12.4s, v2.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v25.4s, v13.4s, v2.s[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "fmla v26.4s, v14.4s, v2.s[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "fmla v27.4s, v15.4s, v2.s[1]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "fmla v26.4s, v14.4s, v2.s[3]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "fmla v27.4s, v15.4s, v2.s[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], 
#0x40\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr s1, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr s2, [a_ptr2]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "b.ne 7b\n" - "6:\n" - "ld1r {v14.4s}, [%[minptr]]\n" - "ld1r {v15.4s}, [%[maxptr]]\n" - "fmax v16.4s, v16.4s, v14.4s\n" - "fmax v17.4s, v17.4s, v14.4s\n" - "fmax v18.4s, v18.4s, v14.4s\n" - "fmax v19.4s, v19.4s, v14.4s\n" - "fmin v16.4s, v16.4s, v15.4s\n" - "fmin v17.4s, v17.4s, v15.4s\n" - "fmin v18.4s, v18.4s, v15.4s\n" - "fmin v19.4s, v19.4s, v15.4s\n" - "str q16, [%[c_ptr0]]\n" - "fmax v20.4s, v20.4s, v14.4s\n" - "fmax v21.4s, v21.4s, v14.4s\n" - "fmax v22.4s, v22.4s, v14.4s\n" - "str q17, [%[c_ptr0], #0x10]\n" - "fmax v23.4s, v23.4s, v14.4s\n" - "fmin v20.4s, v20.4s, v15.4s\n" - "fmin v21.4s, v21.4s, v15.4s\n" - "str q18, [%[c_ptr0], #0x20]\n" - "fmin v22.4s, v22.4s, v15.4s\n" - "fmin v23.4s, v23.4s, v15.4s\n" - "fmax v24.4s, v24.4s, v14.4s\n" - "str q19, [%[c_ptr0], #0x30]\n" - "fmax v25.4s, v25.4s, v14.4s\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "fmax v26.4s, v26.4s, v14.4s\n" - "str q20, [c_ptr1]\n" - "fmin v24.4s, v24.4s, v15.4s\n" - "fmin v25.4s, v25.4s, v15.4s\n" - "fmax v27.4s, v27.4s, v14.4s\n" - "str q21, [c_ptr1, #0x10]\n" - "fmin v26.4s, v26.4s, v15.4s\n" - "fmin v27.4s, v27.4s, v15.4s\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - "str q24, [c_ptr2]\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq temploadreg0\n" - ".unreq temploadreg1\n" - ".unreq temploadreg2\n" - ".unreq temploadreg3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "temploadreg0 .req X6\n" - "temploadreg1 .req X7\n" - "temploadreg2 .req X8\n" - "temploadreg3 .req X9\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "ldr q16, [%[biasptr]]\n" - "ldr q17, [%[biasptr], #0x10]\n" - "ldr q18, [%[biasptr], #0x20]\n" - "ldr q19, [%[biasptr], #0x30]\n" - "mov v20.16b, v16.16b\n" - "ldr q0, [%[a_ptr0]]\n" - "mov v21.16b, v17.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v22.16b, v18.16b\n" - "ldr q2, [a_ptr2]\n" - "mov v23.16b, v19.16b\n" - "ldr q3, [a_ptr3]\n" - "mov v24.16b, v16.16b\n" - "ldr q8, [%[b_ptr0]]\n" - "mov v25.16b, 
v17.16b\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "mov v26.16b, v18.16b\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "mov v27.16b, v19.16b\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "mov v28.16b, v16.16b\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "mov v29.16b, v17.16b\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "mov v30.16b, v18.16b\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "mov v31.16b, v19.16b\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ins v14.d[1], temploadreg2\n" - "add a_ptr3, a_ptr3, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q28, [c_ptr3]\n" - "ldr q29, [c_ptr3, #0x10]\n" - "ldr q30, [c_ptr3, #0x20]\n" - "ldr q31, [c_ptr3, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q3, [a_ptr3]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "ins v14.d[1], temploadreg2\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr d4, [%[a_ptr0]]\n" - "fmla v28.4s, v8.4s, v3.s[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr d5, [a_ptr1]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "ldr d6, [a_ptr2]\n" - "fmla v29.4s, v9.4s, v3.s[0]\n" - "ldr temploadreg2, [a_ptr2, #0x8]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr d7, [a_ptr3]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "ldr temploadreg3, [a_ptr3, #0x8]\n" - "fmla v30.4s, v10.4s, v3.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "ins v4.d[1], temploadreg0\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v31.4s, v11.4s, v3.s[0]\n" - "ins v5.d[1], temploadreg1\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v12.4s, v2.s[1]\n" - "ins v6.d[1], temploadreg2\n" - "fmla v28.4s, v12.4s, v3.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "ins v7.d[1], temploadreg3\n" - "fmla v25.4s, v13.4s, v2.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v29.4s, v13.4s, v3.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - 
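/*
 * Register map for this widest (4-row) case, as a reading aid: v16-v31
 * hold the 4x16 output tile (four q-registers per row); v0-v3 carry four
 * A values per row, with v4-v7 double-buffering the next four; v8-v15
 * stage two consecutive 16-float rows of the packed B panel. Each fmla
 * broadcasts one A lane (vN.s[i]) against a 16-wide strip of B. The
 * "prfm PLDL1KEEP" hints pull the upcoming A cache lines in ahead of use,
 * and the "prfm PSTL1KEEP" hints at loop exit prime the C lines for the
 * stores in the epilogue.
 */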
"fmla v26.4s, v14.4s, v2.s[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "fmla v30.4s, v14.4s, v3.s[1]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "fmla v27.4s, v15.4s, v2.s[1]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v31.4s, v15.4s, v3.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v28.4s, v8.4s, v3.s[2]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v29.4s, v9.4s, v3.s[2]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - "fmla v30.4s, v10.4s, v3.s[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - "fmla v31.4s, v11.4s, v3.s[2]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v28.4s, v12.4s, v3.s[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "fmla v29.4s, v13.4s, v3.s[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "fmla v26.4s, v14.4s, v2.s[3]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v30.4s, v14.4s, v3.s[3]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v27.4s, v15.4s, v2.s[3]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v31.4s, v15.4s, v3.s[3]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v24.4s, v8.4s, v6.s[0]\n" - "ldr d0, [%[a_ptr0], #-0x10]\n" - "fmla v28.4s, v8.4s, v7.s[0]\n" - "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v25.4s, v9.4s, v6.s[0]\n" - "ins v0.d[1], temploadreg0\n" - "fmla v29.4s, v9.4s, v7.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v26.4s, v10.4s, v6.s[0]\n" - "ldr d1, [a_ptr1, #-0x10]\n" - "fmla v30.4s, v10.4s, v7.s[0]\n" - "ldr temploadreg1, [a_ptr1, #-0x8]\n" - 
"fmla v19.4s, v11.4s, v4.s[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v27.4s, v11.4s, v6.s[0]\n" - "ins v1.d[1], temploadreg1\n" - "fmla v31.4s, v11.4s, v7.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v16.4s, v12.4s, v4.s[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "fmla v20.4s, v12.4s, v5.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "fmla v24.4s, v12.4s, v6.s[1]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v28.4s, v12.4s, v7.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v4.s[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "fmla v21.4s, v13.4s, v5.s[1]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla v25.4s, v13.4s, v6.s[1]\n" - "ldr d2, [a_ptr2, #-0x10]\n" - "fmla v29.4s, v13.4s, v7.s[1]\n" - "ldr temploadreg2, [a_ptr2, #-0x8]\n" - "fmla v18.4s, v14.4s, v4.s[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "fmla v22.4s, v14.4s, v5.s[1]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v26.4s, v14.4s, v6.s[1]\n" - "ins v2.d[1], temploadreg2\n" - "fmla v30.4s, v14.4s, v7.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v19.4s, v15.4s, v4.s[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "fmla v23.4s, v15.4s, v5.s[1]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v27.4s, v15.4s, v6.s[1]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v31.4s, v15.4s, v7.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "add a_ptr3, a_ptr3, #0x20\n" - "fmla v24.4s, v8.4s, v6.s[2]\n" - "ldr d3, [a_ptr3, #-0x10]\n" - "fmla v28.4s, v8.4s, v7.s[2]\n" - "ldr temploadreg3, [a_ptr3, #-0x8]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v25.4s, v9.4s, v6.s[2]\n" - "ins v3.d[1], temploadreg3\n" - "fmla v29.4s, v9.4s, v7.s[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v26.4s, v10.4s, v6.s[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v30.4s, v10.4s, v7.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - "fmla v27.4s, v11.4s, v6.s[2]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - "fmla v31.4s, v11.4s, v7.s[2]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - "fmla v16.4s, v12.4s, v4.s[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - "fmla v20.4s, v12.4s, v5.s[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "fmla v24.4s, v12.4s, v6.s[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "fmla v28.4s, v12.4s, v7.s[3]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v17.4s, v13.4s, v4.s[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - "fmla v21.4s, v13.4s, v5.s[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "fmla v25.4s, v13.4s, v6.s[3]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v29.4s, v13.4s, v7.s[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - "fmla v18.4s, v14.4s, v4.s[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "fmla v22.4s, v14.4s, v5.s[3]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v26.4s, v14.4s, v6.s[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "fmla v30.4s, v14.4s, v7.s[3]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v15.4s, v4.s[3]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v23.4s, v15.4s, v5.s[3]\n" - "ldr temploadreg2, 
[%[b_ptr0], #-0x18]\n" - "fmla v27.4s, v15.4s, v6.s[3]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v31.4s, v15.4s, v7.s[3]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" - "ins v14.d[1], temploadreg2\n" - "b.ne 3b\n" - "2:\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "prfm PSTL1KEEP, [c_ptr3]\n" - "ins v15.d[1], temploadreg3\n" - "cbz %[regs], 4f\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr d4, [%[a_ptr0]]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr d5, [a_ptr1]\n" - "fmla v28.4s, v8.4s, v3.s[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr d6, [a_ptr2]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr temploadreg2, [a_ptr2, #0x8]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "ldr d7, [a_ptr3]\n" - "fmla v29.4s, v9.4s, v3.s[0]\n" - "ldr temploadreg3, [a_ptr3, #0x8]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ins v4.d[1], temploadreg0\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v30.4s, v10.4s, v3.s[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "ins v5.d[1], temploadreg1\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "fmla v31.4s, v11.4s, v3.s[0]\n" - "ins v6.d[1], temploadreg2\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v12.4s, v2.s[1]\n" - "ins v7.d[1], temploadreg3\n" - "fmla v28.4s, v12.4s, v3.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v25.4s, v13.4s, v2.s[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "fmla v29.4s, v13.4s, v3.s[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "fmla v26.4s, v14.4s, v2.s[1]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v30.4s, v14.4s, v3.s[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v27.4s, v15.4s, v2.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v31.4s, v15.4s, v3.s[1]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v28.4s, v8.4s, v3.s[2]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - "fmla v29.4s, v9.4s, v3.s[2]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "ldr temploadreg3, 
[%[b_ptr0], #-0x48]\n" - "fmla v30.4s, v10.4s, v3.s[2]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v31.4s, v11.4s, v3.s[2]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "fmla v28.4s, v12.4s, v3.s[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v29.4s, v13.4s, v3.s[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v26.4s, v14.4s, v2.s[3]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v30.4s, v14.4s, v3.s[3]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v27.4s, v15.4s, v2.s[3]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v31.4s, v15.4s, v3.s[3]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "fmla v24.4s, v8.4s, v6.s[0]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v28.4s, v8.4s, v7.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "fmla v25.4s, v9.4s, v6.s[0]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v29.4s, v9.4s, v7.s[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "fmla v26.4s, v10.4s, v6.s[0]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v30.4s, v10.4s, v7.s[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "fmla v27.4s, v11.4s, v6.s[0]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v31.4s, v11.4s, v7.s[0]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v4.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v20.4s, v12.4s, v5.s[1]\n" - "fmla v24.4s, v12.4s, v6.s[1]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v28.4s, v12.4s, v7.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v4.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v21.4s, v13.4s, v5.s[1]\n" - "fmla v25.4s, v13.4s, v6.s[1]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v29.4s, v13.4s, v7.s[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v4.s[1]\n" - "fmla v22.4s, v14.4s, v5.s[1]\n" - "fmla v26.4s, v14.4s, v6.s[1]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v30.4s, v14.4s, v7.s[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v4.s[1]\n" - "fmla v23.4s, v15.4s, v5.s[1]\n" - "fmla v27.4s, v15.4s, v6.s[1]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v31.4s, v15.4s, v7.s[1]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], 
#0x80\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v24.4s, v8.4s, v6.s[2]\n" - "fmla v28.4s, v8.4s, v7.s[2]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "fmla v25.4s, v9.4s, v6.s[2]\n" - "fmla v29.4s, v9.4s, v7.s[2]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "fmla v26.4s, v10.4s, v6.s[2]\n" - "fmla v30.4s, v10.4s, v7.s[2]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "fmla v27.4s, v11.4s, v6.s[2]\n" - "fmla v31.4s, v11.4s, v7.s[2]\n" - "fmla v16.4s, v12.4s, v4.s[3]\n" - "fmla v20.4s, v12.4s, v5.s[3]\n" - "fmla v24.4s, v12.4s, v6.s[3]\n" - "fmla v28.4s, v12.4s, v7.s[3]\n" - "fmla v17.4s, v13.4s, v4.s[3]\n" - "fmla v21.4s, v13.4s, v5.s[3]\n" - "fmla v25.4s, v13.4s, v6.s[3]\n" - "fmla v29.4s, v13.4s, v7.s[3]\n" - "fmla v18.4s, v14.4s, v4.s[3]\n" - "fmla v22.4s, v14.4s, v5.s[3]\n" - "fmla v26.4s, v14.4s, v6.s[3]\n" - "fmla v30.4s, v14.4s, v7.s[3]\n" - "fmla v19.4s, v15.4s, v4.s[3]\n" - "fmla v23.4s, v15.4s, v5.s[3]\n" - "fmla v27.4s, v15.4s, v6.s[3]\n" - "fmla v31.4s, v15.4s, v7.s[3]\n" - "b 5f\n" - "4:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "fmla v28.4s, v8.4s, v3.s[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "ins v8.d[1], temploadreg0\n" - "fmla v29.4s, v9.4s, v3.s[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "ins v9.d[1], temploadreg1\n" - "fmla v30.4s, v10.4s, v3.s[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "ins v10.d[1], temploadreg2\n" - "fmla v31.4s, v11.4s, v3.s[0]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "fmla v24.4s, v12.4s, v2.s[1]\n" - "ins v11.d[1], temploadreg3\n" - "fmla v28.4s, v12.4s, v3.s[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "fmla v25.4s, v13.4s, v2.s[1]\n" - "ins v12.d[1], temploadreg0\n" - "fmla v29.4s, v13.4s, v3.s[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "fmla v26.4s, v14.4s, v2.s[1]\n" - "ins v13.d[1], temploadreg1\n" - "fmla v30.4s, v14.4s, v3.s[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "fmla v27.4s, v15.4s, v2.s[1]\n" - "ins v14.d[1], temploadreg2\n" - "fmla v31.4s, v15.4s, v3.s[1]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ins v15.d[1], temploadreg3\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "fmla v28.4s, v8.4s, v3.s[2]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "fmla v29.4s, v9.4s, v3.s[2]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "fmla v30.4s, 
v10.4s, v3.s[2]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "fmla v31.4s, v11.4s, v3.s[2]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "fmla v28.4s, v12.4s, v3.s[3]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "fmla v29.4s, v13.4s, v3.s[3]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "fmla v26.4s, v14.4s, v2.s[3]\n" - "fmla v30.4s, v14.4s, v3.s[3]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "fmla v27.4s, v15.4s, v2.s[3]\n" - "fmla v31.4s, v15.4s, v3.s[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr s1, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr s2, [a_ptr2]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr s3, [a_ptr3]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "add a_ptr3, a_ptr3, #0x4\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "fmla v28.4s, v8.4s, v3.s[0]\n" - "fmla v29.4s, v9.4s, v3.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "fmla v30.4s, v10.4s, v3.s[0]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "fmla v31.4s, v11.4s, v3.s[0]\n" - "b.ne 7b\n" - "6:\n" - "ld1r {v14.4s}, [%[minptr]]\n" - "ld1r {v15.4s}, [%[maxptr]]\n" - "fmax v16.4s, v16.4s, v14.4s\n" - "fmax v17.4s, v17.4s, v14.4s\n" - "fmax v18.4s, v18.4s, v14.4s\n" - "fmax v19.4s, v19.4s, v14.4s\n" - "fmin v16.4s, v16.4s, v15.4s\n" - "fmin v17.4s, v17.4s, v15.4s\n" - "fmin v18.4s, v18.4s, v15.4s\n" - "fmin v19.4s, v19.4s, v15.4s\n" - "str q16, [%[c_ptr0]]\n" - "fmax v20.4s, v20.4s, v14.4s\n" - "fmax v21.4s, v21.4s, v14.4s\n" - "fmax v22.4s, v22.4s, v14.4s\n" - "str q17, [%[c_ptr0], #0x10]\n" - "fmax v23.4s, v23.4s, v14.4s\n" - "fmin v20.4s, v20.4s, v15.4s\n" - "fmin v21.4s, v21.4s, v15.4s\n" - "str q18, [%[c_ptr0], #0x20]\n" - "fmin v22.4s, v22.4s, v15.4s\n" - "fmin v23.4s, v23.4s, v15.4s\n" - "fmax v24.4s, v24.4s, v14.4s\n" - "str q19, [%[c_ptr0], #0x30]\n" - "fmax v25.4s, v25.4s, v14.4s\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "fmax v26.4s, v26.4s, v14.4s\n" - "str q20, [c_ptr1]\n" - "fmin v24.4s, v24.4s, v15.4s\n" - "fmin v25.4s, v25.4s, v15.4s\n" - "fmax v27.4s, v27.4s, v14.4s\n" - "str q21, [c_ptr1, #0x10]\n" - "fmin v26.4s, v26.4s, v15.4s\n" - "fmax v28.4s, v28.4s, v14.4s\n" - "fmax v29.4s, v29.4s, v14.4s\n" - "str q22, [c_ptr1, #0x20]\n" - "fmin v27.4s, v27.4s, v15.4s\n" - "fmax v30.4s, v30.4s, v14.4s\n" - "fmin v28.4s, v28.4s, v15.4s\n" - "str q23, [c_ptr1, #0x30]\n" - "fmin v29.4s, v29.4s, v15.4s\n" - "fmax v31.4s, v31.4s, v14.4s\n" - "fmin v30.4s, v30.4s, v15.4s\n" - "str q24, [c_ptr2]\n" - "fmin v31.4s, v31.4s, v15.4s\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - "str q28, [c_ptr3]\n" - "str q29, [c_ptr3, #0x10]\n" - "str q30, [c_ptr3, #0x20]\n" - "str q31, [c_ptr3, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq 
c_ptr3\n" - ".unreq temploadreg0\n" - ".unreq temploadreg1\n" - ".unreq temploadreg2\n" - ".unreq temploadreg3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory" - ); - break; - } - if (use_result_buffer) { - for(int cy=0; cy<std::min(rows_to_compute, M-y); cy++) { - for(unsigned int cx0=0; cx0<width; cx0++) { - c_ptr_real[cy * ldc + cx0] = result_buffer[cy * 16 + cx0]; - } - } - } - } - } -} - -} // namespace arm_gemm - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp deleted file mode 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp +++ /dev/null -#ifdef __aarch64__ - -#include <algorithm> - -#include "arm_gemm.hpp" - -#include "../../asmlib.hpp" - -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_hybrid_fp32_mla_16x4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) { - const int K_stride = K; - const long loops_count = ((K + 4) / 8) - 1; - K -= loops_count * 8; - const long regs_count = (K / 4) - 1; - K -= (regs_count + 1) * 4; - const long blocks_count = K / 1; - float nullbias[16]; - if (!accumulate && !bias) { - memset(nullbias, 0, (16 * sizeof(float))); - } - float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); - float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); - const float * const minptr = &minval; - const float * const maxptr = &maxval; - - switch(act.type) - { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - maxval = static_cast<float>(act.param1); - /* fall through */ - case Activation::Type::ReLU: - minval = 0.0f; - break; - } - - int rows_to_compute; - - for (int y=0; y<M; y+=rows_to_compute) { - const float * const a_ptr0_base = A + (y * lda); - const unsigned long ldab = lda * sizeof(float); - - float *c_ptr_real = C + (y * ldc); - - rows_to_compute = M-y; - if (rows_to_compute > 4) { - if (rows_to_compute % 4) { - rows_to_compute = 4 - 1; - } else { - rows_to_compute = 4; - } - } - - for (int x0=0; x0<N; x0+=16ul) { - const long width = std::min((long)N-x0, 16l); - const float *biasptr = bias ? bias+x0 : nullbias; - long loops = loops_count; - long regs = regs_count; - long blocks = blocks_count; - const float *a_ptr0 = a_ptr0_base; - const float *b_ptr0 = B + (K_stride * x0); - const bool use_result_buffer = (width < 16); - float result_buffer[64]; - const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float); - float *c_ptr0 = use_result_buffer ? result_buffer : c_ptr_real; - if (use_result_buffer && accumulate) { - for(int cy=0; cy<std::min(rows_to_compute, M-y); cy++) { - for(unsigned int cx0=0; cx0<width; cx0++) { - result_buffer[cy * 16 + cx0] = c_ptr_real[cy * ldc + cx0]; - } - } - } - switch(rows_to_compute) { - case 1: - __asm __volatile ( - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "ldr q16, [%[biasptr]]\n" - "ldr q17, [%[biasptr], #0x10]\n" - "ldr q18, [%[biasptr], #0x20]\n" - "ldr q19, [%[biasptr], #0x30]\n" - "mov v20.16b, v16.16b\n" - "ldr q0, [%[a_ptr0]]\n" - "mov v21.16b, v17.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v22.16b, v18.16b\n" - "ldr q8, [%[b_ptr0]]\n" - "mov v23.16b, v19.16b\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - 
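/*
 * Prologue pattern (reading aid): "cbnz %[accumulate], 1f" picks the
 * accumulator seed. The fall-through path loads the 16-float bias panel
 * once into v16-v19 and replicates it per row (mov v20.16b, v16.16b, ...),
 * while the "1:" path reloads the existing C tile so this K-slice adds
 * into earlier results. Roughly:
 * acc[row][col] = accumulate ? C[row][col] : bias[col].
 */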
"ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q4, [%[a_ptr0]]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr q5, [a_ptr1]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v4.s[1]\n" - "fmla v20.4s, v12.4s, v5.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v4.s[1]\n" - "fmla v21.4s, v13.4s, v5.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v4.s[1]\n" - "fmla v22.4s, v14.4s, v5.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v4.s[1]\n" - "fmla v23.4s, v15.4s, v5.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, 
v5.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v12.4s, v4.s[3]\n" - "fmla v20.4s, v12.4s, v5.s[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v13.4s, v4.s[3]\n" - "fmla v21.4s, v13.4s, v5.s[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v14.4s, v4.s[3]\n" - "fmla v22.4s, v14.4s, v5.s[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v15.4s, v4.s[3]\n" - "fmla v23.4s, v15.4s, v5.s[3]\n" - "b.ne 3b\n" - "2:\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "cbz %[regs], 4f\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr q4, [%[a_ptr0]]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q5, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v4.s[1]\n" - "fmla v20.4s, v12.4s, v5.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v4.s[1]\n" - "fmla v21.4s, v13.4s, v5.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v4.s[1]\n" - "fmla v22.4s, v14.4s, v5.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v4.s[1]\n" - "fmla v23.4s, v15.4s, v5.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "fmla v16.4s, v12.4s, v4.s[3]\n" 
- "fmla v20.4s, v12.4s, v5.s[3]\n" - "fmla v17.4s, v13.4s, v4.s[3]\n" - "fmla v21.4s, v13.4s, v5.s[3]\n" - "fmla v18.4s, v14.4s, v4.s[3]\n" - "fmla v22.4s, v14.4s, v5.s[3]\n" - "fmla v19.4s, v15.4s, v4.s[3]\n" - "fmla v23.4s, v15.4s, v5.s[3]\n" - "b 5f\n" - "4:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr s1, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "b.ne 7b\n" - "6:\n" - "ld1r {v14.4s}, [%[minptr]]\n" - "ld1r {v15.4s}, [%[maxptr]]\n" - "fmax v16.4s, v16.4s, v14.4s\n" - "fmax v17.4s, v17.4s, v14.4s\n" - "fmax v18.4s, v18.4s, v14.4s\n" - "fmax v19.4s, v19.4s, v14.4s\n" - "fmin v16.4s, v16.4s, v15.4s\n" - "fmin v17.4s, v17.4s, v15.4s\n" - "fmin v18.4s, v18.4s, v15.4s\n" - "fmin v19.4s, v19.4s, v15.4s\n" - "str q16, [%[c_ptr0]]\n" - "fmax v20.4s, v20.4s, v14.4s\n" - "fmax v21.4s, v21.4s, v14.4s\n" - "fmax v22.4s, v22.4s, v14.4s\n" - "str q17, [%[c_ptr0], #0x10]\n" - "fmax v23.4s, v23.4s, v14.4s\n" - "fmin v20.4s, v20.4s, v15.4s\n" - "fmin v21.4s, v21.4s, v15.4s\n" - "str q18, [%[c_ptr0], #0x20]\n" - "fmin v22.4s, v22.4s, v15.4s\n" - "fmin v23.4s, v23.4s, v15.4s\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] 
"r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "ldr q16, [%[biasptr]]\n" - "ldr q17, [%[biasptr], #0x10]\n" - "ldr q18, [%[biasptr], #0x20]\n" - "ldr q19, [%[biasptr], #0x30]\n" - "mov v20.16b, v16.16b\n" - "ldr q0, [%[a_ptr0]]\n" - "mov v21.16b, v17.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v22.16b, v18.16b\n" - "ldr q2, [a_ptr2]\n" - "mov v23.16b, v19.16b\n" - "ldr q8, [%[b_ptr0]]\n" - "mov v24.16b, v16.16b\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "mov v25.16b, v17.16b\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "mov v26.16b, v18.16b\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "mov v27.16b, v19.16b\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q4, [%[a_ptr0]]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr q5, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr q6, [a_ptr2]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v24.4s, v12.4s, v2.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "fmla v25.4s, v13.4s, v2.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "fmla v26.4s, v14.4s, v2.s[1]\n" - "ldr q14, 
[%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "fmla v27.4s, v15.4s, v2.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "fmla v26.4s, v14.4s, v2.s[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "fmla v27.4s, v15.4s, v2.s[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "fmla v24.4s, v8.4s, v6.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "fmla v25.4s, v9.4s, v6.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "fmla v26.4s, v10.4s, v6.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "fmla v27.4s, v11.4s, v6.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v4.s[1]\n" - "fmla v20.4s, v12.4s, v5.s[1]\n" - "fmla v24.4s, v12.4s, v6.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v4.s[1]\n" - "fmla v21.4s, v13.4s, v5.s[1]\n" - "fmla v25.4s, v13.4s, v6.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v4.s[1]\n" - "fmla v22.4s, v14.4s, v5.s[1]\n" - "fmla v26.4s, v14.4s, v6.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v4.s[1]\n" - "fmla v23.4s, v15.4s, v5.s[1]\n" - "fmla v27.4s, v15.4s, v6.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "fmla v24.4s, v8.4s, v6.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "fmla v25.4s, v9.4s, v6.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "fmla v26.4s, v10.4s, v6.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "fmla v27.4s, v11.4s, v6.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v12.4s, v4.s[3]\n" - "fmla v20.4s, v12.4s, v5.s[3]\n" - "fmla v24.4s, v12.4s, v6.s[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v13.4s, v4.s[3]\n" - "fmla v21.4s, v13.4s, v5.s[3]\n" - "fmla v25.4s, v13.4s, v6.s[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v14.4s, v4.s[3]\n" - "fmla v22.4s, v14.4s, v5.s[3]\n" - "fmla v26.4s, v14.4s, v6.s[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v15.4s, v4.s[3]\n" - 
"fmla v23.4s, v15.4s, v5.s[3]\n" - "fmla v27.4s, v15.4s, v6.s[3]\n" - "b.ne 3b\n" - "2:\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "cbz %[regs], 4f\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr q4, [%[a_ptr0]]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q5, [a_ptr1]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr q6, [a_ptr2]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "fmla v24.4s, v12.4s, v2.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "fmla v25.4s, v13.4s, v2.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "fmla v26.4s, v14.4s, v2.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "fmla v27.4s, v15.4s, v2.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "fmla v26.4s, v14.4s, v2.s[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "fmla v27.4s, v15.4s, v2.s[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "fmla v24.4s, v8.4s, v6.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "fmla v25.4s, v9.4s, v6.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "fmla v26.4s, v10.4s, v6.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "fmla v27.4s, v11.4s, v6.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v4.s[1]\n" - "fmla v20.4s, v12.4s, v5.s[1]\n" - "fmla v24.4s, v12.4s, v6.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v4.s[1]\n" - "fmla v21.4s, v13.4s, v5.s[1]\n" - "fmla v25.4s, v13.4s, v6.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v4.s[1]\n" - "fmla v22.4s, v14.4s, v5.s[1]\n" 
- "fmla v26.4s, v14.4s, v6.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v4.s[1]\n" - "fmla v23.4s, v15.4s, v5.s[1]\n" - "fmla v27.4s, v15.4s, v6.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "fmla v24.4s, v8.4s, v6.s[2]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "fmla v25.4s, v9.4s, v6.s[2]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "fmla v26.4s, v10.4s, v6.s[2]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "fmla v27.4s, v11.4s, v6.s[2]\n" - "fmla v16.4s, v12.4s, v4.s[3]\n" - "fmla v20.4s, v12.4s, v5.s[3]\n" - "fmla v24.4s, v12.4s, v6.s[3]\n" - "fmla v17.4s, v13.4s, v4.s[3]\n" - "fmla v21.4s, v13.4s, v5.s[3]\n" - "fmla v25.4s, v13.4s, v6.s[3]\n" - "fmla v18.4s, v14.4s, v4.s[3]\n" - "fmla v22.4s, v14.4s, v5.s[3]\n" - "fmla v26.4s, v14.4s, v6.s[3]\n" - "fmla v19.4s, v15.4s, v4.s[3]\n" - "fmla v23.4s, v15.4s, v5.s[3]\n" - "fmla v27.4s, v15.4s, v6.s[3]\n" - "b 5f\n" - "4:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "fmla v24.4s, v12.4s, v2.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "fmla v25.4s, v13.4s, v2.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "fmla v26.4s, v14.4s, v2.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "fmla v27.4s, v15.4s, v2.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "fmla v26.4s, v14.4s, v2.s[3]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "fmla v27.4s, v15.4s, v2.s[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr s1, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr s2, 
[a_ptr2]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "b.ne 7b\n" - "6:\n" - "ld1r {v14.4s}, [%[minptr]]\n" - "ld1r {v15.4s}, [%[maxptr]]\n" - "fmax v16.4s, v16.4s, v14.4s\n" - "fmax v17.4s, v17.4s, v14.4s\n" - "fmax v18.4s, v18.4s, v14.4s\n" - "fmax v19.4s, v19.4s, v14.4s\n" - "fmin v16.4s, v16.4s, v15.4s\n" - "fmin v17.4s, v17.4s, v15.4s\n" - "fmin v18.4s, v18.4s, v15.4s\n" - "fmin v19.4s, v19.4s, v15.4s\n" - "str q16, [%[c_ptr0]]\n" - "fmax v20.4s, v20.4s, v14.4s\n" - "fmax v21.4s, v21.4s, v14.4s\n" - "fmax v22.4s, v22.4s, v14.4s\n" - "str q17, [%[c_ptr0], #0x10]\n" - "fmax v23.4s, v23.4s, v14.4s\n" - "fmin v20.4s, v20.4s, v15.4s\n" - "fmin v21.4s, v21.4s, v15.4s\n" - "str q18, [%[c_ptr0], #0x20]\n" - "fmin v22.4s, v22.4s, v15.4s\n" - "fmin v23.4s, v23.4s, v15.4s\n" - "fmax v24.4s, v24.4s, v14.4s\n" - "str q19, [%[c_ptr0], #0x30]\n" - "fmax v25.4s, v25.4s, v14.4s\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "fmax v26.4s, v26.4s, v14.4s\n" - "str q20, [c_ptr1]\n" - "fmin v24.4s, v24.4s, v15.4s\n" - "fmin v25.4s, v25.4s, v15.4s\n" - "fmax v27.4s, v27.4s, v14.4s\n" - "str q21, [c_ptr1, #0x10]\n" - "fmin v26.4s, v26.4s, v15.4s\n" - "fmin v27.4s, v27.4s, v15.4s\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - "str q24, [c_ptr2]\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "ldr q16, [%[biasptr]]\n" - "ldr q17, [%[biasptr], #0x10]\n" - "ldr q18, [%[biasptr], #0x20]\n" - "ldr q19, [%[biasptr], #0x30]\n" - "mov v20.16b, v16.16b\n" - "ldr q0, [%[a_ptr0]]\n" - "mov v21.16b, v17.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v22.16b, v18.16b\n" - "ldr q2, [a_ptr2]\n" - "mov v23.16b, v19.16b\n" - "ldr q3, [a_ptr3]\n" - "mov v24.16b, v16.16b\n" - "ldr q8, [%[b_ptr0]]\n" - "mov v25.16b, v17.16b\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "mov v26.16b, v18.16b\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "mov v27.16b, v19.16b\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "mov v28.16b, v16.16b\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "mov v29.16b, v17.16b\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "mov v30.16b, v18.16b\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "mov v31.16b, v19.16b\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add 
a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add a_ptr3, a_ptr3, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q28, [c_ptr3]\n" - "ldr q29, [c_ptr3, #0x10]\n" - "ldr q30, [c_ptr3, #0x20]\n" - "ldr q31, [c_ptr3, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q3, [a_ptr3]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q4, [%[a_ptr0]]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr q5, [a_ptr1]\n" - "fmla v28.4s, v8.4s, v3.s[0]\n" - "ldr q6, [a_ptr2]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr q7, [a_ptr3]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v29.4s, v9.4s, v3.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v30.4s, v10.4s, v3.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "add a_ptr3, a_ptr3, #0x20\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v31.4s, v11.4s, v3.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" - "fmla v24.4s, v12.4s, v2.s[1]\n" - "fmla v28.4s, v12.4s, v3.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "fmla v25.4s, v13.4s, v2.s[1]\n" - "fmla v29.4s, v13.4s, v3.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "fmla v26.4s, v14.4s, v2.s[1]\n" - "fmla v30.4s, v14.4s, v3.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "fmla v27.4s, v15.4s, v2.s[1]\n" - "fmla v31.4s, v15.4s, v3.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "fmla v28.4s, v8.4s, v3.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "fmla v29.4s, v9.4s, v3.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "fmla v30.4s, v10.4s, v3.s[2]\n" - "ldr q10, [%[b_ptr0], 
#-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "fmla v31.4s, v11.4s, v3.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "fmla v28.4s, v12.4s, v3.s[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "fmla v29.4s, v13.4s, v3.s[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "fmla v26.4s, v14.4s, v2.s[3]\n" - "fmla v30.4s, v14.4s, v3.s[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "fmla v27.4s, v15.4s, v2.s[3]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - "fmla v31.4s, v15.4s, v3.s[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "ldr q3, [a_ptr3, #-0x10]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "fmla v24.4s, v8.4s, v6.s[0]\n" - "fmla v28.4s, v8.4s, v7.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "fmla v25.4s, v9.4s, v6.s[0]\n" - "fmla v29.4s, v9.4s, v7.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "fmla v26.4s, v10.4s, v6.s[0]\n" - "fmla v30.4s, v10.4s, v7.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "fmla v27.4s, v11.4s, v6.s[0]\n" - "fmla v31.4s, v11.4s, v7.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v4.s[1]\n" - "fmla v20.4s, v12.4s, v5.s[1]\n" - "fmla v24.4s, v12.4s, v6.s[1]\n" - "fmla v28.4s, v12.4s, v7.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v4.s[1]\n" - "fmla v21.4s, v13.4s, v5.s[1]\n" - "fmla v25.4s, v13.4s, v6.s[1]\n" - "fmla v29.4s, v13.4s, v7.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v4.s[1]\n" - "fmla v22.4s, v14.4s, v5.s[1]\n" - "fmla v26.4s, v14.4s, v6.s[1]\n" - "fmla v30.4s, v14.4s, v7.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v4.s[1]\n" - "fmla v23.4s, v15.4s, v5.s[1]\n" - "fmla v27.4s, v15.4s, v6.s[1]\n" - "fmla v31.4s, v15.4s, v7.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "fmla v24.4s, v8.4s, v6.s[2]\n" - "fmla v28.4s, v8.4s, v7.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "fmla v25.4s, v9.4s, v6.s[2]\n" - "fmla v29.4s, v9.4s, v7.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "fmla v26.4s, v10.4s, v6.s[2]\n" - "fmla v30.4s, v10.4s, v7.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "fmla v27.4s, v11.4s, v6.s[2]\n" - "fmla v31.4s, v11.4s, v7.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v12.4s, v4.s[3]\n" - "fmla v20.4s, v12.4s, v5.s[3]\n" - "fmla v24.4s, v12.4s, v6.s[3]\n" - "fmla v28.4s, v12.4s, v7.s[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v13.4s, v4.s[3]\n" - "fmla v21.4s, v13.4s, v5.s[3]\n" - "fmla v25.4s, v13.4s, v6.s[3]\n" - "fmla v29.4s, v13.4s, v7.s[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v14.4s, v4.s[3]\n" - "fmla v22.4s, v14.4s, v5.s[3]\n" - "fmla v26.4s, 
v14.4s, v6.s[3]\n" - "fmla v30.4s, v14.4s, v7.s[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v15.4s, v4.s[3]\n" - "fmla v23.4s, v15.4s, v5.s[3]\n" - "fmla v27.4s, v15.4s, v6.s[3]\n" - "fmla v31.4s, v15.4s, v7.s[3]\n" - "b.ne 3b\n" - "2:\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "prfm PSTL1KEEP, [c_ptr3]\n" - "cbz %[regs], 4f\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr q4, [%[a_ptr0]]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q5, [a_ptr1]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr q6, [a_ptr2]\n" - "fmla v28.4s, v8.4s, v3.s[0]\n" - "ldr q7, [a_ptr3]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v29.4s, v9.4s, v3.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "fmla v30.4s, v10.4s, v3.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "fmla v31.4s, v11.4s, v3.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "fmla v24.4s, v12.4s, v2.s[1]\n" - "fmla v28.4s, v12.4s, v3.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "fmla v25.4s, v13.4s, v2.s[1]\n" - "fmla v29.4s, v13.4s, v3.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "fmla v22.4s, v14.4s, v1.s[1]\n" - "fmla v26.4s, v14.4s, v2.s[1]\n" - "fmla v30.4s, v14.4s, v3.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "fmla v27.4s, v15.4s, v2.s[1]\n" - "fmla v31.4s, v15.4s, v3.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "fmla v28.4s, v8.4s, v3.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "fmla v29.4s, v9.4s, v3.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "fmla v30.4s, v10.4s, v3.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "fmla v31.4s, v11.4s, v3.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "fmla v28.4s, v12.4s, v3.s[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "fmla v29.4s, v13.4s, v3.s[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "fmla v26.4s, v14.4s, v2.s[3]\n" - "fmla v30.4s, v14.4s, v3.s[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "fmla v27.4s, v15.4s, v2.s[3]\n" - "fmla v31.4s, v15.4s, v3.s[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "fmla v24.4s, 
v8.4s, v6.s[0]\n" - "fmla v28.4s, v8.4s, v7.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "fmla v25.4s, v9.4s, v6.s[0]\n" - "fmla v29.4s, v9.4s, v7.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "fmla v26.4s, v10.4s, v6.s[0]\n" - "fmla v30.4s, v10.4s, v7.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "fmla v27.4s, v11.4s, v6.s[0]\n" - "fmla v31.4s, v11.4s, v7.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v4.s[1]\n" - "fmla v20.4s, v12.4s, v5.s[1]\n" - "fmla v24.4s, v12.4s, v6.s[1]\n" - "fmla v28.4s, v12.4s, v7.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v4.s[1]\n" - "fmla v21.4s, v13.4s, v5.s[1]\n" - "fmla v25.4s, v13.4s, v6.s[1]\n" - "fmla v29.4s, v13.4s, v7.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v4.s[1]\n" - "fmla v22.4s, v14.4s, v5.s[1]\n" - "fmla v26.4s, v14.4s, v6.s[1]\n" - "fmla v30.4s, v14.4s, v7.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v4.s[1]\n" - "fmla v23.4s, v15.4s, v5.s[1]\n" - "fmla v27.4s, v15.4s, v6.s[1]\n" - "fmla v31.4s, v15.4s, v7.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "fmla v24.4s, v8.4s, v6.s[2]\n" - "fmla v28.4s, v8.4s, v7.s[2]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "fmla v25.4s, v9.4s, v6.s[2]\n" - "fmla v29.4s, v9.4s, v7.s[2]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "fmla v26.4s, v10.4s, v6.s[2]\n" - "fmla v30.4s, v10.4s, v7.s[2]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "fmla v27.4s, v11.4s, v6.s[2]\n" - "fmla v31.4s, v11.4s, v7.s[2]\n" - "fmla v16.4s, v12.4s, v4.s[3]\n" - "fmla v20.4s, v12.4s, v5.s[3]\n" - "fmla v24.4s, v12.4s, v6.s[3]\n" - "fmla v28.4s, v12.4s, v7.s[3]\n" - "fmla v17.4s, v13.4s, v4.s[3]\n" - "fmla v21.4s, v13.4s, v5.s[3]\n" - "fmla v25.4s, v13.4s, v6.s[3]\n" - "fmla v29.4s, v13.4s, v7.s[3]\n" - "fmla v18.4s, v14.4s, v4.s[3]\n" - "fmla v22.4s, v14.4s, v5.s[3]\n" - "fmla v26.4s, v14.4s, v6.s[3]\n" - "fmla v30.4s, v14.4s, v7.s[3]\n" - "fmla v19.4s, v15.4s, v4.s[3]\n" - "fmla v23.4s, v15.4s, v5.s[3]\n" - "fmla v27.4s, v15.4s, v6.s[3]\n" - "fmla v31.4s, v15.4s, v7.s[3]\n" - "b 5f\n" - "4:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "fmla v28.4s, v8.4s, v3.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "fmla v29.4s, v9.4s, v3.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "fmla v30.4s, v10.4s, v3.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "fmla v31.4s, v11.4s, v3.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v12.4s, v0.s[1]\n" - "fmla v20.4s, v12.4s, v1.s[1]\n" - "fmla v24.4s, v12.4s, v2.s[1]\n" - "fmla v28.4s, v12.4s, v3.s[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v13.4s, v0.s[1]\n" - "fmla v21.4s, v13.4s, v1.s[1]\n" - "fmla v25.4s, v13.4s, v2.s[1]\n" - "fmla v29.4s, v13.4s, v3.s[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v14.4s, v0.s[1]\n" - "fmla v22.4s, v14.4s, 
v1.s[1]\n" - "fmla v26.4s, v14.4s, v2.s[1]\n" - "fmla v30.4s, v14.4s, v3.s[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v15.4s, v0.s[1]\n" - "fmla v23.4s, v15.4s, v1.s[1]\n" - "fmla v27.4s, v15.4s, v2.s[1]\n" - "fmla v31.4s, v15.4s, v3.s[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "fmla v28.4s, v8.4s, v3.s[2]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "fmla v29.4s, v9.4s, v3.s[2]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "fmla v30.4s, v10.4s, v3.s[2]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "fmla v31.4s, v11.4s, v3.s[2]\n" - "fmla v16.4s, v12.4s, v0.s[3]\n" - "fmla v20.4s, v12.4s, v1.s[3]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "fmla v28.4s, v12.4s, v3.s[3]\n" - "fmla v17.4s, v13.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v1.s[3]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "fmla v29.4s, v13.4s, v3.s[3]\n" - "fmla v18.4s, v14.4s, v0.s[3]\n" - "fmla v22.4s, v14.4s, v1.s[3]\n" - "fmla v26.4s, v14.4s, v2.s[3]\n" - "fmla v30.4s, v14.4s, v3.s[3]\n" - "fmla v19.4s, v15.4s, v0.s[3]\n" - "fmla v23.4s, v15.4s, v1.s[3]\n" - "fmla v27.4s, v15.4s, v2.s[3]\n" - "fmla v31.4s, v15.4s, v3.s[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr s1, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr s2, [a_ptr2]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr s3, [a_ptr3]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "add a_ptr3, a_ptr3, #0x4\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "fmla v28.4s, v8.4s, v3.s[0]\n" - "fmla v29.4s, v9.4s, v3.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "fmla v30.4s, v10.4s, v3.s[0]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "fmla v31.4s, v11.4s, v3.s[0]\n" - "b.ne 7b\n" - "6:\n" - "ld1r {v14.4s}, [%[minptr]]\n" - "ld1r {v15.4s}, [%[maxptr]]\n" - "fmax v16.4s, v16.4s, v14.4s\n" - "fmax v17.4s, v17.4s, v14.4s\n" - "fmax v18.4s, v18.4s, v14.4s\n" - "fmax v19.4s, v19.4s, v14.4s\n" - "fmin v16.4s, v16.4s, v15.4s\n" - "fmin v17.4s, v17.4s, v15.4s\n" - "fmin v18.4s, v18.4s, v15.4s\n" - "fmin v19.4s, v19.4s, v15.4s\n" - "str q16, [%[c_ptr0]]\n" - "fmax v20.4s, v20.4s, v14.4s\n" - "fmax v21.4s, v21.4s, v14.4s\n" - "fmax v22.4s, v22.4s, v14.4s\n" - "str q17, [%[c_ptr0], #0x10]\n" - "fmax v23.4s, v23.4s, v14.4s\n" - "fmin v20.4s, v20.4s, v15.4s\n" - "fmin v21.4s, v21.4s, v15.4s\n" - "str q18, [%[c_ptr0], #0x20]\n" - "fmin v22.4s, v22.4s, v15.4s\n" - "fmin v23.4s, v23.4s, v15.4s\n" - "fmax v24.4s, v24.4s, v14.4s\n" - "str q19, [%[c_ptr0], #0x30]\n" - "fmax v25.4s, v25.4s, v14.4s\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "fmax v26.4s, v26.4s, v14.4s\n" - "str q20, [c_ptr1]\n" - "fmin v24.4s, v24.4s, v15.4s\n" - "fmin v25.4s, v25.4s, v15.4s\n" - "fmax v27.4s, v27.4s, v14.4s\n" - "str q21, [c_ptr1, #0x10]\n" - "fmin v26.4s, v26.4s, v15.4s\n" 
- "fmax v28.4s, v28.4s, v14.4s\n" - "fmax v29.4s, v29.4s, v14.4s\n" - "str q22, [c_ptr1, #0x20]\n" - "fmin v27.4s, v27.4s, v15.4s\n" - "fmax v30.4s, v30.4s, v14.4s\n" - "fmin v28.4s, v28.4s, v15.4s\n" - "str q23, [c_ptr1, #0x30]\n" - "fmin v29.4s, v29.4s, v15.4s\n" - "fmax v31.4s, v31.4s, v14.4s\n" - "fmin v30.4s, v30.4s, v15.4s\n" - "str q24, [c_ptr2]\n" - "fmin v31.4s, v31.4s, v15.4s\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - "str q28, [c_ptr3]\n" - "str q29, [c_ptr3, #0x10]\n" - "str q30, [c_ptr3, #0x20]\n" - "str q31, [c_ptr3, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - } - if (use_result_buffer) { - for(int cy=0; cy - -#include "arm_gemm.hpp" - -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_hybrid_fp32_mla_16x4_x1(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) { - const int K_stride = K; - const long loops_count = ((K + 4) / 8) - 1; - K -= loops_count * 8; - const long regs_count = (K / 4) - 1; - K -= (regs_count + 1) * 4; - const long blocks_count = K / 1; - float nullbias[16]; - if (!accumulate && !bias) { - memset(nullbias, 0, (16 * sizeof(float))); - } - float minval = - static_cast(std::numeric_limits::infinity()); - float maxval = static_cast(std::numeric_limits::infinity()); - const float * const minptr = &minval; - const float * const maxptr = &maxval; - - switch(act.type) - { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - maxval = static_cast(act.param1); - /* fall through */ - case Activation::Type::ReLU: - minval = 0.0f; - break; - } - - int rows_to_compute; - - for (int y=0; y 4) { - if (rows_to_compute % 4) { - rows_to_compute = 4 - 1; - } else { - rows_to_compute = 4; - } - } - - for (int x0=0; x0(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "ldr q16, [%[biasptr]]\n" - "ldr q17, [%[biasptr], #0x10]\n" - "ldr q18, [%[biasptr], #0x20]\n" - "ldr q19, [%[biasptr], #0x30]\n" - "mov v20.16b, v16.16b\n" - "ldr q0, [%[a_ptr0]]\n" - "mov v21.16b, v17.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v22.16b, v18.16b\n" - "ldr q8, [%[b_ptr0]]\n" - "mov v23.16b, v19.16b\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add a_ptr1, a_ptr1, 
#0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr q11, [%[b_ptr0], #-0x10]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q4, [%[a_ptr0]]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr q5, [a_ptr1]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v0.s[1]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v20.4s, v8.4s, v1.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v0.s[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v21.4s, v9.4s, v1.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v0.s[1]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v22.4s, v10.4s, v1.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v0.s[1]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v23.4s, v11.4s, v1.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v0.s[3]\n" - "fmla v20.4s, v8.4s, v1.s[3]\n" - "ldr q8, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v9.4s, v0.s[3]\n" - "fmla v21.4s, v9.4s, v1.s[3]\n" - "ldr q9, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v10.4s, v0.s[3]\n" - "fmla v22.4s, v10.4s, v1.s[3]\n" - "ldr q10, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v11.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v23.4s, v11.4s, v1.s[3]\n" - "ldr q11, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v4.s[1]\n" - "fmla v20.4s, v8.4s, v5.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v4.s[1]\n" - "fmla v21.4s, v9.4s, v5.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v4.s[1]\n" - "fmla v22.4s, v10.4s, v5.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v4.s[1]\n" - "fmla v23.4s, v11.4s, v5.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - 
"ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v4.s[3]\n" - "fmla v20.4s, v8.4s, v5.s[3]\n" - "ldr q8, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v9.4s, v4.s[3]\n" - "fmla v21.4s, v9.4s, v5.s[3]\n" - "ldr q9, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v10.4s, v4.s[3]\n" - "fmla v22.4s, v10.4s, v5.s[3]\n" - "ldr q10, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v11.4s, v4.s[3]\n" - "fmla v23.4s, v11.4s, v5.s[3]\n" - "b.ne 3b\n" - "2:\n" - "ldr q11, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "cbz %[regs], 4f\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr q4, [%[a_ptr0]]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q5, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v0.s[1]\n" - "fmla v20.4s, v8.4s, v1.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v0.s[1]\n" - "fmla v21.4s, v9.4s, v1.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v0.s[1]\n" - "fmla v22.4s, v10.4s, v1.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v0.s[1]\n" - "fmla v23.4s, v11.4s, v1.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v0.s[3]\n" - "fmla v20.4s, v8.4s, v1.s[3]\n" - "ldr q8, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v9.4s, v0.s[3]\n" - "fmla v21.4s, v9.4s, v1.s[3]\n" - "ldr q9, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v10.4s, v0.s[3]\n" - "fmla v22.4s, v10.4s, v1.s[3]\n" - "ldr q10, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v11.4s, v0.s[3]\n" - "fmla v23.4s, v11.4s, v1.s[3]\n" - "ldr q11, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v4.s[1]\n" - "fmla v20.4s, v8.4s, v5.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v4.s[1]\n" - "fmla v21.4s, v9.4s, v5.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v4.s[1]\n" - "fmla v22.4s, v10.4s, v5.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v4.s[1]\n" - "fmla v23.4s, v11.4s, v5.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - 
"fmla v20.4s, v8.4s, v5.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v4.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #-0x40\n" - "fmla v20.4s, v8.4s, v5.s[3]\n" - "fmla v17.4s, v9.4s, v4.s[3]\n" - "fmla v21.4s, v9.4s, v5.s[3]\n" - "fmla v18.4s, v10.4s, v4.s[3]\n" - "fmla v22.4s, v10.4s, v5.s[3]\n" - "fmla v19.4s, v11.4s, v4.s[3]\n" - "fmla v23.4s, v11.4s, v5.s[3]\n" - "b 5f\n" - "4:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v0.s[1]\n" - "fmla v20.4s, v8.4s, v1.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v0.s[1]\n" - "fmla v21.4s, v9.4s, v1.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v0.s[1]\n" - "fmla v22.4s, v10.4s, v1.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v0.s[1]\n" - "fmla v23.4s, v11.4s, v1.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v0.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #-0x40\n" - "fmla v20.4s, v8.4s, v1.s[3]\n" - "fmla v17.4s, v9.4s, v0.s[3]\n" - "fmla v21.4s, v9.4s, v1.s[3]\n" - "fmla v18.4s, v10.4s, v0.s[3]\n" - "fmla v22.4s, v10.4s, v1.s[3]\n" - "fmla v19.4s, v11.4s, v0.s[3]\n" - "fmla v23.4s, v11.4s, v1.s[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr s1, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "b.ne 7b\n" - "6:\n" - "ld1r {v14.4s}, [%[minptr]]\n" - "ld1r {v15.4s}, [%[maxptr]]\n" - "fmax v16.4s, v16.4s, v14.4s\n" - "fmax v17.4s, v17.4s, v14.4s\n" - "fmax v18.4s, v18.4s, v14.4s\n" - "fmax v19.4s, v19.4s, v14.4s\n" - "fmin v16.4s, v16.4s, v15.4s\n" - "fmin v17.4s, v17.4s, v15.4s\n" - "fmin v18.4s, v18.4s, v15.4s\n" - "fmin v19.4s, v19.4s, v15.4s\n" - "str q16, [%[c_ptr0]]\n" - "fmax v20.4s, v20.4s, v14.4s\n" - "fmax v21.4s, v21.4s, v14.4s\n" - "fmax v22.4s, v22.4s, v14.4s\n" - "str q17, [%[c_ptr0], #0x10]\n" - "fmax v23.4s, v23.4s, v14.4s\n" - "fmin v20.4s, v20.4s, v15.4s\n" - "fmin v21.4s, v21.4s, 
v15.4s\n" - "str q18, [%[c_ptr0], #0x20]\n" - "fmin v22.4s, v22.4s, v15.4s\n" - "fmin v23.4s, v23.4s, v15.4s\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "ldr q16, [%[biasptr]]\n" - "ldr q17, [%[biasptr], #0x10]\n" - "ldr q18, [%[biasptr], #0x20]\n" - "ldr q19, [%[biasptr], #0x30]\n" - "mov v20.16b, v16.16b\n" - "ldr q0, [%[a_ptr0]]\n" - "mov v21.16b, v17.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v22.16b, v18.16b\n" - "ldr q2, [a_ptr2]\n" - "mov v23.16b, v19.16b\n" - "ldr q8, [%[b_ptr0]]\n" - "mov v24.16b, v16.16b\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "mov v25.16b, v17.16b\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "mov v26.16b, v18.16b\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "mov v27.16b, v19.16b\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr q11, [%[b_ptr0], #-0x10]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q4, [%[a_ptr0]]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr q5, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr q6, [a_ptr2]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v0.s[1]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla v20.4s, v8.4s, v1.s[1]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v24.4s, v8.4s, 
v2.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v0.s[1]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - "fmla v21.4s, v9.4s, v1.s[1]\n" - "fmla v25.4s, v9.4s, v2.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v0.s[1]\n" - "fmla v22.4s, v10.4s, v1.s[1]\n" - "fmla v26.4s, v10.4s, v2.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v0.s[1]\n" - "fmla v23.4s, v11.4s, v1.s[1]\n" - "fmla v27.4s, v11.4s, v2.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v0.s[3]\n" - "fmla v20.4s, v8.4s, v1.s[3]\n" - "fmla v24.4s, v8.4s, v2.s[3]\n" - "ldr q8, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v9.4s, v0.s[3]\n" - "fmla v21.4s, v9.4s, v1.s[3]\n" - "fmla v25.4s, v9.4s, v2.s[3]\n" - "ldr q9, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v10.4s, v0.s[3]\n" - "fmla v22.4s, v10.4s, v1.s[3]\n" - "fmla v26.4s, v10.4s, v2.s[3]\n" - "ldr q10, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v11.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v23.4s, v11.4s, v1.s[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "fmla v27.4s, v11.4s, v2.s[3]\n" - "ldr q11, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "fmla v24.4s, v8.4s, v6.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "fmla v25.4s, v9.4s, v6.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "fmla v26.4s, v10.4s, v6.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "fmla v27.4s, v11.4s, v6.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v4.s[1]\n" - "fmla v20.4s, v8.4s, v5.s[1]\n" - "fmla v24.4s, v8.4s, v6.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v4.s[1]\n" - "fmla v21.4s, v9.4s, v5.s[1]\n" - "fmla v25.4s, v9.4s, v6.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v4.s[1]\n" - "fmla v22.4s, v10.4s, v5.s[1]\n" - "fmla v26.4s, v10.4s, v6.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v4.s[1]\n" - "fmla v23.4s, v11.4s, v5.s[1]\n" - "fmla v27.4s, v11.4s, v6.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "fmla v24.4s, v8.4s, v6.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "fmla v25.4s, v9.4s, v6.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "fmla v26.4s, v10.4s, v6.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "fmla v27.4s, v11.4s, v6.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v4.s[3]\n" - "fmla v20.4s, v8.4s, v5.s[3]\n" - "fmla v24.4s, v8.4s, v6.s[3]\n" - "ldr q8, [%[b_ptr0], #-0x40]\n" - 
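// A sketch of the pattern above, not library code: every fmla here multiplies
// a whole B vector (v8-v11 form a 16-float panel) by one broadcast lane of an
// A row and accumulates into that row's quarter of the output tile. One lane
// step in NEON intrinsics, with illustrative names:
#include <arm_neon.h>

static inline void mla_panel_lane0(float32x4_t acc[4], const float *b,
                                   float32x4_t a)
{
    // vfmaq_laneq_f32 maps to "fmla Vd.4s, Vn.4s, Vm.s[lane]".
    acc[0] = vfmaq_laneq_f32(acc[0], vld1q_f32(b + 0),  a, 0);
    acc[1] = vfmaq_laneq_f32(acc[1], vld1q_f32(b + 4),  a, 0);
    acc[2] = vfmaq_laneq_f32(acc[2], vld1q_f32(b + 8),  a, 0);
    acc[3] = vfmaq_laneq_f32(acc[3], vld1q_f32(b + 12), a, 0);
}
// The assembly unrolls this over lanes 0..3 and over each live A row,
// interleaving the next B loads between the multiplies to hide load latency.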
"fmla v17.4s, v9.4s, v4.s[3]\n" - "fmla v21.4s, v9.4s, v5.s[3]\n" - "fmla v25.4s, v9.4s, v6.s[3]\n" - "ldr q9, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v10.4s, v4.s[3]\n" - "fmla v22.4s, v10.4s, v5.s[3]\n" - "fmla v26.4s, v10.4s, v6.s[3]\n" - "ldr q10, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v11.4s, v4.s[3]\n" - "fmla v23.4s, v11.4s, v5.s[3]\n" - "fmla v27.4s, v11.4s, v6.s[3]\n" - "b.ne 3b\n" - "2:\n" - "ldr q11, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "cbz %[regs], 4f\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr q4, [%[a_ptr0]]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q5, [a_ptr1]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr q6, [a_ptr2]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v0.s[1]\n" - "fmla v20.4s, v8.4s, v1.s[1]\n" - "fmla v24.4s, v8.4s, v2.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v0.s[1]\n" - "fmla v21.4s, v9.4s, v1.s[1]\n" - "fmla v25.4s, v9.4s, v2.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v0.s[1]\n" - "fmla v22.4s, v10.4s, v1.s[1]\n" - "fmla v26.4s, v10.4s, v2.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v0.s[1]\n" - "fmla v23.4s, v11.4s, v1.s[1]\n" - "fmla v27.4s, v11.4s, v2.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v0.s[3]\n" - "fmla v20.4s, v8.4s, v1.s[3]\n" - "fmla v24.4s, v8.4s, v2.s[3]\n" - "ldr q8, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v9.4s, v0.s[3]\n" - "fmla v21.4s, v9.4s, v1.s[3]\n" - "fmla v25.4s, v9.4s, v2.s[3]\n" - "ldr q9, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v10.4s, v0.s[3]\n" - "fmla v22.4s, v10.4s, v1.s[3]\n" - "fmla v26.4s, v10.4s, v2.s[3]\n" - "ldr q10, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v11.4s, v0.s[3]\n" - "fmla v23.4s, v11.4s, v1.s[3]\n" - "fmla v27.4s, v11.4s, v2.s[3]\n" - "ldr q11, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "fmla v24.4s, v8.4s, v6.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "fmla v25.4s, v9.4s, v6.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "fmla v26.4s, v10.4s, v6.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "fmla v27.4s, v11.4s, v6.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v4.s[1]\n" - "fmla v20.4s, 
v8.4s, v5.s[1]\n" - "fmla v24.4s, v8.4s, v6.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v4.s[1]\n" - "fmla v21.4s, v9.4s, v5.s[1]\n" - "fmla v25.4s, v9.4s, v6.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v4.s[1]\n" - "fmla v22.4s, v10.4s, v5.s[1]\n" - "fmla v26.4s, v10.4s, v6.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v4.s[1]\n" - "fmla v23.4s, v11.4s, v5.s[1]\n" - "fmla v27.4s, v11.4s, v6.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "fmla v24.4s, v8.4s, v6.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "fmla v25.4s, v9.4s, v6.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "fmla v26.4s, v10.4s, v6.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "fmla v27.4s, v11.4s, v6.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v4.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #-0x40\n" - "fmla v20.4s, v8.4s, v5.s[3]\n" - "fmla v24.4s, v8.4s, v6.s[3]\n" - "fmla v17.4s, v9.4s, v4.s[3]\n" - "fmla v21.4s, v9.4s, v5.s[3]\n" - "fmla v25.4s, v9.4s, v6.s[3]\n" - "fmla v18.4s, v10.4s, v4.s[3]\n" - "fmla v22.4s, v10.4s, v5.s[3]\n" - "fmla v26.4s, v10.4s, v6.s[3]\n" - "fmla v19.4s, v11.4s, v4.s[3]\n" - "fmla v23.4s, v11.4s, v5.s[3]\n" - "fmla v27.4s, v11.4s, v6.s[3]\n" - "b 5f\n" - "4:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v0.s[1]\n" - "fmla v20.4s, v8.4s, v1.s[1]\n" - "fmla v24.4s, v8.4s, v2.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v0.s[1]\n" - "fmla v21.4s, v9.4s, v1.s[1]\n" - "fmla v25.4s, v9.4s, v2.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v0.s[1]\n" - "fmla v22.4s, v10.4s, v1.s[1]\n" - "fmla v26.4s, v10.4s, v2.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v0.s[1]\n" - "fmla v23.4s, v11.4s, v1.s[1]\n" - "fmla v27.4s, v11.4s, v2.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v0.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #-0x40\n" - "fmla v20.4s, v8.4s, v1.s[3]\n" - "fmla v24.4s, v8.4s, v2.s[3]\n" - "fmla v17.4s, v9.4s, v0.s[3]\n" - "fmla v21.4s, v9.4s, v1.s[3]\n" - "fmla v25.4s, v9.4s, v2.s[3]\n" - "fmla v18.4s, v10.4s, v0.s[3]\n" - "fmla v22.4s, v10.4s, v1.s[3]\n" 
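// How the depth K is carved up between this kernel's paths, as a scalar
// sketch (not library API): label 3: is the counted main loop eating eight
// floats of every A row per trip, the "regs" flag selects between the 8-deep
// epilogue leg and the 4-deep leg at label 4:, and "blocks" mops up the last
// 0..3 floats one at a time at label 7:. The arithmetic mirrors the
// a64_hybrid_fp32_mla_4x8 prologue later in this hunk and assumes K >= 4.
static void split_depth(int K, long &loops, long &regs, long &blocks)
{
    loops = ((K + 4) / 8) - 1;   // full 8-deep main-loop trips (label 3:)
    K -= loops * 8;
    regs = (K / 4) - 1;          // 1 -> 8-deep epilogue, 0 -> 4-deep leg (4:)
    K -= (regs + 1) * 4;
    blocks = K;                  // 0..3 single-float tail steps (label 7:)
    // loops*8 + (regs+1)*4 + blocks reassembles the original K.
}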
- "fmla v26.4s, v10.4s, v2.s[3]\n" - "fmla v19.4s, v11.4s, v0.s[3]\n" - "fmla v23.4s, v11.4s, v1.s[3]\n" - "fmla v27.4s, v11.4s, v2.s[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr s1, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr s2, [a_ptr2]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "b.ne 7b\n" - "6:\n" - "ld1r {v14.4s}, [%[minptr]]\n" - "ld1r {v15.4s}, [%[maxptr]]\n" - "fmax v16.4s, v16.4s, v14.4s\n" - "fmax v17.4s, v17.4s, v14.4s\n" - "fmax v18.4s, v18.4s, v14.4s\n" - "fmax v19.4s, v19.4s, v14.4s\n" - "fmin v16.4s, v16.4s, v15.4s\n" - "fmin v17.4s, v17.4s, v15.4s\n" - "fmin v18.4s, v18.4s, v15.4s\n" - "fmin v19.4s, v19.4s, v15.4s\n" - "str q16, [%[c_ptr0]]\n" - "fmax v20.4s, v20.4s, v14.4s\n" - "fmax v21.4s, v21.4s, v14.4s\n" - "fmax v22.4s, v22.4s, v14.4s\n" - "str q17, [%[c_ptr0], #0x10]\n" - "fmax v23.4s, v23.4s, v14.4s\n" - "fmin v20.4s, v20.4s, v15.4s\n" - "fmin v21.4s, v21.4s, v15.4s\n" - "str q18, [%[c_ptr0], #0x20]\n" - "fmin v22.4s, v22.4s, v15.4s\n" - "fmin v23.4s, v23.4s, v15.4s\n" - "fmax v24.4s, v24.4s, v14.4s\n" - "str q19, [%[c_ptr0], #0x30]\n" - "fmax v25.4s, v25.4s, v14.4s\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "fmax v26.4s, v26.4s, v14.4s\n" - "str q20, [c_ptr1]\n" - "fmin v24.4s, v24.4s, v15.4s\n" - "fmin v25.4s, v25.4s, v15.4s\n" - "fmax v27.4s, v27.4s, v14.4s\n" - "str q21, [c_ptr1, #0x10]\n" - "fmin v26.4s, v26.4s, v15.4s\n" - "fmin v27.4s, v27.4s, v15.4s\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - "str q24, [c_ptr2]\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "ldr q16, [%[biasptr]]\n" - "ldr q17, [%[biasptr], #0x10]\n" - "ldr q18, [%[biasptr], #0x20]\n" - "ldr q19, [%[biasptr], #0x30]\n" - "mov v20.16b, v16.16b\n" - "ldr q0, [%[a_ptr0]]\n" - "mov v21.16b, 
v17.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v22.16b, v18.16b\n" - "ldr q2, [a_ptr2]\n" - "mov v23.16b, v19.16b\n" - "ldr q3, [a_ptr3]\n" - "mov v24.16b, v16.16b\n" - "ldr q8, [%[b_ptr0]]\n" - "mov v25.16b, v17.16b\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "mov v26.16b, v18.16b\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "mov v27.16b, v19.16b\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "mov v28.16b, v16.16b\n" - "add a_ptr1, a_ptr1, #0x10\n" - "mov v29.16b, v17.16b\n" - "add a_ptr2, a_ptr2, #0x10\n" - "mov v30.16b, v18.16b\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov v31.16b, v19.16b\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q28, [c_ptr3]\n" - "ldr q29, [c_ptr3, #0x10]\n" - "ldr q30, [c_ptr3, #0x20]\n" - "ldr q31, [c_ptr3, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q3, [a_ptr3]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr q11, [%[b_ptr0], #-0x10]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q4, [%[a_ptr0]]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr q5, [a_ptr1]\n" - "fmla v28.4s, v8.4s, v3.s[0]\n" - "ldr q6, [a_ptr2]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr q7, [a_ptr3]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v29.4s, v9.4s, v3.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v30.4s, v10.4s, v3.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "add a_ptr3, a_ptr3, #0x20\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v31.4s, v11.4s, v3.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v0.s[1]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - "fmla v20.4s, v8.4s, v1.s[1]\n" - "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" - "fmla v24.4s, v8.4s, v2.s[1]\n" - "fmla v28.4s, v8.4s, v3.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v0.s[1]\n" - "fmla v21.4s, v9.4s, v1.s[1]\n" - "fmla v25.4s, v9.4s, v2.s[1]\n" - "fmla v29.4s, v9.4s, v3.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v0.s[1]\n" - "fmla v22.4s, v10.4s, v1.s[1]\n" - "fmla v26.4s, v10.4s, v2.s[1]\n" - "fmla v30.4s, v10.4s, v3.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v0.s[1]\n" - "fmla v23.4s, v11.4s, v1.s[1]\n" - "fmla v27.4s, v11.4s, v2.s[1]\n" - "fmla v31.4s, v11.4s, v3.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "fmla v28.4s, v8.4s, v3.s[2]\n" - "ldr q8, [%[b_ptr0], 
#-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "fmla v29.4s, v9.4s, v3.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "fmla v30.4s, v10.4s, v3.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "fmla v31.4s, v11.4s, v3.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v0.s[3]\n" - "fmla v20.4s, v8.4s, v1.s[3]\n" - "fmla v24.4s, v8.4s, v2.s[3]\n" - "fmla v28.4s, v8.4s, v3.s[3]\n" - "ldr q8, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v9.4s, v0.s[3]\n" - "fmla v21.4s, v9.4s, v1.s[3]\n" - "fmla v25.4s, v9.4s, v2.s[3]\n" - "fmla v29.4s, v9.4s, v3.s[3]\n" - "ldr q9, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v10.4s, v0.s[3]\n" - "fmla v22.4s, v10.4s, v1.s[3]\n" - "fmla v26.4s, v10.4s, v2.s[3]\n" - "fmla v30.4s, v10.4s, v3.s[3]\n" - "ldr q10, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v11.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v23.4s, v11.4s, v1.s[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "fmla v27.4s, v11.4s, v2.s[3]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - "fmla v31.4s, v11.4s, v3.s[3]\n" - "ldr q11, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "ldr q3, [a_ptr3, #-0x10]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "fmla v24.4s, v8.4s, v6.s[0]\n" - "fmla v28.4s, v8.4s, v7.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "fmla v25.4s, v9.4s, v6.s[0]\n" - "fmla v29.4s, v9.4s, v7.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "fmla v26.4s, v10.4s, v6.s[0]\n" - "fmla v30.4s, v10.4s, v7.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "fmla v27.4s, v11.4s, v6.s[0]\n" - "fmla v31.4s, v11.4s, v7.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v4.s[1]\n" - "fmla v20.4s, v8.4s, v5.s[1]\n" - "fmla v24.4s, v8.4s, v6.s[1]\n" - "fmla v28.4s, v8.4s, v7.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v4.s[1]\n" - "fmla v21.4s, v9.4s, v5.s[1]\n" - "fmla v25.4s, v9.4s, v6.s[1]\n" - "fmla v29.4s, v9.4s, v7.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v4.s[1]\n" - "fmla v22.4s, v10.4s, v5.s[1]\n" - "fmla v26.4s, v10.4s, v6.s[1]\n" - "fmla v30.4s, v10.4s, v7.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v4.s[1]\n" - "fmla v23.4s, v11.4s, v5.s[1]\n" - "fmla v27.4s, v11.4s, v6.s[1]\n" - "fmla v31.4s, v11.4s, v7.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "fmla v24.4s, v8.4s, v6.s[2]\n" - "fmla v28.4s, v8.4s, v7.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "fmla v25.4s, v9.4s, v6.s[2]\n" - "fmla v29.4s, v9.4s, v7.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "fmla v26.4s, v10.4s, v6.s[2]\n" - "fmla v30.4s, v10.4s, v7.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "fmla v27.4s, v11.4s, v6.s[2]\n" - "fmla v31.4s, v11.4s, v7.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v4.s[3]\n" - "fmla v20.4s, v8.4s, v5.s[3]\n" - "fmla v24.4s, v8.4s, v6.s[3]\n" - "fmla 
v28.4s, v8.4s, v7.s[3]\n" - "ldr q8, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v9.4s, v4.s[3]\n" - "fmla v21.4s, v9.4s, v5.s[3]\n" - "fmla v25.4s, v9.4s, v6.s[3]\n" - "fmla v29.4s, v9.4s, v7.s[3]\n" - "ldr q9, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v10.4s, v4.s[3]\n" - "fmla v22.4s, v10.4s, v5.s[3]\n" - "fmla v26.4s, v10.4s, v6.s[3]\n" - "fmla v30.4s, v10.4s, v7.s[3]\n" - "ldr q10, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v11.4s, v4.s[3]\n" - "fmla v23.4s, v11.4s, v5.s[3]\n" - "fmla v27.4s, v11.4s, v6.s[3]\n" - "fmla v31.4s, v11.4s, v7.s[3]\n" - "b.ne 3b\n" - "2:\n" - "ldr q11, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "prfm PSTL1KEEP, [c_ptr3]\n" - "cbz %[regs], 4f\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr q4, [%[a_ptr0]]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "ldr q5, [a_ptr1]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "ldr q6, [a_ptr2]\n" - "fmla v28.4s, v8.4s, v3.s[0]\n" - "ldr q7, [a_ptr3]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v29.4s, v9.4s, v3.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "fmla v30.4s, v10.4s, v3.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "fmla v31.4s, v11.4s, v3.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v0.s[1]\n" - "fmla v20.4s, v8.4s, v1.s[1]\n" - "fmla v24.4s, v8.4s, v2.s[1]\n" - "fmla v28.4s, v8.4s, v3.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v0.s[1]\n" - "fmla v21.4s, v9.4s, v1.s[1]\n" - "fmla v25.4s, v9.4s, v2.s[1]\n" - "fmla v29.4s, v9.4s, v3.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v0.s[1]\n" - "fmla v22.4s, v10.4s, v1.s[1]\n" - "fmla v26.4s, v10.4s, v2.s[1]\n" - "fmla v30.4s, v10.4s, v3.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v0.s[1]\n" - "fmla v23.4s, v11.4s, v1.s[1]\n" - "fmla v27.4s, v11.4s, v2.s[1]\n" - "fmla v31.4s, v11.4s, v3.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "fmla v28.4s, v8.4s, v3.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "fmla v29.4s, v9.4s, v3.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "fmla v30.4s, v10.4s, v3.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "fmla v31.4s, v11.4s, v3.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v0.s[3]\n" - "fmla v20.4s, v8.4s, v1.s[3]\n" - "fmla v24.4s, v8.4s, v2.s[3]\n" - "fmla v28.4s, v8.4s, v3.s[3]\n" - "ldr q8, [%[b_ptr0], #-0x40]\n" - "fmla v17.4s, v9.4s, v0.s[3]\n" - "fmla v21.4s, v9.4s, v1.s[3]\n" - "fmla v25.4s, v9.4s, v2.s[3]\n" - "fmla v29.4s, v9.4s, v3.s[3]\n" - "ldr q9, [%[b_ptr0], #-0x30]\n" - "fmla v18.4s, v10.4s, v0.s[3]\n" - "fmla v22.4s, v10.4s, v1.s[3]\n" - "fmla v26.4s, v10.4s, v2.s[3]\n" - "fmla v30.4s, v10.4s, v3.s[3]\n" 
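// The shared store epilogue (label 6: below) applies the activation as a
// clamp: ld1r broadcasts the bounds, then every accumulator runs through an
// fmax/fmin pair before its str. A rough intrinsics equivalent of one lane:
#include <arm_neon.h>

// Per the prologue of these kernels, minptr/maxptr hold -inf/+inf when there
// is no activation, 0/+inf for ReLU, and 0/act.param1 for BoundedReLU.
static inline float32x4_t clamp_acc(float32x4_t acc, const float *minptr,
                                    const float *maxptr)
{
    acc = vmaxq_f32(acc, vld1q_dup_f32(minptr));  // ld1r + fmax
    acc = vminq_f32(acc, vld1q_dup_f32(maxptr));  // ld1r + fmin
    return acc;
}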
- "ldr q10, [%[b_ptr0], #-0x20]\n" - "fmla v19.4s, v11.4s, v0.s[3]\n" - "fmla v23.4s, v11.4s, v1.s[3]\n" - "fmla v27.4s, v11.4s, v2.s[3]\n" - "fmla v31.4s, v11.4s, v3.s[3]\n" - "ldr q11, [%[b_ptr0], #-0x10]\n" - "fmla v16.4s, v8.4s, v4.s[0]\n" - "fmla v20.4s, v8.4s, v5.s[0]\n" - "fmla v24.4s, v8.4s, v6.s[0]\n" - "fmla v28.4s, v8.4s, v7.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v4.s[0]\n" - "fmla v21.4s, v9.4s, v5.s[0]\n" - "fmla v25.4s, v9.4s, v6.s[0]\n" - "fmla v29.4s, v9.4s, v7.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v4.s[0]\n" - "fmla v22.4s, v10.4s, v5.s[0]\n" - "fmla v26.4s, v10.4s, v6.s[0]\n" - "fmla v30.4s, v10.4s, v7.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v4.s[0]\n" - "fmla v23.4s, v11.4s, v5.s[0]\n" - "fmla v27.4s, v11.4s, v6.s[0]\n" - "fmla v31.4s, v11.4s, v7.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v4.s[1]\n" - "fmla v20.4s, v8.4s, v5.s[1]\n" - "fmla v24.4s, v8.4s, v6.s[1]\n" - "fmla v28.4s, v8.4s, v7.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v4.s[1]\n" - "fmla v21.4s, v9.4s, v5.s[1]\n" - "fmla v25.4s, v9.4s, v6.s[1]\n" - "fmla v29.4s, v9.4s, v7.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v4.s[1]\n" - "fmla v22.4s, v10.4s, v5.s[1]\n" - "fmla v26.4s, v10.4s, v6.s[1]\n" - "fmla v30.4s, v10.4s, v7.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v4.s[1]\n" - "fmla v23.4s, v11.4s, v5.s[1]\n" - "fmla v27.4s, v11.4s, v6.s[1]\n" - "fmla v31.4s, v11.4s, v7.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v4.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v5.s[2]\n" - "fmla v24.4s, v8.4s, v6.s[2]\n" - "fmla v28.4s, v8.4s, v7.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v4.s[2]\n" - "fmla v21.4s, v9.4s, v5.s[2]\n" - "fmla v25.4s, v9.4s, v6.s[2]\n" - "fmla v29.4s, v9.4s, v7.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v4.s[2]\n" - "fmla v22.4s, v10.4s, v5.s[2]\n" - "fmla v26.4s, v10.4s, v6.s[2]\n" - "fmla v30.4s, v10.4s, v7.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v4.s[2]\n" - "fmla v23.4s, v11.4s, v5.s[2]\n" - "fmla v27.4s, v11.4s, v6.s[2]\n" - "fmla v31.4s, v11.4s, v7.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v4.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #-0x40\n" - "fmla v20.4s, v8.4s, v5.s[3]\n" - "fmla v24.4s, v8.4s, v6.s[3]\n" - "fmla v28.4s, v8.4s, v7.s[3]\n" - "fmla v17.4s, v9.4s, v4.s[3]\n" - "fmla v21.4s, v9.4s, v5.s[3]\n" - "fmla v25.4s, v9.4s, v6.s[3]\n" - "fmla v29.4s, v9.4s, v7.s[3]\n" - "fmla v18.4s, v10.4s, v4.s[3]\n" - "fmla v22.4s, v10.4s, v5.s[3]\n" - "fmla v26.4s, v10.4s, v6.s[3]\n" - "fmla v30.4s, v10.4s, v7.s[3]\n" - "fmla v19.4s, v11.4s, v4.s[3]\n" - "fmla v23.4s, v11.4s, v5.s[3]\n" - "fmla v27.4s, v11.4s, v6.s[3]\n" - "fmla v31.4s, v11.4s, v7.s[3]\n" - "b 5f\n" - "4:\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "fmla v28.4s, v8.4s, v3.s[0]\n" - "ldr q8, [%[b_ptr0]]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "fmla v29.4s, v9.4s, v3.s[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "fmla v30.4s, v10.4s, v3.s[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "fmla v31.4s, v11.4s, 
v3.s[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "fmla v16.4s, v8.4s, v0.s[1]\n" - "fmla v20.4s, v8.4s, v1.s[1]\n" - "fmla v24.4s, v8.4s, v2.s[1]\n" - "fmla v28.4s, v8.4s, v3.s[1]\n" - "ldr q8, [%[b_ptr0], #0x40]\n" - "fmla v17.4s, v9.4s, v0.s[1]\n" - "fmla v21.4s, v9.4s, v1.s[1]\n" - "fmla v25.4s, v9.4s, v2.s[1]\n" - "fmla v29.4s, v9.4s, v3.s[1]\n" - "ldr q9, [%[b_ptr0], #0x50]\n" - "fmla v18.4s, v10.4s, v0.s[1]\n" - "fmla v22.4s, v10.4s, v1.s[1]\n" - "fmla v26.4s, v10.4s, v2.s[1]\n" - "fmla v30.4s, v10.4s, v3.s[1]\n" - "ldr q10, [%[b_ptr0], #0x60]\n" - "fmla v19.4s, v11.4s, v0.s[1]\n" - "fmla v23.4s, v11.4s, v1.s[1]\n" - "fmla v27.4s, v11.4s, v2.s[1]\n" - "fmla v31.4s, v11.4s, v3.s[1]\n" - "ldr q11, [%[b_ptr0], #0x70]\n" - "fmla v16.4s, v8.4s, v0.s[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - "fmla v20.4s, v8.4s, v1.s[2]\n" - "fmla v24.4s, v8.4s, v2.s[2]\n" - "fmla v28.4s, v8.4s, v3.s[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - "fmla v17.4s, v9.4s, v0.s[2]\n" - "fmla v21.4s, v9.4s, v1.s[2]\n" - "fmla v25.4s, v9.4s, v2.s[2]\n" - "fmla v29.4s, v9.4s, v3.s[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - "fmla v18.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v26.4s, v10.4s, v2.s[2]\n" - "fmla v30.4s, v10.4s, v3.s[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - "fmla v19.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v27.4s, v11.4s, v2.s[2]\n" - "fmla v31.4s, v11.4s, v3.s[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - "fmla v16.4s, v8.4s, v0.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #-0x40\n" - "fmla v20.4s, v8.4s, v1.s[3]\n" - "fmla v24.4s, v8.4s, v2.s[3]\n" - "fmla v28.4s, v8.4s, v3.s[3]\n" - "fmla v17.4s, v9.4s, v0.s[3]\n" - "fmla v21.4s, v9.4s, v1.s[3]\n" - "fmla v25.4s, v9.4s, v2.s[3]\n" - "fmla v29.4s, v9.4s, v3.s[3]\n" - "fmla v18.4s, v10.4s, v0.s[3]\n" - "fmla v22.4s, v10.4s, v1.s[3]\n" - "fmla v26.4s, v10.4s, v2.s[3]\n" - "fmla v30.4s, v10.4s, v3.s[3]\n" - "fmla v19.4s, v11.4s, v0.s[3]\n" - "fmla v23.4s, v11.4s, v1.s[3]\n" - "fmla v27.4s, v11.4s, v2.s[3]\n" - "fmla v31.4s, v11.4s, v3.s[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v16.4s, v8.4s, v0.s[0]\n" - "ldr s1, [a_ptr1]\n" - "fmla v17.4s, v9.4s, v0.s[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v18.4s, v10.4s, v0.s[0]\n" - "ldr s2, [a_ptr2]\n" - "fmla v20.4s, v8.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - "fmla v21.4s, v9.4s, v1.s[0]\n" - "ldr s3, [a_ptr3]\n" - "fmla v24.4s, v8.4s, v2.s[0]\n" - "add a_ptr3, a_ptr3, #0x4\n" - "fmla v25.4s, v9.4s, v2.s[0]\n" - "fmla v28.4s, v8.4s, v3.s[0]\n" - "fmla v29.4s, v9.4s, v3.s[0]\n" - "fmla v22.4s, v10.4s, v1.s[0]\n" - "fmla v26.4s, v10.4s, v2.s[0]\n" - "fmla v30.4s, v10.4s, v3.s[0]\n" - "fmla v19.4s, v11.4s, v0.s[0]\n" - "fmla v23.4s, v11.4s, v1.s[0]\n" - "fmla v27.4s, v11.4s, v2.s[0]\n" - "fmla v31.4s, v11.4s, v3.s[0]\n" - "b.ne 7b\n" - "6:\n" - "ld1r {v14.4s}, [%[minptr]]\n" - "ld1r {v15.4s}, [%[maxptr]]\n" - "fmax v16.4s, v16.4s, v14.4s\n" - "fmax v17.4s, v17.4s, v14.4s\n" - "fmax v18.4s, v18.4s, v14.4s\n" - "fmax v19.4s, v19.4s, v14.4s\n" - "fmin v16.4s, v16.4s, v15.4s\n" - "fmin v17.4s, v17.4s, v15.4s\n" - "fmin v18.4s, v18.4s, v15.4s\n" - "fmin v19.4s, v19.4s, v15.4s\n" - "str q16, [%[c_ptr0]]\n" - "fmax v20.4s, v20.4s, v14.4s\n" - "fmax v21.4s, v21.4s, v14.4s\n" - "fmax v22.4s, v22.4s, 
v14.4s\n" - "str q17, [%[c_ptr0], #0x10]\n" - "fmax v23.4s, v23.4s, v14.4s\n" - "fmin v20.4s, v20.4s, v15.4s\n" - "fmin v21.4s, v21.4s, v15.4s\n" - "str q18, [%[c_ptr0], #0x20]\n" - "fmin v22.4s, v22.4s, v15.4s\n" - "fmin v23.4s, v23.4s, v15.4s\n" - "fmax v24.4s, v24.4s, v14.4s\n" - "str q19, [%[c_ptr0], #0x30]\n" - "fmax v25.4s, v25.4s, v14.4s\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "fmax v26.4s, v26.4s, v14.4s\n" - "str q20, [c_ptr1]\n" - "fmin v24.4s, v24.4s, v15.4s\n" - "fmin v25.4s, v25.4s, v15.4s\n" - "fmax v27.4s, v27.4s, v14.4s\n" - "str q21, [c_ptr1, #0x10]\n" - "fmin v26.4s, v26.4s, v15.4s\n" - "fmax v28.4s, v28.4s, v14.4s\n" - "fmax v29.4s, v29.4s, v14.4s\n" - "str q22, [c_ptr1, #0x20]\n" - "fmin v27.4s, v27.4s, v15.4s\n" - "fmax v30.4s, v30.4s, v14.4s\n" - "fmin v28.4s, v28.4s, v15.4s\n" - "str q23, [c_ptr1, #0x30]\n" - "fmin v29.4s, v29.4s, v15.4s\n" - "fmax v31.4s, v31.4s, v14.4s\n" - "fmin v30.4s, v30.4s, v15.4s\n" - "str q24, [c_ptr2]\n" - "fmin v31.4s, v31.4s, v15.4s\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - "str q28, [c_ptr3]\n" - "str q29, [c_ptr3, #0x10]\n" - "str q30, [c_ptr3, #0x20]\n" - "str q31, [c_ptr3, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - } - if (use_result_buffer) { - for(int cy=0; cy - -#include "arm_gemm.hpp" - -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_hybrid_fp32_mla_4x8(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) { - const int K_stride = K; - const long loops_count = ((K + 4) / 8) - 1; - K -= loops_count * 8; - const long regs_count = (K / 4) - 1; - K -= (regs_count + 1) * 4; - const long blocks_count = K / 1; - float nullbias[4]; - if (!accumulate && !bias) { - memset(nullbias, 0, (4 * sizeof(float))); - } - float minval = - static_cast(std::numeric_limits::infinity()); - float maxval = static_cast(std::numeric_limits::infinity()); - const float * const minptr = &minval; - const float * const maxptr = &maxval; - - switch(act.type) - { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - maxval = static_cast(act.param1); - /* fall through */ - case Activation::Type::ReLU: - minval = 0.0f; - break; - } - - int rows_to_compute; - - for (int y=0; y 8) { - if (rows_to_compute % 8) { - rows_to_compute = 8 - 1; - } else { - rows_to_compute = 8; - } - } - - for (int x0=0; x0(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" - ); - break; - case 2: - __asm 
__volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "ldr q24, [%[biasptr]]\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "ldr q0, [%[a_ptr0]]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "mov v25.16b, v24.16b\n" - "ldr q1, [a_ptr1]\n" - "ldr q16, [%[b_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 1f\n" - "2:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "ldr q9, [a_ptr1]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "ldr q16, [%[b_ptr0]]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "ldr q16, [%[b_ptr0], #0x40]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "ldr q17, [%[b_ptr0], #0x50]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "ldr q18, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "b.ne 2b\n" - "1:\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "cbz %[regs], 3f\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q9, [a_ptr1]\n" - "ldr q16, [%[b_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "b 4f\n" - "3:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "4:\n" - "cbz %[blocks], 5f\n" - "6:\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" - "ldr s0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr s1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "b.ne 6b\n" - "5:\n" - "ld1r {v22.4s}, [%[minptr]]\n" - "ld1r {v23.4s}, [%[maxptr]]\n" - "fmax v24.4s, v24.4s, v22.4s\n" - "fmax v25.4s, v25.4s, v22.4s\n" - "fmin v24.4s, v24.4s, v23.4s\n" - "fmin v25.4s, v25.4s, v23.4s\n" - "str q24, [%[c_ptr0]]\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" - "str q25, 
[c_ptr1]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "ldr q24, [%[biasptr]]\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "ldr q0, [%[a_ptr0]]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "mov v25.16b, v24.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v26.16b, v24.16b\n" - "ldr q2, [a_ptr2]\n" - "ldr q16, [%[b_ptr0]]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 1f\n" - "2:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "ldr q9, [a_ptr1]\n" - "ldr q10, [a_ptr2]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "ldr q16, [%[b_ptr0]]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v26.4s, v16.4s, v10.s[0]\n" - "ldr q16, [%[b_ptr0], #0x40]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "fmla v26.4s, v17.4s, v10.s[1]\n" - "ldr q17, [%[b_ptr0], #0x50]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "fmla v26.4s, v18.4s, v10.s[2]\n" - "ldr q18, [%[b_ptr0], #0x60]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "fmla v26.4s, v19.4s, v10.s[3]\n" - "b.ne 2b\n" - "1:\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "cbz %[regs], 3f\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q9, [a_ptr1]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "ldr q10, [a_ptr2]\n" - "ldr q16, [%[b_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "fmla v25.4s, v18.4s, 
v1.s[2]\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "fmla v26.4s, v16.4s, v10.s[0]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "fmla v26.4s, v17.4s, v10.s[1]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "fmla v26.4s, v18.4s, v10.s[2]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "fmla v26.4s, v19.4s, v10.s[3]\n" - "b 4f\n" - "3:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "4:\n" - "cbz %[blocks], 5f\n" - "6:\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" - "ldr s0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr s1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr s2, [a_ptr2]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "b.ne 6b\n" - "5:\n" - "ld1r {v22.4s}, [%[minptr]]\n" - "ld1r {v23.4s}, [%[maxptr]]\n" - "fmax v24.4s, v24.4s, v22.4s\n" - "fmax v25.4s, v25.4s, v22.4s\n" - "fmax v26.4s, v26.4s, v22.4s\n" - "fmin v24.4s, v24.4s, v23.4s\n" - "fmin v25.4s, v25.4s, v23.4s\n" - "fmin v26.4s, v26.4s, v23.4s\n" - "str q24, [%[c_ptr0]]\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" - "str q25, [c_ptr1]\n" - "str q26, [c_ptr2]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "ldr q24, [%[biasptr]]\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "ldr q0, [%[a_ptr0]]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "mov v25.16b, v24.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v26.16b, v24.16b\n" - "ldr q2, [a_ptr2]\n" - "mov v27.16b, v24.16b\n" - "ldr q16, [%[b_ptr0]]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "ldr q3, [a_ptr3]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add a_ptr3, a_ptr3, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 1f\n" - "2:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - 
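// Assumed reconstruction, for orientation only: the outer row blocking of
// a64_hybrid_fp32_mla_4x8 as sketched from the prologue above. The loop
// comparisons (y < M, rows_to_compute > 8) are assumptions; the 8-versus-7
// split is as given in the source.
static void row_blocks(int M)
{
    int rows_to_compute;
    for (int y = 0; y < M; y += rows_to_compute) {
        rows_to_compute = M - y;
        if (rows_to_compute > 8) {
            // Take 7 rows when the remaining count is ragged, a full 8
            // when it divides evenly.
            rows_to_compute = (rows_to_compute % 8) ? 8 - 1 : 8;
        }
        // switch (rows_to_compute) then dispatches to the case bodies in
        // this file; each row keeps a single q accumulator (v24 upward),
        // i.e. a 4-column by rows_to_compute-row output tile.
    }
}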
"fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "ldr q9, [a_ptr1]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "ldr q10, [a_ptr2]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "ldr q11, [a_ptr3]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "ldr q16, [%[b_ptr0]]\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "add a_ptr3, a_ptr3, #0x20\n" - "fmla v26.4s, v16.4s, v10.s[0]\n" - "ldr q3, [a_ptr3, #-0x10]\n" - "fmla v27.4s, v16.4s, v11.s[0]\n" - "ldr q16, [%[b_ptr0], #0x40]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - "fmla v26.4s, v17.4s, v10.s[1]\n" - "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" - "fmla v27.4s, v17.4s, v11.s[1]\n" - "ldr q17, [%[b_ptr0], #0x50]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "fmla v26.4s, v18.4s, v10.s[2]\n" - "fmla v27.4s, v18.4s, v11.s[2]\n" - "ldr q18, [%[b_ptr0], #0x60]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "fmla v26.4s, v19.4s, v10.s[3]\n" - "fmla v27.4s, v19.4s, v11.s[3]\n" - "b.ne 2b\n" - "1:\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "prfm PSTL1KEEP, [c_ptr3]\n" - "cbz %[regs], 3f\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q9, [a_ptr1]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "ldr q10, [a_ptr2]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "ldr q11, [a_ptr3]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "ldr q16, [%[b_ptr0]]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "fmla v26.4s, v16.4s, v10.s[0]\n" - "fmla v27.4s, v16.4s, v11.s[0]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "fmla v26.4s, v17.4s, v10.s[1]\n" - "fmla v27.4s, v17.4s, v11.s[1]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "fmla v26.4s, v18.4s, v10.s[2]\n" - "fmla v27.4s, v18.4s, v11.s[2]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "fmla v25.4s, 
v19.4s, v9.s[3]\n" - "fmla v26.4s, v19.4s, v10.s[3]\n" - "fmla v27.4s, v19.4s, v11.s[3]\n" - "b 4f\n" - "3:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "4:\n" - "cbz %[blocks], 5f\n" - "6:\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" - "ldr s0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr s1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr s2, [a_ptr2]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - "ldr s3, [a_ptr3]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "add a_ptr3, a_ptr3, #0x4\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "b.ne 6b\n" - "5:\n" - "ld1r {v22.4s}, [%[minptr]]\n" - "ld1r {v23.4s}, [%[maxptr]]\n" - "fmax v24.4s, v24.4s, v22.4s\n" - "fmax v25.4s, v25.4s, v22.4s\n" - "fmax v26.4s, v26.4s, v22.4s\n" - "fmax v27.4s, v27.4s, v22.4s\n" - "fmin v24.4s, v24.4s, v23.4s\n" - "fmin v25.4s, v25.4s, v23.4s\n" - "fmin v26.4s, v26.4s, v23.4s\n" - "fmin v27.4s, v27.4s, v23.4s\n" - "str q24, [%[c_ptr0]]\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" - "str q25, [c_ptr1]\n" - "str q26, [c_ptr2]\n" - "str q27, [c_ptr3]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - case 5: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "c_ptr1 .req X4\n" - "c_ptr2 .req X5\n" - "c_ptr3 .req X6\n" - "c_ptr4 .req X7\n" - "ldr q24, [%[biasptr]]\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "ldr q0, [%[a_ptr0]]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "mov v25.16b, v24.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v26.16b, v24.16b\n" - "ldr q2, [a_ptr2]\n" - "mov v27.16b, v24.16b\n" - "ldr q16, [%[b_ptr0]]\n" - "mov v28.16b, v24.16b\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "ldr q3, [a_ptr3]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "ldr q4, [a_ptr4]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add a_ptr3, a_ptr3, #0x10\n" - "add a_ptr4, a_ptr4, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 1f\n" - "2:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "fmla v25.4s, v16.4s, 
v1.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "ldr q9, [a_ptr1]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "ldr q10, [a_ptr2]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "ldr q11, [a_ptr3]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "ldr q12, [a_ptr4]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "ldr q16, [%[b_ptr0]]\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v28.4s, v17.4s, v4.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "add a_ptr3, a_ptr3, #0x20\n" - "fmla v28.4s, v18.4s, v4.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "ldr q3, [a_ptr3, #-0x10]\n" - "fmla v28.4s, v19.4s, v4.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "add a_ptr4, a_ptr4, #0x20\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "ldr q4, [a_ptr4, #-0x10]\n" - "fmla v26.4s, v16.4s, v10.s[0]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v27.4s, v16.4s, v11.s[0]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - "fmla v28.4s, v16.4s, v12.s[0]\n" - "ldr q16, [%[b_ptr0], #0x40]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "fmla v26.4s, v17.4s, v10.s[1]\n" - "fmla v27.4s, v17.4s, v11.s[1]\n" - "fmla v28.4s, v17.4s, v12.s[1]\n" - "ldr q17, [%[b_ptr0], #0x50]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "fmla v26.4s, v18.4s, v10.s[2]\n" - "fmla v27.4s, v18.4s, v11.s[2]\n" - "fmla v28.4s, v18.4s, v12.s[2]\n" - "ldr q18, [%[b_ptr0], #0x60]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "fmla v26.4s, v19.4s, v10.s[3]\n" - "fmla v27.4s, v19.4s, v11.s[3]\n" - "fmla v28.4s, v19.4s, v12.s[3]\n" - "b.ne 2b\n" - "1:\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "prfm PSTL1KEEP, [c_ptr3]\n" - "prfm PSTL1KEEP, [c_ptr4]\n" - "cbz %[regs], 3f\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q9, [a_ptr1]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "ldr q10, [a_ptr2]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "ldr q11, [a_ptr3]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "ldr q12, [a_ptr4]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "ldr q16, [%[b_ptr0]]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "fmla v28.4s, v17.4s, v4.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "add a_ptr4, a_ptr4, #0x10\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "fmla v28.4s, v18.4s, v4.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "fmla 
v28.4s, v19.4s, v4.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "fmla v26.4s, v16.4s, v10.s[0]\n" - "fmla v27.4s, v16.4s, v11.s[0]\n" - "fmla v28.4s, v16.4s, v12.s[0]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "fmla v26.4s, v17.4s, v10.s[1]\n" - "fmla v27.4s, v17.4s, v11.s[1]\n" - "fmla v28.4s, v17.4s, v12.s[1]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "fmla v26.4s, v18.4s, v10.s[2]\n" - "fmla v27.4s, v18.4s, v11.s[2]\n" - "fmla v28.4s, v18.4s, v12.s[2]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "fmla v26.4s, v19.4s, v10.s[3]\n" - "fmla v27.4s, v19.4s, v11.s[3]\n" - "fmla v28.4s, v19.4s, v12.s[3]\n" - "b 4f\n" - "3:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "fmla v28.4s, v17.4s, v4.s[1]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "fmla v28.4s, v18.4s, v4.s[2]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "fmla v28.4s, v19.4s, v4.s[3]\n" - "4:\n" - "cbz %[blocks], 5f\n" - "6:\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" - "ldr s0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr s1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr s2, [a_ptr2]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - "ldr s3, [a_ptr3]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "add a_ptr3, a_ptr3, #0x4\n" - "ldr s4, [a_ptr4]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "add a_ptr4, a_ptr4, #0x4\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "b.ne 6b\n" - "5:\n" - "ld1r {v22.4s}, [%[minptr]]\n" - "ld1r {v23.4s}, [%[maxptr]]\n" - "fmax v24.4s, v24.4s, v22.4s\n" - "fmax v25.4s, v25.4s, v22.4s\n" - "fmax v26.4s, v26.4s, v22.4s\n" - "fmax v27.4s, v27.4s, v22.4s\n" - "fmin v24.4s, v24.4s, v23.4s\n" - "fmin v25.4s, v25.4s, v23.4s\n" - "fmin v26.4s, v26.4s, v23.4s\n" - "fmin v27.4s, v27.4s, v23.4s\n" - "str q24, [%[c_ptr0]]\n" - "fmax v28.4s, v28.4s, v22.4s\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" - "str q25, [c_ptr1]\n" - "fmin v28.4s, v28.4s, v23.4s\n" - "str q26, [c_ptr2]\n" - "str q27, [c_ptr3]\n" - "str q28, [c_ptr4]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory" - ); - break; - case 6: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 
.req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "c_ptr1 .req X5\n" - "c_ptr2 .req X6\n" - "c_ptr3 .req X7\n" - "c_ptr4 .req X8\n" - "c_ptr5 .req X9\n" - "ldr q24, [%[biasptr]]\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "ldr q0, [%[a_ptr0]]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "mov v25.16b, v24.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v26.16b, v24.16b\n" - "ldr q2, [a_ptr2]\n" - "mov v27.16b, v24.16b\n" - "ldr q16, [%[b_ptr0]]\n" - "mov v28.16b, v24.16b\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "mov v29.16b, v24.16b\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "ldr q3, [a_ptr3]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "ldr q4, [a_ptr4]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "ldr q5, [a_ptr5]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add a_ptr3, a_ptr3, #0x10\n" - "add a_ptr4, a_ptr4, #0x10\n" - "add a_ptr5, a_ptr5, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 1f\n" - "2:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "ldr q9, [a_ptr1]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "ldr q10, [a_ptr2]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "ldr q11, [a_ptr3]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "ldr q12, [a_ptr4]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "ldr q13, [a_ptr5]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "ldr q16, [%[b_ptr0]]\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v28.4s, v17.4s, v4.s[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v29.4s, v17.4s, v5.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "add a_ptr3, a_ptr3, #0x20\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "add a_ptr4, a_ptr4, #0x20\n" - "fmla v28.4s, v18.4s, v4.s[2]\n" - "add a_ptr5, a_ptr5, #0x20\n" - "fmla v29.4s, v18.4s, v5.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "ldr q3, [a_ptr3, #-0x10]\n" - "fmla v28.4s, v19.4s, v4.s[3]\n" - "ldr q4, [a_ptr4, #-0x10]\n" - "fmla v29.4s, v19.4s, v5.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "ldr q5, [a_ptr5, #-0x10]\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v26.4s, v16.4s, v10.s[0]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - "fmla v27.4s, v16.4s, v11.s[0]\n" - "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" - "fmla v28.4s, v16.4s, v12.s[0]\n" - "fmla v29.4s, v16.4s, v13.s[0]\n" - "ldr q16, [%[b_ptr0], #0x40]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "fmla v26.4s, v17.4s, v10.s[1]\n" - "fmla v27.4s, v17.4s, v11.s[1]\n" - "fmla v28.4s, v17.4s, v12.s[1]\n" - "fmla v29.4s, v17.4s, v13.s[1]\n" - "ldr q17, [%[b_ptr0], #0x50]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "fmla v26.4s, v18.4s, v10.s[2]\n" - "fmla v27.4s, v18.4s, v11.s[2]\n" - "fmla 
v28.4s, v18.4s, v12.s[2]\n" - "fmla v29.4s, v18.4s, v13.s[2]\n" - "ldr q18, [%[b_ptr0], #0x60]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "fmla v26.4s, v19.4s, v10.s[3]\n" - "fmla v27.4s, v19.4s, v11.s[3]\n" - "fmla v28.4s, v19.4s, v12.s[3]\n" - "fmla v29.4s, v19.4s, v13.s[3]\n" - "b.ne 2b\n" - "1:\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "prfm PSTL1KEEP, [c_ptr3]\n" - "prfm PSTL1KEEP, [c_ptr4]\n" - "prfm PSTL1KEEP, [c_ptr5]\n" - "cbz %[regs], 3f\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q9, [a_ptr1]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "ldr q10, [a_ptr2]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "ldr q11, [a_ptr3]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "ldr q12, [a_ptr4]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "ldr q13, [a_ptr5]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "ldr q16, [%[b_ptr0]]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "fmla v28.4s, v17.4s, v4.s[1]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "fmla v29.4s, v17.4s, v5.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "add a_ptr4, a_ptr4, #0x10\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "add a_ptr5, a_ptr5, #0x10\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "fmla v28.4s, v18.4s, v4.s[2]\n" - "fmla v29.4s, v18.4s, v5.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "fmla v28.4s, v19.4s, v4.s[3]\n" - "fmla v29.4s, v19.4s, v5.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "fmla v26.4s, v16.4s, v10.s[0]\n" - "fmla v27.4s, v16.4s, v11.s[0]\n" - "fmla v28.4s, v16.4s, v12.s[0]\n" - "fmla v29.4s, v16.4s, v13.s[0]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "fmla v26.4s, v17.4s, v10.s[1]\n" - "fmla v27.4s, v17.4s, v11.s[1]\n" - "fmla v28.4s, v17.4s, v12.s[1]\n" - "fmla v29.4s, v17.4s, v13.s[1]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "fmla v26.4s, v18.4s, v10.s[2]\n" - "fmla v27.4s, v18.4s, v11.s[2]\n" - "fmla v28.4s, v18.4s, v12.s[2]\n" - "fmla v29.4s, v18.4s, v13.s[2]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "fmla v26.4s, v19.4s, v10.s[3]\n" - "fmla v27.4s, v19.4s, v11.s[3]\n" - "fmla v28.4s, v19.4s, v12.s[3]\n" - "fmla v29.4s, v19.4s, v13.s[3]\n" - "b 4f\n" - "3:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "fmla v28.4s, v17.4s, v4.s[1]\n" - "fmla v29.4s, v17.4s, v5.s[1]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "fmla v28.4s, v18.4s, v4.s[2]\n" - "fmla v29.4s, v18.4s, v5.s[2]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "fmla 
v26.4s, v19.4s, v2.s[3]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "fmla v28.4s, v19.4s, v4.s[3]\n" - "fmla v29.4s, v19.4s, v5.s[3]\n" - "4:\n" - "cbz %[blocks], 5f\n" - "6:\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" - "ldr s0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr s1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr s2, [a_ptr2]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - "ldr s3, [a_ptr3]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "add a_ptr3, a_ptr3, #0x4\n" - "ldr s4, [a_ptr4]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "add a_ptr4, a_ptr4, #0x4\n" - "ldr s5, [a_ptr5]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "add a_ptr5, a_ptr5, #0x4\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "b.ne 6b\n" - "5:\n" - "ld1r {v22.4s}, [%[minptr]]\n" - "ld1r {v23.4s}, [%[maxptr]]\n" - "fmax v24.4s, v24.4s, v22.4s\n" - "fmax v25.4s, v25.4s, v22.4s\n" - "fmax v26.4s, v26.4s, v22.4s\n" - "fmax v27.4s, v27.4s, v22.4s\n" - "fmin v24.4s, v24.4s, v23.4s\n" - "fmin v25.4s, v25.4s, v23.4s\n" - "fmin v26.4s, v26.4s, v23.4s\n" - "fmin v27.4s, v27.4s, v23.4s\n" - "str q24, [%[c_ptr0]]\n" - "fmax v28.4s, v28.4s, v22.4s\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" - "fmax v29.4s, v29.4s, v22.4s\n" - "str q25, [c_ptr1]\n" - "fmin v28.4s, v28.4s, v23.4s\n" - "fmin v29.4s, v29.4s, v23.4s\n" - "str q26, [c_ptr2]\n" - "str q27, [c_ptr3]\n" - "str q28, [c_ptr4]\n" - "str q29, [c_ptr5]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory" - ); - break; - case 7: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "c_ptr1 .req X6\n" - "c_ptr2 .req X7\n" - "c_ptr3 .req X8\n" - "c_ptr4 .req X9\n" - "c_ptr5 .req X10\n" - "c_ptr6 .req X11\n" - "ldr q24, [%[biasptr]]\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "ldr q0, [%[a_ptr0]]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "mov v25.16b, v24.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v26.16b, v24.16b\n" - "ldr q2, [a_ptr2]\n" - "mov v27.16b, v24.16b\n" - "ldr q16, [%[b_ptr0]]\n" - "mov v28.16b, v24.16b\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "mov v29.16b, v24.16b\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "mov v30.16b, v24.16b\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "ldr q3, [a_ptr3]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "ldr q4, [a_ptr4]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "ldr q5, [a_ptr5]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "ldr q6, [a_ptr6]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, 
#0x10\n" - "add a_ptr3, a_ptr3, #0x10\n" - "add a_ptr4, a_ptr4, #0x10\n" - "add a_ptr5, a_ptr5, #0x10\n" - "add a_ptr6, a_ptr6, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 1f\n" - "2:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "ldr q9, [a_ptr1]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "ldr q10, [a_ptr2]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "ldr q11, [a_ptr3]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "ldr q12, [a_ptr4]\n" - "fmla v30.4s, v16.4s, v6.s[0]\n" - "ldr q13, [a_ptr5]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "ldr q14, [a_ptr6]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "ldr q16, [%[b_ptr0]]\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v28.4s, v17.4s, v4.s[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v29.4s, v17.4s, v5.s[1]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v30.4s, v17.4s, v6.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "add a_ptr3, a_ptr3, #0x20\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "add a_ptr4, a_ptr4, #0x20\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "add a_ptr5, a_ptr5, #0x20\n" - "fmla v28.4s, v18.4s, v4.s[2]\n" - "add a_ptr6, a_ptr6, #0x20\n" - "fmla v29.4s, v18.4s, v5.s[2]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v30.4s, v18.4s, v6.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "ldr q3, [a_ptr3, #-0x10]\n" - "fmla v28.4s, v19.4s, v4.s[3]\n" - "ldr q4, [a_ptr4, #-0x10]\n" - "fmla v29.4s, v19.4s, v5.s[3]\n" - "ldr q5, [a_ptr5, #-0x10]\n" - "fmla v30.4s, v19.4s, v6.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "ldr q6, [a_ptr6, #-0x10]\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - "fmla v26.4s, v16.4s, v10.s[0]\n" - "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" - "fmla v27.4s, v16.4s, v11.s[0]\n" - "fmla v28.4s, v16.4s, v12.s[0]\n" - "fmla v29.4s, v16.4s, v13.s[0]\n" - "fmla v30.4s, v16.4s, v14.s[0]\n" - "ldr q16, [%[b_ptr0], #0x40]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "fmla v26.4s, v17.4s, v10.s[1]\n" - "fmla v27.4s, v17.4s, v11.s[1]\n" - "fmla v28.4s, v17.4s, v12.s[1]\n" - "fmla v29.4s, v17.4s, v13.s[1]\n" - "fmla v30.4s, v17.4s, v14.s[1]\n" - "ldr q17, [%[b_ptr0], #0x50]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "fmla v26.4s, v18.4s, v10.s[2]\n" - "fmla v27.4s, v18.4s, v11.s[2]\n" - "fmla v28.4s, v18.4s, v12.s[2]\n" - "fmla v29.4s, v18.4s, v13.s[2]\n" - "fmla v30.4s, v18.4s, v14.s[2]\n" - "ldr q18, [%[b_ptr0], #0x60]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "fmla v26.4s, v19.4s, v10.s[3]\n" - "fmla v27.4s, v19.4s, v11.s[3]\n" - "fmla v28.4s, v19.4s, v12.s[3]\n" - "fmla v29.4s, v19.4s, v13.s[3]\n" - "fmla v30.4s, v19.4s, v14.s[3]\n" - "b.ne 2b\n" - "1:\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "prfm PSTL1KEEP, [c_ptr3]\n" - "prfm PSTL1KEEP, [c_ptr4]\n" - "prfm PSTL1KEEP, [c_ptr5]\n" - 
"prfm PSTL1KEEP, [c_ptr6]\n" - "cbz %[regs], 3f\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q9, [a_ptr1]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "ldr q10, [a_ptr2]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "ldr q11, [a_ptr3]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "ldr q12, [a_ptr4]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "ldr q13, [a_ptr5]\n" - "fmla v30.4s, v16.4s, v6.s[0]\n" - "ldr q14, [a_ptr6]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "ldr q16, [%[b_ptr0]]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "fmla v28.4s, v17.4s, v4.s[1]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "fmla v29.4s, v17.4s, v5.s[1]\n" - "add a_ptr4, a_ptr4, #0x10\n" - "fmla v30.4s, v17.4s, v6.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "add a_ptr5, a_ptr5, #0x10\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "add a_ptr6, a_ptr6, #0x10\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "fmla v28.4s, v18.4s, v4.s[2]\n" - "fmla v29.4s, v18.4s, v5.s[2]\n" - "fmla v30.4s, v18.4s, v6.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "fmla v28.4s, v19.4s, v4.s[3]\n" - "fmla v29.4s, v19.4s, v5.s[3]\n" - "fmla v30.4s, v19.4s, v6.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "fmla v26.4s, v16.4s, v10.s[0]\n" - "fmla v27.4s, v16.4s, v11.s[0]\n" - "fmla v28.4s, v16.4s, v12.s[0]\n" - "fmla v29.4s, v16.4s, v13.s[0]\n" - "fmla v30.4s, v16.4s, v14.s[0]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "fmla v26.4s, v17.4s, v10.s[1]\n" - "fmla v27.4s, v17.4s, v11.s[1]\n" - "fmla v28.4s, v17.4s, v12.s[1]\n" - "fmla v29.4s, v17.4s, v13.s[1]\n" - "fmla v30.4s, v17.4s, v14.s[1]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "fmla v26.4s, v18.4s, v10.s[2]\n" - "fmla v27.4s, v18.4s, v11.s[2]\n" - "fmla v28.4s, v18.4s, v12.s[2]\n" - "fmla v29.4s, v18.4s, v13.s[2]\n" - "fmla v30.4s, v18.4s, v14.s[2]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "fmla v26.4s, v19.4s, v10.s[3]\n" - "fmla v27.4s, v19.4s, v11.s[3]\n" - "fmla v28.4s, v19.4s, v12.s[3]\n" - "fmla v29.4s, v19.4s, v13.s[3]\n" - "fmla v30.4s, v19.4s, v14.s[3]\n" - "b 4f\n" - "3:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "fmla v30.4s, v16.4s, v6.s[0]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "fmla v28.4s, v17.4s, v4.s[1]\n" - "fmla v29.4s, v17.4s, v5.s[1]\n" - "fmla v30.4s, v17.4s, v6.s[1]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "fmla v28.4s, v18.4s, v4.s[2]\n" - "fmla v29.4s, v18.4s, v5.s[2]\n" - "fmla v30.4s, v18.4s, v6.s[2]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "fmla v28.4s, v19.4s, v4.s[3]\n" - "fmla v29.4s, v19.4s, 
v5.s[3]\n" - "fmla v30.4s, v19.4s, v6.s[3]\n" - "4:\n" - "cbz %[blocks], 5f\n" - "6:\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" - "ldr s0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr s1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr s2, [a_ptr2]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - "ldr s3, [a_ptr3]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "add a_ptr3, a_ptr3, #0x4\n" - "ldr s4, [a_ptr4]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "add a_ptr4, a_ptr4, #0x4\n" - "ldr s5, [a_ptr5]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "add a_ptr5, a_ptr5, #0x4\n" - "ldr s6, [a_ptr6]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "add a_ptr6, a_ptr6, #0x4\n" - "fmla v30.4s, v16.4s, v6.s[0]\n" - "b.ne 6b\n" - "5:\n" - "ld1r {v22.4s}, [%[minptr]]\n" - "ld1r {v23.4s}, [%[maxptr]]\n" - "fmax v24.4s, v24.4s, v22.4s\n" - "fmax v25.4s, v25.4s, v22.4s\n" - "fmax v26.4s, v26.4s, v22.4s\n" - "fmax v27.4s, v27.4s, v22.4s\n" - "fmin v24.4s, v24.4s, v23.4s\n" - "fmin v25.4s, v25.4s, v23.4s\n" - "fmin v26.4s, v26.4s, v23.4s\n" - "fmin v27.4s, v27.4s, v23.4s\n" - "str q24, [%[c_ptr0]]\n" - "fmax v28.4s, v28.4s, v22.4s\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" - "fmax v29.4s, v29.4s, v22.4s\n" - "str q25, [c_ptr1]\n" - "fmax v30.4s, v30.4s, v22.4s\n" - "fmin v28.4s, v28.4s, v23.4s\n" - "fmin v29.4s, v29.4s, v23.4s\n" - "str q26, [c_ptr2]\n" - "fmin v30.4s, v30.4s, v23.4s\n" - "str q27, [c_ptr3]\n" - "str q28, [c_ptr4]\n" - "str q29, [c_ptr5]\n" - "str q30, [c_ptr6]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "cc", "memory" - ); - break; - default: - case 8: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "ldr q24, [%[biasptr]]\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "ldr q0, [%[a_ptr0]]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "mov v25.16b, v24.16b\n" - "ldr q1, [a_ptr1]\n" - "mov v26.16b, v24.16b\n" - "ldr q2, [a_ptr2]\n" - "mov v27.16b, v24.16b\n" - "ldr q16, [%[b_ptr0]]\n" - "mov v28.16b, v24.16b\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "mov v29.16b, v24.16b\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "mov v30.16b, v24.16b\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "mov v31.16b, v24.16b\n" - "ldr q3, [a_ptr3]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "ldr q4, [a_ptr4]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "ldr q5, [a_ptr5]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr3, 
c_ptr2, %[ldc]\n" - "ldr q6, [a_ptr6]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "ldr q7, [a_ptr7]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add a_ptr3, a_ptr3, #0x10\n" - "add a_ptr4, a_ptr4, #0x10\n" - "add a_ptr5, a_ptr5, #0x10\n" - "add a_ptr6, a_ptr6, #0x10\n" - "add a_ptr7, a_ptr7, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "cbz %[loops], 1f\n" - "2:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "ldr q9, [a_ptr1]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "ldr q10, [a_ptr2]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "ldr q11, [a_ptr3]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "ldr q12, [a_ptr4]\n" - "fmla v30.4s, v16.4s, v6.s[0]\n" - "ldr q13, [a_ptr5]\n" - "fmla v31.4s, v16.4s, v7.s[0]\n" - "ldr q14, [a_ptr6]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "ldr q15, [a_ptr7]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "ldr q16, [%[b_ptr0]]\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "fmla v28.4s, v17.4s, v4.s[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla v29.4s, v17.4s, v5.s[1]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla v30.4s, v17.4s, v6.s[1]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla v31.4s, v17.4s, v7.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "add a_ptr3, a_ptr3, #0x20\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "add a_ptr4, a_ptr4, #0x20\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "add a_ptr5, a_ptr5, #0x20\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "add a_ptr6, a_ptr6, #0x20\n" - "fmla v28.4s, v18.4s, v4.s[2]\n" - "add a_ptr7, a_ptr7, #0x20\n" - "fmla v29.4s, v18.4s, v5.s[2]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "fmla v30.4s, v18.4s, v6.s[2]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - "fmla v31.4s, v18.4s, v7.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "ldr q3, [a_ptr3, #-0x10]\n" - "fmla v28.4s, v19.4s, v4.s[3]\n" - "ldr q4, [a_ptr4, #-0x10]\n" - "fmla v29.4s, v19.4s, v5.s[3]\n" - "ldr q5, [a_ptr5, #-0x10]\n" - "fmla v30.4s, v19.4s, v6.s[3]\n" - "ldr q6, [a_ptr6, #-0x10]\n" - "fmla v31.4s, v19.4s, v7.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "ldr q7, [a_ptr7, #-0x10]\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" - "fmla v26.4s, v16.4s, v10.s[0]\n" - "fmla v27.4s, v16.4s, v11.s[0]\n" - "fmla v28.4s, v16.4s, v12.s[0]\n" - "fmla v29.4s, v16.4s, v13.s[0]\n" - "fmla v30.4s, v16.4s, v14.s[0]\n" - "fmla v31.4s, v16.4s, v15.s[0]\n" - "ldr q16, [%[b_ptr0], #0x40]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "fmla v26.4s, v17.4s, v10.s[1]\n" - "fmla v27.4s, v17.4s, v11.s[1]\n" - "fmla v28.4s, v17.4s, v12.s[1]\n" - "fmla v29.4s, v17.4s, v13.s[1]\n" - "fmla v30.4s, v17.4s, v14.s[1]\n" - "fmla v31.4s, v17.4s, v15.s[1]\n" - "ldr q17, [%[b_ptr0], #0x50]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "fmla v26.4s, v18.4s, v10.s[2]\n" - "fmla v27.4s, v18.4s, v11.s[2]\n" - "fmla v28.4s, v18.4s, 
v12.s[2]\n" - "fmla v29.4s, v18.4s, v13.s[2]\n" - "fmla v30.4s, v18.4s, v14.s[2]\n" - "fmla v31.4s, v18.4s, v15.s[2]\n" - "ldr q18, [%[b_ptr0], #0x60]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "fmla v26.4s, v19.4s, v10.s[3]\n" - "fmla v27.4s, v19.4s, v11.s[3]\n" - "fmla v28.4s, v19.4s, v12.s[3]\n" - "fmla v29.4s, v19.4s, v13.s[3]\n" - "fmla v30.4s, v19.4s, v14.s[3]\n" - "fmla v31.4s, v19.4s, v15.s[3]\n" - "b.ne 2b\n" - "1:\n" - "ldr q19, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "prfm PSTL1KEEP, [c_ptr3]\n" - "prfm PSTL1KEEP, [c_ptr4]\n" - "prfm PSTL1KEEP, [c_ptr5]\n" - "prfm PSTL1KEEP, [c_ptr6]\n" - "prfm PSTL1KEEP, [c_ptr7]\n" - "cbz %[regs], 3f\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr q8, [%[a_ptr0]]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "ldr q9, [a_ptr1]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "ldr q10, [a_ptr2]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "ldr q11, [a_ptr3]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "ldr q12, [a_ptr4]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "ldr q13, [a_ptr5]\n" - "fmla v30.4s, v16.4s, v6.s[0]\n" - "ldr q14, [a_ptr6]\n" - "fmla v31.4s, v16.4s, v7.s[0]\n" - "ldr q15, [a_ptr7]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "ldr q16, [%[b_ptr0]]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "fmla v28.4s, v17.4s, v4.s[1]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "fmla v29.4s, v17.4s, v5.s[1]\n" - "add a_ptr4, a_ptr4, #0x10\n" - "fmla v30.4s, v17.4s, v6.s[1]\n" - "add a_ptr5, a_ptr5, #0x10\n" - "fmla v31.4s, v17.4s, v7.s[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "add a_ptr6, a_ptr6, #0x10\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "add a_ptr7, a_ptr7, #0x10\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "fmla v28.4s, v18.4s, v4.s[2]\n" - "fmla v29.4s, v18.4s, v5.s[2]\n" - "fmla v30.4s, v18.4s, v6.s[2]\n" - "fmla v31.4s, v18.4s, v7.s[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "fmla v28.4s, v19.4s, v4.s[3]\n" - "fmla v29.4s, v19.4s, v5.s[3]\n" - "fmla v30.4s, v19.4s, v6.s[3]\n" - "fmla v31.4s, v19.4s, v7.s[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "fmla v24.4s, v16.4s, v8.s[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "fmla v25.4s, v16.4s, v9.s[0]\n" - "fmla v26.4s, v16.4s, v10.s[0]\n" - "fmla v27.4s, v16.4s, v11.s[0]\n" - "fmla v28.4s, v16.4s, v12.s[0]\n" - "fmla v29.4s, v16.4s, v13.s[0]\n" - "fmla v30.4s, v16.4s, v14.s[0]\n" - "fmla v31.4s, v16.4s, v15.s[0]\n" - "fmla v24.4s, v17.4s, v8.s[1]\n" - "fmla v25.4s, v17.4s, v9.s[1]\n" - "fmla v26.4s, v17.4s, v10.s[1]\n" - "fmla v27.4s, v17.4s, v11.s[1]\n" - "fmla v28.4s, v17.4s, v12.s[1]\n" - "fmla v29.4s, v17.4s, v13.s[1]\n" - "fmla v30.4s, v17.4s, v14.s[1]\n" - "fmla v31.4s, v17.4s, v15.s[1]\n" - "fmla v24.4s, v18.4s, v8.s[2]\n" - "fmla v25.4s, v18.4s, v9.s[2]\n" - "fmla v26.4s, v18.4s, v10.s[2]\n" - "fmla v27.4s, v18.4s, v11.s[2]\n" - "fmla v28.4s, v18.4s, v12.s[2]\n" - "fmla v29.4s, v18.4s, v13.s[2]\n" - "fmla v30.4s, v18.4s, v14.s[2]\n" - "fmla v31.4s, v18.4s, v15.s[2]\n" - "fmla v24.4s, v19.4s, v8.s[3]\n" - "fmla v25.4s, v19.4s, v9.s[3]\n" - "fmla v26.4s, v19.4s, v10.s[3]\n" - "fmla v27.4s, v19.4s, v11.s[3]\n" - 
"fmla v28.4s, v19.4s, v12.s[3]\n" - "fmla v29.4s, v19.4s, v13.s[3]\n" - "fmla v30.4s, v19.4s, v14.s[3]\n" - "fmla v31.4s, v19.4s, v15.s[3]\n" - "b 4f\n" - "3:\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "fmla v30.4s, v16.4s, v6.s[0]\n" - "fmla v31.4s, v16.4s, v7.s[0]\n" - "fmla v24.4s, v17.4s, v0.s[1]\n" - "fmla v25.4s, v17.4s, v1.s[1]\n" - "fmla v26.4s, v17.4s, v2.s[1]\n" - "fmla v27.4s, v17.4s, v3.s[1]\n" - "fmla v28.4s, v17.4s, v4.s[1]\n" - "fmla v29.4s, v17.4s, v5.s[1]\n" - "fmla v30.4s, v17.4s, v6.s[1]\n" - "fmla v31.4s, v17.4s, v7.s[1]\n" - "fmla v24.4s, v18.4s, v0.s[2]\n" - "fmla v25.4s, v18.4s, v1.s[2]\n" - "fmla v26.4s, v18.4s, v2.s[2]\n" - "fmla v27.4s, v18.4s, v3.s[2]\n" - "fmla v28.4s, v18.4s, v4.s[2]\n" - "fmla v29.4s, v18.4s, v5.s[2]\n" - "fmla v30.4s, v18.4s, v6.s[2]\n" - "fmla v31.4s, v18.4s, v7.s[2]\n" - "fmla v24.4s, v19.4s, v0.s[3]\n" - "fmla v25.4s, v19.4s, v1.s[3]\n" - "fmla v26.4s, v19.4s, v2.s[3]\n" - "fmla v27.4s, v19.4s, v3.s[3]\n" - "fmla v28.4s, v19.4s, v4.s[3]\n" - "fmla v29.4s, v19.4s, v5.s[3]\n" - "fmla v30.4s, v19.4s, v6.s[3]\n" - "fmla v31.4s, v19.4s, v7.s[3]\n" - "4:\n" - "cbz %[blocks], 5f\n" - "6:\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" - "ldr s0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr s1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x4\n" - "fmla v24.4s, v16.4s, v0.s[0]\n" - "ldr s2, [a_ptr2]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - "ldr s3, [a_ptr3]\n" - "fmla v26.4s, v16.4s, v2.s[0]\n" - "add a_ptr3, a_ptr3, #0x4\n" - "ldr s4, [a_ptr4]\n" - "fmla v27.4s, v16.4s, v3.s[0]\n" - "add a_ptr4, a_ptr4, #0x4\n" - "ldr s5, [a_ptr5]\n" - "fmla v28.4s, v16.4s, v4.s[0]\n" - "add a_ptr5, a_ptr5, #0x4\n" - "ldr s6, [a_ptr6]\n" - "fmla v29.4s, v16.4s, v5.s[0]\n" - "add a_ptr6, a_ptr6, #0x4\n" - "ldr s7, [a_ptr7]\n" - "fmla v30.4s, v16.4s, v6.s[0]\n" - "add a_ptr7, a_ptr7, #0x4\n" - "fmla v31.4s, v16.4s, v7.s[0]\n" - "b.ne 6b\n" - "5:\n" - "ld1r {v22.4s}, [%[minptr]]\n" - "ld1r {v23.4s}, [%[maxptr]]\n" - "fmax v24.4s, v24.4s, v22.4s\n" - "fmax v25.4s, v25.4s, v22.4s\n" - "fmax v26.4s, v26.4s, v22.4s\n" - "fmax v27.4s, v27.4s, v22.4s\n" - "fmin v24.4s, v24.4s, v23.4s\n" - "fmin v25.4s, v25.4s, v23.4s\n" - "fmin v26.4s, v26.4s, v23.4s\n" - "fmin v27.4s, v27.4s, v23.4s\n" - "str q24, [%[c_ptr0]]\n" - "fmax v28.4s, v28.4s, v22.4s\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" - "fmax v29.4s, v29.4s, v22.4s\n" - "str q25, [c_ptr1]\n" - "fmax v30.4s, v30.4s, v22.4s\n" - "fmin v28.4s, v28.4s, v23.4s\n" - "fmax v31.4s, v31.4s, v22.4s\n" - "str q26, [c_ptr2]\n" - "fmin v29.4s, v29.4s, v23.4s\n" - "fmin v30.4s, v30.4s, v23.4s\n" - "fmin v31.4s, v31.4s, v23.4s\n" - "str q27, [c_ptr3]\n" - "str q28, [c_ptr4]\n" - "str q29, [c_ptr5]\n" - "str q30, [c_ptr6]\n" - "str q31, [c_ptr7]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" 
(biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc", "memory" - ); - break; - } - if (use_result_buffer) { - for(int cy=0; cy +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg<float>, \ + size_t, size_t, \ + const float *, \ + IndirectOutputArg<float>, \ + const float *, Activation, bool + namespace arm_gemm { // Actual kernel implementations -void a64_hybrid_fp32_mla_16x4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); -void a64_hybrid_fp32_mla_16x4_a55(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); -void a64_hybrid_fp32_mla_16x4_x1(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); +void a64_hybrid_fp32_mla_6x16( ARGLIST ); -class hybrid_fp32_mla_16x4 +class cls_a64_hybrid_fp32_mla_6x16 { public: typedef float operand_type; typedef float result_type; - typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); + typedef void (*kern_type)( ARGLIST ); /* Kernel blocking parameters */ static constexpr unsigned int out_height() { - return 4; + return 6; } static unsigned int out_width() @@ -65,47 +70,33 @@ class hybrid_fp32_mla_16x4 return true; } - static constexpr bool supports_bias() - { - return true; - } - - static constexpr bool supports_activation() - { - return true; - } - static PerformanceParameters get_performance_parameters(const CPUInfo *ci) { switch (ci->get_cpu_model()) { case CPUModel::A55r1: - return { 2.866 }; + return { 2.287 }; case CPUModel::A53: - return { 1.419 }; + return { 1.43 }; case CPUModel::A73: - return { 2.551 }; + return { 2.56 }; default: - return { 6.25 }; + return { 6.667 }; } } - StdTransformsFixed<operand_type, result_type, 4, 16, 1> transforms = {}; + StdTransformsFixed<operand_type, result_type, 6, 16, 1> transforms = {}; // Default to the generic kernel - kern_type kernel=a64_hybrid_fp32_mla_16x4; + kern_type kernel=a64_hybrid_fp32_mla_6x16; - hybrid_fp32_mla_16x4(const CPUInfo *ci) + cls_a64_hybrid_fp32_mla_6x16(const CPUInfo *) { - if (ci->get_cpu_model() == CPUModel::A55r1) { - kernel = a64_hybrid_fp32_mla_16x4_a55; - } else if (ci->get_cpu_model() == CPUModel::X1) { - kernel = a64_hybrid_fp32_mla_16x4_x1; - } } }; } // namespace arm_gemm +#undef ARGLIST #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp new file mode 100644 index 0000000000..884e8986c8 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp @@ -0,0 +1,3430 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> + +namespace arm_gemm { + +void a64_hybrid_fp32_mla_6x16 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg, + size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg, + const float *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); + float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const float *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast<float>(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 171f\n" + "cmp %x[M], #0x4\n" + "bgt 137f\n" + "beq 103f\n" + "cmp %x[M], #0x2\n" + "bgt 69f\n" + "beq 35f\n" + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[bias]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x13, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "cbz x14, 4f\n" + "ldr q8, [x14, #0x0]\n" + "ldr q9, [x14, #0x10]\n" + "ldr q10, [x14, #0x20]\n" + "ldr 
q11, [x14, #0x30]\n" + "add x14, x14, #0x40\n" + "b 15f\n" + "4:" // Height 1: no bias + "tbz %x[flags], #0, 14f\n" + "cmp x16, #0x10\n" + "bge 13f\n" + "tbz x16, #3, 8f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "tbz x16, #2, 6f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "tbz x16, #1, 5f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "tbz x16, #0, 12f\n" + "ld1 { v11.s }[2], [x13]\n" + "b 12f\n" + "5:" // Height 1: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 12f\n" + "ldr s11, [x13, #0x0]\n" + "b 12f\n" + "6:" // Height 1: Partial accumulate: partial_2_8 + "tbz x16, #1, 7f\n" + "ldr d10, [x13], #0x8\n" + "mov x19, #0x28\n" + "tbz x16, #0, 12f\n" + "ld1 { v10.s }[2], [x13]\n" + "b 12f\n" + "7:" // Height 1: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 12f\n" + "ldr s10, [x13, #0x0]\n" + "b 12f\n" + "8:" // Height 1: Partial accumulate: partial_4_0 + "tbz x16, #2, 10f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "tbz x16, #1, 9f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "tbz x16, #0, 12f\n" + "ld1 { v9.s }[2], [x13]\n" + "b 12f\n" + "9:" // Height 1: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 12f\n" + "ldr s9, [x13, #0x0]\n" + "b 12f\n" + "10:" // Height 1: Partial accumulate: partial_2_0 + "tbz x16, #1, 11f\n" + "ldr d8, [x13], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 12f\n" + "ld1 { v8.s }[2], [x13]\n" + "b 12f\n" + "11:" // Height 1: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "12:" // Height 1: Partial accumulate: Done + "sub x13, x13, x19\n" + "b 15f\n" + "13:" // Height 1: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "b 15f\n" + "14:" // Height 1: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "15:" // Height 1: setup done + "mov x12, #0x0\n" + "16:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 17f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "cbnz x12, 18f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "b 18f\n" + "17:" // Height 1: setup direct input + "mov x10, %x[input_ptr]\n" + "18:" // Height 1: input setup done + "cmp x11, #0x4\n" + "blt 21f\n" + "cmp x11, #0x8\n" + "blt 20f\n" + "19:" // Height 1: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr q6, [x15, #0x40]\n" + "add x10, x10, #0x10\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "ldr q6, [x15, #0x60]\n" + "sub x11, x11, #0x4\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "ldr q7, [x15, #0x70]\n" + "cmp x11, #0x8\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "ldr q7, [x15, 
#0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "bge 19b\n" + "20:" // Height 1: Multiply loop: Single iteration only + "sub x11, x11, #0x4\n" + "ldr q0, [x10, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "ldr q6, [x15, #0x40]\n" + "add x10, x10, #0x10\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "21:" // Height 1: Multiply loop: Main loop skip + "cbz x11, 23f\n" + "22:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "sub x11, x11, #0x1\n" + "add x15, x15, #0x40\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "cbnz x11, 22b\n" + "23:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 16b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "tbz %x[flags], #1, 24f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "24:" // Height 1: No activation + "cmp x16, #0x10\n" + "bge 33f\n" + "tbz x16, #3, 28f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "tbz x16, #2, 26f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "tbz x16, #1, 25f\n" + "str d11, [x13], #0x8\n" + "tbz x16, #0, 32f\n" + "st1 { v11.s }[2], [x13]\n" + "b 32f\n" + "25:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x16, #0, 32f\n" + "str s11, [x13, #0x0]\n" + "b 32f\n" + "26:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x16, #1, 27f\n" + "str d10, [x13], #0x8\n" + "tbz x16, #0, 32f\n" + "st1 { v10.s }[2], [x13]\n" + "b 32f\n" + "27:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x16, #0, 32f\n" + "str s10, [x13, #0x0]\n" + "b 32f\n" + "28:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x16, #2, 30f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "tbz x16, #1, 29f\n" + "str d9, [x13], #0x8\n" + "tbz x16, #0, 32f\n" + "st1 { v9.s }[2], [x13]\n" + "b 32f\n" + "29:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x16, #0, 32f\n" + 
"str s9, [x13, #0x0]\n" + "b 32f\n" + "30:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x16, #1, 31f\n" + "str d8, [x13], #0x8\n" + "tbz x16, #0, 32f\n" + "st1 { v8.s }[2], [x13]\n" + "b 32f\n" + "31:" // Height 1: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "32:" // Height 1: Partial direct writeback: Done + "b 34f\n" + "33:" // Height 1: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + "34:" // Height 1: Writeback done + "subs x16, x16, #0x10\n" + "bgt 3b\n" + "b 206f\n" + "35:" // Height 2 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 36f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19, LSL #2\n" + "b 37f\n" + "36:" // Height 2: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "37:" // Height 2: Column loop + "cbz x14, 38f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "ldr q10, [x14, #0x20]\n" + "mov v13.16b, v9.16b\n" + "ldr q11, [x14, #0x30]\n" + "mov v14.16b, v10.16b\n" + "add x14, x14, #0x40\n" + "mov v15.16b, v11.16b\n" + "b 49f\n" + "38:" // Height 2: no bias + "tbz %x[flags], #0, 48f\n" + "cmp x16, #0x10\n" + "bge 47f\n" + "tbz x16, #3, 42f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "tbz x16, #2, 40f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "tbz x16, #1, 39f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "tbz x16, #0, 46f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "b 46f\n" + "39:" // Height 2: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 46f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "b 46f\n" + "40:" // Height 2: Partial accumulate: partial_2_8 + "tbz x16, #1, 41f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "mov x19, #0x28\n" + "tbz x16, #0, 46f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "b 46f\n" + "41:" // Height 2: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 46f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "b 46f\n" + "42:" // Height 2: Partial accumulate: partial_4_0 + "tbz x16, #2, 44f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "tbz x16, #1, 43f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "tbz x16, #0, 46f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "b 46f\n" + "43:" // Height 2: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 46f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "b 46f\n" + "44:" // Height 2: Partial accumulate: partial_2_0 + "tbz x16, #1, 45f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 46f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "b 46f\n" + "45:" // Height 2: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "46:" // Height 2: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "b 49f\n" + "47:" // Height 2: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" 
+ "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "b 49f\n" + "48:" // Height 2: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "49:" // Height 2: setup done + "mov x12, #0x0\n" + "50:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 51f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x12, 52f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "b 52f\n" + "51:" // Height 2: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #2\n" + "52:" // Height 2: input setup done + "cmp x11, #0x4\n" + "blt 55f\n" + "cmp x11, #0x8\n" + "blt 54f\n" + "53:" // Height 2: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "ldr q6, [x15, #0x20]\n" + "add x10, x10, #0x10\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "sub x11, x11, #0x4\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "ldr q6, [x15, #0x40]\n" + "cmp x11, #0x8\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "bge 53b\n" + "54:" // Height 2: Multiply loop: Single iteration only + "sub x11, x11, #0x4\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "ldr q6, [x15, #0x20]\n" + "add x10, x10, #0x10\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "fmla v14.4s, v6.4s, 
v1.s[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "55:" // Height 2: Multiply loop: Main loop skip + "cbz x11, 57f\n" + "56:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "ldr q6, [x15, #0x20]\n" + "sub x11, x11, #0x1\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x15, x15, #0x40\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "cbnz x11, 56b\n" + "57:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 50b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "tbz %x[flags], #1, 58f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "58:" // Height 2: No activation + "cmp x16, #0x10\n" + "bge 67f\n" + "tbz x16, #3, 62f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "tbz x16, #2, 60f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "tbz x16, #1, 59f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "tbz x16, #0, 66f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "b 66f\n" + "59:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x16, #0, 66f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "b 66f\n" + "60:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x16, #1, 61f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], 
#0x8\n" + "tbz x16, #0, 66f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "b 66f\n" + "61:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x16, #0, 66f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "b 66f\n" + "62:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x16, #2, 64f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "tbz x16, #1, 63f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "tbz x16, #0, 66f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "b 66f\n" + "63:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x16, #0, 66f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "b 66f\n" + "64:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x16, #1, 65f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "tbz x16, #0, 66f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "b 66f\n" + "65:" // Height 2: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "66:" // Height 2: Partial direct writeback: Done + "b 68f\n" + "67:" // Height 2: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "68:" // Height 2: Writeback done + "subs x16, x16, #0x10\n" + "bgt 37b\n" + "b 206f\n" + "69:" // Height 3 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 70f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "b 71f\n" + "70:" // Height 3: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "71:" // Height 3: Column loop + "cbz x14, 72f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "mov v16.16b, v8.16b\n" + "ldr q10, [x14, #0x20]\n" + "ldr q11, [x14, #0x30]\n" + "mov v13.16b, v9.16b\n" + "add x14, x14, #0x40\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "b 83f\n" + "72:" // Height 3: no bias + "tbz %x[flags], #0, 82f\n" + "cmp x16, #0x10\n" + "bge 81f\n" + "tbz x16, #3, 76f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "tbz x16, #2, 74f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "tbz x16, #1, 73f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "tbz x16, #0, 80f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "b 80f\n" + "73:" // Height 3: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 80f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "b 80f\n" + "74:" // Height 3: Partial accumulate: partial_2_8 + "tbz x16, #1, 75f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "mov x19, 
#0x28\n" + "tbz x16, #0, 80f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "b 80f\n" + "75:" // Height 3: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 80f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "b 80f\n" + "76:" // Height 3: Partial accumulate: partial_4_0 + "tbz x16, #2, 78f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "tbz x16, #1, 77f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "tbz x16, #0, 80f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "b 80f\n" + "77:" // Height 3: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 80f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "b 80f\n" + "78:" // Height 3: Partial accumulate: partial_2_0 + "tbz x16, #1, 79f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 80f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "b 80f\n" + "79:" // Height 3: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "80:" // Height 3: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "b 83f\n" + "81:" // Height 3: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "b 83f\n" + "82:" // Height 3: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "83:" // Height 3: setup done + "mov x12, #0x0\n" + "84:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 85f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x12, 86f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "b 86f\n" + "85:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "86:" // Height 3: input setup done + "cmp x11, #0x4\n" + "blt 89f\n" + "cmp x11, #0x8\n" + "blt 88f\n" + "87:" // Height 3: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "add x28, x28, #0x10\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla v13.4s, 
v7.4s, v1.s[0]\n" + "add x26, x26, #0x10\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "sub x11, x11, #0x4\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "cmp x11, #0x8\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "fmla v16.4s, v6.4s, v2.s[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v18.4s, v6.4s, v2.s[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v19.4s, v7.4s, v2.s[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "fmla v16.4s, v6.4s, v2.s[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "fmla v17.4s, v7.4s, v2.s[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "bge 87b\n" + "88:" // Height 3: Multiply loop: Single iteration only + "sub x11, x11, #0x4\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "add x28, x28, #0x10\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "add x26, x26, #0x10\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "fmla v16.4s, v6.4s, v2.s[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v18.4s, v6.4s, v2.s[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v19.4s, v7.4s, v2.s[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "fmla v16.4s, v6.4s, v2.s[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, 
v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "fmla v17.4s, v7.4s, v2.s[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "89:" // Height 3: Multiply loop: Main loop skip + "cbz x11, 91f\n" + "90:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "sub x11, x11, #0x1\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x15, x15, #0x40\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "cbnz x11, 90b\n" + "91:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 84b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "tbz %x[flags], #1, 92f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "92:" // Height 3: No activation + "cmp x16, #0x10\n" + "bge 101f\n" + "tbz x16, #3, 96f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "tbz x16, #2, 94f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "tbz x16, #1, 93f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "tbz x16, #0, 100f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "b 100f\n" + "93:" // Height 3: Partial direct 
writeback: partial_1_12 + "tbz x16, #0, 100f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "b 100f\n" + "94:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x16, #1, 95f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "tbz x16, #0, 100f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "b 100f\n" + "95:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x16, #0, 100f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "b 100f\n" + "96:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x16, #2, 98f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "tbz x16, #1, 97f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "tbz x16, #0, 100f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "b 100f\n" + "97:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x16, #0, 100f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "b 100f\n" + "98:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x16, #1, 99f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "tbz x16, #0, 100f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "b 100f\n" + "99:" // Height 3: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "100:" // Height 3: Partial direct writeback: Done + "b 102f\n" + "101:" // Height 3: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "102:" // Height 3: Writeback done + "subs x16, x16, #0x10\n" + "bgt 71b\n" + "b 206f\n" + "103:" // Height 4 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 104f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "b 105f\n" + "104:" // Height 4: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "105:" // Height 4: Column loop + "cbz x14, 106f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "mov v16.16b, v8.16b\n" + "ldr q10, [x14, #0x20]\n" + "mov v20.16b, v8.16b\n" + "ldr q11, [x14, #0x30]\n" + "add x14, x14, #0x40\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" + "b 117f\n" + "106:" // Height 4: no bias + "tbz %x[flags], #0, 116f\n" + "cmp x16, #0x10\n" + "bge 115f\n" + "tbz x16, #3, 110f\n" + "ld1 { v8.4s }, 
[x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "ld1 { v21.4s }, [x25], #0x10\n" + "tbz x16, #2, 108f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "ld1 { v22.4s }, [x25], #0x10\n" + "tbz x16, #1, 107f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "tbz x16, #0, 114f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "ld1 { v23.s }[2], [x25]\n" + "b 114f\n" + "107:" // Height 4: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 114f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "ldr s23, [x25, #0x0]\n" + "b 114f\n" + "108:" // Height 4: Partial accumulate: partial_2_8 + "tbz x16, #1, 109f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "mov x19, #0x28\n" + "tbz x16, #0, 114f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x25]\n" + "b 114f\n" + "109:" // Height 4: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 114f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "b 114f\n" + "110:" // Height 4: Partial accumulate: partial_4_0 + "tbz x16, #2, 112f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "tbz x16, #1, 111f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "tbz x16, #0, 114f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "ld1 { v21.s }[2], [x25]\n" + "b 114f\n" + "111:" // Height 4: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 114f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "b 114f\n" + "112:" // Height 4: Partial accumulate: partial_2_0 + "tbz x16, #1, 113f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 114f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "ld1 { v20.s }[2], [x25]\n" + "b 114f\n" + "113:" // Height 4: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "ldr s20, [x25, #0x0]\n" + "114:" // Height 4: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "b 117f\n" + "115:" // Height 4: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "b 117f\n" + "116:" // Height 4: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, 
#0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "117:" // Height 4: setup done + "mov x12, #0x0\n" + "118:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 119f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x12, 120f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "b 120f\n" + "119:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "120:" // Height 4: input setup done + "cmp x11, #0x4\n" + "blt 123f\n" + "cmp x11, #0x8\n" + "blt 122f\n" + "121:" // Height 4: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "add x28, x28, #0x10\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "add x24, x24, #0x10\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sub x11, x11, #0x4\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "cmp x11, #0x8\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "fmla v16.4s, v6.4s, v2.s[1]\n" + "fmla v20.4s, v6.4s, v3.s[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + "fmla v21.4s, v7.4s, v3.s[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v18.4s, v6.4s, v2.s[1]\n" + "fmla v22.4s, v6.4s, v3.s[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v19.4s, v7.4s, v2.s[1]\n" + "fmla v23.4s, v7.4s, v3.s[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "fmla v16.4s, v6.4s, v2.s[2]\n" + "fmla v20.4s, v6.4s, v3.s[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "fmla v17.4s, v7.4s, v2.s[2]\n" + "fmla v21.4s, v7.4s, v3.s[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "fmla v22.4s, v6.4s, v3.s[2]\n" + "ldr q6, 
[x15, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[2]\n" + "fmla v23.4s, v7.4s, v3.s[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[3]\n" + "fmla v20.4s, v6.4s, v3.s[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "fmla v21.4s, v7.4s, v3.s[3]\n" + "ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla v22.4s, v6.4s, v3.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v23.4s, v7.4s, v3.s[3]\n" + "bge 121b\n" + "122:" // Height 4: Multiply loop: Single iteration only + "sub x11, x11, #0x4\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "add x28, x28, #0x10\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "add x24, x24, #0x10\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "fmla v16.4s, v6.4s, v2.s[1]\n" + "fmla v20.4s, v6.4s, v3.s[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + "fmla v21.4s, v7.4s, v3.s[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v18.4s, v6.4s, v2.s[1]\n" + "fmla v22.4s, v6.4s, v3.s[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v19.4s, v7.4s, v2.s[1]\n" + "fmla v23.4s, v7.4s, v3.s[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "fmla v16.4s, v6.4s, v2.s[2]\n" + "fmla v20.4s, v6.4s, v3.s[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "fmla v17.4s, v7.4s, v2.s[2]\n" + "fmla v21.4s, v7.4s, v3.s[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "fmla v22.4s, v6.4s, v3.s[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[2]\n" + "fmla v23.4s, v7.4s, v3.s[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[3]\n" + "fmla v20.4s, v6.4s, v3.s[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "fmla v21.4s, v7.4s, v3.s[3]\n" + 
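// Last of the 16 B-panel vector loads (offsets 0x0..0xf0) for this unrolled K=4 step; x15 then advances by 0x100 bytes to the next panel block.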
"ldr q7, [x15, #0xf0]\n" + "add x15, x15, #0x100\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla v22.4s, v6.4s, v3.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v23.4s, v7.4s, v3.s[3]\n" + "123:" // Height 4: Multiply loop: Main loop skip + "cbz x11, 125f\n" + "124:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "sub x11, x11, #0x1\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x15, x15, #0x40\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "cbnz x11, 124b\n" + "125:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 118b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbz %x[flags], #1, 126f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "126:" // Height 4: No activation + "cmp x16, #0x10\n" + "bge 135f\n" + "tbz x16, #3, 130f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "tbz x16, #2, 128f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "st1 { v22.4s }, [x25], #0x10\n" + "tbz x16, #1, 127f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "tbz x16, #0, 134f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { 
v19.s }[2], [x27]\n" + "st1 { v23.s }[2], [x25]\n" + "b 134f\n" + "127:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x16, #0, 134f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "str s23, [x25, #0x0]\n" + "b 134f\n" + "128:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x16, #1, 129f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "tbz x16, #0, 134f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "st1 { v22.s }[2], [x25]\n" + "b 134f\n" + "129:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x16, #0, 134f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "str s22, [x25, #0x0]\n" + "b 134f\n" + "130:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x16, #2, 132f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "tbz x16, #1, 131f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "tbz x16, #0, 134f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "st1 { v21.s }[2], [x25]\n" + "b 134f\n" + "131:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x16, #0, 134f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "str s21, [x25, #0x0]\n" + "b 134f\n" + "132:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x16, #1, 133f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "tbz x16, #0, 134f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "st1 { v20.s }[2], [x25]\n" + "b 134f\n" + "133:" // Height 4: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "str s20, [x25, #0x0]\n" + "134:" // Height 4: Partial direct writeback: Done + "b 136f\n" + "135:" // Height 4: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "136:" // Height 4: Writeback done + "subs x16, x16, #0x10\n" + "bgt 105b\n" + "b 206f\n" + "137:" // Height 5 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 138f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 139f\n" + "138:" // Height 5: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, 
LSL #2\n" + "139:" // Height 5: Column loop + "cbz x14, 140f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "mov v16.16b, v8.16b\n" + "ldr q10, [x14, #0x20]\n" + "mov v20.16b, v8.16b\n" + "ldr q11, [x14, #0x30]\n" + "mov v24.16b, v8.16b\n" + "add x14, x14, #0x40\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" + "mov v25.16b, v9.16b\n" + "mov v26.16b, v10.16b\n" + "mov v27.16b, v11.16b\n" + "b 151f\n" + "140:" // Height 5: no bias + "tbz %x[flags], #0, 150f\n" + "cmp x16, #0x10\n" + "bge 149f\n" + "tbz x16, #3, 144f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "ld1 { v21.4s }, [x25], #0x10\n" + "ld1 { v25.4s }, [x23], #0x10\n" + "tbz x16, #2, 142f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "ld1 { v22.4s }, [x25], #0x10\n" + "ld1 { v26.4s }, [x23], #0x10\n" + "tbz x16, #1, 141f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "ldr d27, [x23], #0x8\n" + "tbz x16, #0, 148f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "ld1 { v23.s }[2], [x25]\n" + "ld1 { v27.s }[2], [x23]\n" + "b 148f\n" + "141:" // Height 5: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 148f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "ldr s23, [x25, #0x0]\n" + "ldr s27, [x23, #0x0]\n" + "b 148f\n" + "142:" // Height 5: Partial accumulate: partial_2_8 + "tbz x16, #1, 143f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "ldr d26, [x23], #0x8\n" + "mov x19, #0x28\n" + "tbz x16, #0, 148f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x25]\n" + "ld1 { v26.s }[2], [x23]\n" + "b 148f\n" + "143:" // Height 5: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 148f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "ldr s26, [x23, #0x0]\n" + "b 148f\n" + "144:" // Height 5: Partial accumulate: partial_4_0 + "tbz x16, #2, 146f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "tbz x16, #1, 145f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "ldr d25, [x23], #0x8\n" + "tbz x16, #0, 148f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "ld1 { v21.s }[2], [x25]\n" + "ld1 { v25.s }[2], [x23]\n" + "b 148f\n" + "145:" // Height 5: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 148f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "ldr s25, [x23, #0x0]\n" + "b 148f\n" + "146:" // Height 5: Partial accumulate: partial_2_0 + "tbz x16, #1, 147f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, 
[x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "ldr d24, [x23], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 148f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "ld1 { v20.s }[2], [x25]\n" + "ld1 { v24.s }[2], [x23]\n" + "b 148f\n" + "147:" // Height 5: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "ldr s20, [x25, #0x0]\n" + "ldr s24, [x23, #0x0]\n" + "148:" // Height 5: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "sub x23, x23, x19\n" + "b 151f\n" + "149:" // Height 5: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "ldr q24, [x23, #0x0]\n" + "ldr q25, [x23, #0x10]\n" + "ldr q26, [x23, #0x20]\n" + "ldr q27, [x23, #0x30]\n" + "b 151f\n" + "150:" // Height 5: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "151:" // Height 5: setup done + "mov x12, #0x0\n" + "152:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 153f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x12, 154f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "b 154f\n" + "153:" // Height 5: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "add x22, x24, x19, LSL #2\n" + "154:" // Height 5: input setup done + "cmp x11, #0x4\n" + "blt 157f\n" + "cmp x11, #0x8\n" + "blt 156f\n" + "155:" // Height 5: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "add x28, x28, #0x10\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v24.4s, v6.4s, v4.s[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "add x24, x24, #0x10\n" + "prfm 
pldl1keep, [x24, #0x80]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "add x22, x22, #0x10\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sub x11, x11, #0x4\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "cmp x11, #0x8\n" + "fmla v25.4s, v7.4s, v4.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "fmla v26.4s, v6.4s, v4.s[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "fmla v27.4s, v7.4s, v4.s[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "fmla v16.4s, v6.4s, v2.s[1]\n" + "fmla v20.4s, v6.4s, v3.s[1]\n" + "fmla v24.4s, v6.4s, v4.s[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + "fmla v21.4s, v7.4s, v3.s[1]\n" + "fmla v25.4s, v7.4s, v4.s[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v18.4s, v6.4s, v2.s[1]\n" + "fmla v22.4s, v6.4s, v3.s[1]\n" + "fmla v26.4s, v6.4s, v4.s[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v19.4s, v7.4s, v2.s[1]\n" + "fmla v23.4s, v7.4s, v3.s[1]\n" + "fmla v27.4s, v7.4s, v4.s[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "fmla v16.4s, v6.4s, v2.s[2]\n" + "fmla v20.4s, v6.4s, v3.s[2]\n" + "fmla v24.4s, v6.4s, v4.s[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "fmla v17.4s, v7.4s, v2.s[2]\n" + "fmla v21.4s, v7.4s, v3.s[2]\n" + "fmla v25.4s, v7.4s, v4.s[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "fmla v22.4s, v6.4s, v3.s[2]\n" + "fmla v26.4s, v6.4s, v4.s[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[2]\n" + "fmla v23.4s, v7.4s, v3.s[2]\n" + "fmla v27.4s, v7.4s, v4.s[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[3]\n" + "fmla v20.4s, v6.4s, v3.s[3]\n" + "fmla v24.4s, v6.4s, v4.s[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "fmla v21.4s, v7.4s, v3.s[3]\n" + "fmla v25.4s, v7.4s, v4.s[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "add x15, x15, #0x100\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla v22.4s, v6.4s, v3.s[3]\n" + "fmla v26.4s, v6.4s, v4.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v23.4s, v7.4s, v3.s[3]\n" + "fmla v27.4s, v7.4s, v4.s[3]\n" + "bge 155b\n" + "156:" // Height 5: Multiply loop: Single iteration only + "sub x11, x11, #0x4\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "add x28, x28, #0x10\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "prfm pldl1keep, [x28, 
#0x80]\n" + "add x26, x26, #0x10\n" + "fmla v24.4s, v6.4s, v4.s[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "add x22, x22, #0x10\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "fmla v25.4s, v7.4s, v4.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "fmla v26.4s, v6.4s, v4.s[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "fmla v27.4s, v7.4s, v4.s[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "fmla v16.4s, v6.4s, v2.s[1]\n" + "fmla v20.4s, v6.4s, v3.s[1]\n" + "fmla v24.4s, v6.4s, v4.s[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + "fmla v21.4s, v7.4s, v3.s[1]\n" + "fmla v25.4s, v7.4s, v4.s[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v18.4s, v6.4s, v2.s[1]\n" + "fmla v22.4s, v6.4s, v3.s[1]\n" + "fmla v26.4s, v6.4s, v4.s[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v19.4s, v7.4s, v2.s[1]\n" + "fmla v23.4s, v7.4s, v3.s[1]\n" + "fmla v27.4s, v7.4s, v4.s[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "fmla v16.4s, v6.4s, v2.s[2]\n" + "fmla v20.4s, v6.4s, v3.s[2]\n" + "fmla v24.4s, v6.4s, v4.s[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "fmla v17.4s, v7.4s, v2.s[2]\n" + "fmla v21.4s, v7.4s, v3.s[2]\n" + "fmla v25.4s, v7.4s, v4.s[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "fmla v22.4s, v6.4s, v3.s[2]\n" + "fmla v26.4s, v6.4s, v4.s[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[2]\n" + "fmla v23.4s, v7.4s, v3.s[2]\n" + "fmla v27.4s, v7.4s, v4.s[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[3]\n" + "fmla v20.4s, v6.4s, v3.s[3]\n" + "fmla v24.4s, v6.4s, v4.s[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "fmla v21.4s, v7.4s, v3.s[3]\n" + "fmla v25.4s, v7.4s, v4.s[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "add x15, x15, #0x100\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla v22.4s, v6.4s, v3.s[3]\n" + "fmla v26.4s, v6.4s, v4.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v23.4s, v7.4s, v3.s[3]\n" + "fmla v27.4s, v7.4s, v4.s[3]\n" + "157:" // Height 5: Multiply loop: Main loop skip + "cbz x11, 159f\n" + "158:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + 
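// Odd-K tail loop: one scalar A element per row per iteration, rank-1 update into all accumulators; x11 counts the remaining K values.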
"sub x11, x11, #0x1\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "fmla v24.4s, v6.4s, v4.s[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "fmla v25.4s, v7.4s, v4.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x15, x15, #0x40\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "fmla v26.4s, v6.4s, v4.s[0]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "fmla v27.4s, v7.4s, v4.s[0]\n" + "cbnz x11, 158b\n" + "159:" // Height 5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 152b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 160f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmin v24.4s, v24.4s, v0.4s\n" + "fmin v25.4s, v25.4s, v0.4s\n" + "fmin v26.4s, v26.4s, v0.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmin v27.4s, v27.4s, v0.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "160:" // Height 5: No activation + "cmp x16, #0x10\n" + "bge 169f\n" + "tbz x16, #3, 164f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v25.4s }, [x23], #0x10\n" + "tbz x16, #2, 162f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "st1 { v22.4s }, [x25], #0x10\n" + "st1 { v26.4s }, [x23], #0x10\n" + "tbz x16, #1, 161f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "str d27, [x23], #0x8\n" + "tbz x16, #0, 168f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "st1 { v23.s }[2], [x25]\n" + "st1 { v27.s }[2], [x23]\n" + "b 168f\n" + "161:" // 
Height 5: Partial direct writeback: partial_1_12 + "tbz x16, #0, 168f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "str s23, [x25, #0x0]\n" + "str s27, [x23, #0x0]\n" + "b 168f\n" + "162:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x16, #1, 163f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "str d26, [x23], #0x8\n" + "tbz x16, #0, 168f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "st1 { v22.s }[2], [x25]\n" + "st1 { v26.s }[2], [x23]\n" + "b 168f\n" + "163:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x16, #0, 168f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "str s22, [x25, #0x0]\n" + "str s26, [x23, #0x0]\n" + "b 168f\n" + "164:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x16, #2, 166f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "tbz x16, #1, 165f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "str d25, [x23], #0x8\n" + "tbz x16, #0, 168f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "st1 { v21.s }[2], [x25]\n" + "st1 { v25.s }[2], [x23]\n" + "b 168f\n" + "165:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x16, #0, 168f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "str s21, [x25, #0x0]\n" + "str s25, [x23, #0x0]\n" + "b 168f\n" + "166:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x16, #1, 167f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "tbz x16, #0, 168f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "st1 { v20.s }[2], [x25]\n" + "st1 { v24.s }[2], [x23]\n" + "b 168f\n" + "167:" // Height 5: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "str s20, [x25, #0x0]\n" + "str s24, [x23, #0x0]\n" + "168:" // Height 5: Partial direct writeback: Done + "b 170f\n" + "169:" // Height 5: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "str q24, [x23, #0x0]\n" + "str q25, [x23, #0x10]\n" + "str q26, [x23, #0x20]\n" + "str q27, [x23, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "add x23, x23, #0x40\n" + "170:" // Height 5: Writeback done + "subs x16, x16, #0x10\n" + "bgt 139b\n" + "b 206f\n" + "171:" // Height 6 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 172f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + 
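// Height 6 is the widest specialization of this kernel (all of v8-v31 serve as accumulators), so only this path also advances output_ptr past the six row pointers for the next row block.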
"ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "ldr x21, [%x[output_ptr], #0x28]\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 173f\n" + "172:" // Height 6: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "add x21, x23, x19, LSL #2\n" + "add %x[output_ptr], x21, x19, LSL #2\n" + "173:" // Height 6: Column loop + "cbz x14, 174f\n" + "ldr q8, [x14, #0x0]\n" + "mov v12.16b, v8.16b\n" + "ldr q9, [x14, #0x10]\n" + "mov v16.16b, v8.16b\n" + "ldr q10, [x14, #0x20]\n" + "mov v20.16b, v8.16b\n" + "ldr q11, [x14, #0x30]\n" + "mov v24.16b, v8.16b\n" + "add x14, x14, #0x40\n" + "mov v28.16b, v8.16b\n" + "mov v13.16b, v9.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" + "mov v25.16b, v9.16b\n" + "mov v26.16b, v10.16b\n" + "mov v27.16b, v11.16b\n" + "mov v29.16b, v9.16b\n" + "mov v30.16b, v10.16b\n" + "mov v31.16b, v11.16b\n" + "b 185f\n" + "174:" // Height 6: no bias + "tbz %x[flags], #0, 184f\n" + "cmp x16, #0x10\n" + "bge 183f\n" + "tbz x16, #3, 178f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "ld1 { v28.4s }, [x21], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "ld1 { v21.4s }, [x25], #0x10\n" + "ld1 { v25.4s }, [x23], #0x10\n" + "ld1 { v29.4s }, [x21], #0x10\n" + "tbz x16, #2, 176f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "ld1 { v22.4s }, [x25], #0x10\n" + "ld1 { v26.4s }, [x23], #0x10\n" + "ld1 { v30.4s }, [x21], #0x10\n" + "tbz x16, #1, 175f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "ldr d27, [x23], #0x8\n" + "ldr d31, [x21], #0x8\n" + "tbz x16, #0, 182f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "ld1 { v23.s }[2], [x25]\n" + "ld1 { v27.s }[2], [x23]\n" + "ld1 { v31.s }[2], [x21]\n" + "b 182f\n" + "175:" // Height 6: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x16, #0, 182f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "ldr s23, [x25, #0x0]\n" + "ldr s27, [x23, #0x0]\n" + "ldr s31, [x21, #0x0]\n" + "b 182f\n" + "176:" // Height 6: Partial accumulate: partial_2_8 + "tbz x16, #1, 177f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "ldr d26, [x23], #0x8\n" + "ldr d30, [x21], #0x8\n" + "mov x19, #0x28\n" + "tbz x16, #0, 182f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x25]\n" + "ld1 { v26.s }[2], [x23]\n" + "ld1 { v30.s }[2], [x21]\n" + "b 182f\n" + "177:" // Height 6: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x16, #0, 182f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "ldr s26, [x23, #0x0]\n" + "ldr s30, [x21, #0x0]\n" + "b 182f\n" + "178:" // Height 6: Partial 
accumulate: partial_4_0 + "tbz x16, #2, 180f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "ld1 { v28.4s }, [x21], #0x10\n" + "tbz x16, #1, 179f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "ldr d25, [x23], #0x8\n" + "ldr d29, [x21], #0x8\n" + "tbz x16, #0, 182f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "ld1 { v21.s }[2], [x25]\n" + "ld1 { v25.s }[2], [x23]\n" + "ld1 { v29.s }[2], [x21]\n" + "b 182f\n" + "179:" // Height 6: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x16, #0, 182f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "ldr s25, [x23, #0x0]\n" + "ldr s29, [x21, #0x0]\n" + "b 182f\n" + "180:" // Height 6: Partial accumulate: partial_2_0 + "tbz x16, #1, 181f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d28, [x21], #0x8\n" + "mov x19, #0x8\n" + "tbz x16, #0, 182f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "ld1 { v20.s }[2], [x25]\n" + "ld1 { v24.s }[2], [x23]\n" + "ld1 { v28.s }[2], [x21]\n" + "b 182f\n" + "181:" // Height 6: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "ldr s20, [x25, #0x0]\n" + "ldr s24, [x23, #0x0]\n" + "ldr s28, [x21, #0x0]\n" + "182:" // Height 6: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "sub x23, x23, x19\n" + "sub x21, x21, x19\n" + "b 185f\n" + "183:" // Height 6: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "ldr q24, [x23, #0x0]\n" + "ldr q25, [x23, #0x10]\n" + "ldr q26, [x23, #0x20]\n" + "ldr q27, [x23, #0x30]\n" + "ldr q28, [x21, #0x0]\n" + "ldr q29, [x21, #0x10]\n" + "ldr q30, [x21, #0x20]\n" + "ldr q31, [x21, #0x30]\n" + "b 185f\n" + "184:" // Height 6: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "185:" // Height 6: setup done + "mov x12, #0x0\n" + "186:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 187f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr 
x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x12, 188f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x20, x20, x19, LSL #2\n" + "b 188f\n" + "187:" // Height 6: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "add x22, x24, x19, LSL #2\n" + "add x20, x22, x19, LSL #2\n" + "188:" // Height 6: input setup done + "cmp x11, #0x4\n" + "blt 191f\n" + "cmp x11, #0x8\n" + "blt 190f\n" + "189:" // Height 6: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "add x28, x28, #0x10\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v24.4s, v6.4s, v4.s[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla v28.4s, v6.4s, v5.s[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "add x22, x22, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "add x20, x20, #0x10\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "sub x11, x11, #0x4\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "cmp x11, #0x8\n" + "fmla v25.4s, v7.4s, v4.s[0]\n" + "fmla v29.4s, v7.4s, v5.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "fmla v26.4s, v6.4s, v4.s[0]\n" + "fmla v30.4s, v6.4s, v5.s[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "fmla v27.4s, v7.4s, v4.s[0]\n" + "fmla v31.4s, v7.4s, v5.s[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "fmla v16.4s, v6.4s, v2.s[1]\n" + "fmla v20.4s, v6.4s, v3.s[1]\n" + "fmla v24.4s, v6.4s, v4.s[1]\n" + "fmla v28.4s, v6.4s, v5.s[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + "fmla v21.4s, v7.4s, v3.s[1]\n" + "fmla v25.4s, v7.4s, v4.s[1]\n" + "fmla v29.4s, v7.4s, v5.s[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v18.4s, v6.4s, v2.s[1]\n" + "fmla v22.4s, v6.4s, v3.s[1]\n" + "fmla v26.4s, v6.4s, v4.s[1]\n" + "fmla v30.4s, v6.4s, v5.s[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v19.4s, v7.4s, v2.s[1]\n" + "fmla v23.4s, v7.4s, v3.s[1]\n" + "fmla v27.4s, v7.4s, v4.s[1]\n" + "fmla v31.4s, v7.4s, v5.s[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "fmla v16.4s, v6.4s, v2.s[2]\n" + "fmla v20.4s, v6.4s, v3.s[2]\n" + "fmla v24.4s, v6.4s, v4.s[2]\n" + "fmla v28.4s, v6.4s, v5.s[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, 
v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "fmla v17.4s, v7.4s, v2.s[2]\n" + "fmla v21.4s, v7.4s, v3.s[2]\n" + "fmla v25.4s, v7.4s, v4.s[2]\n" + "fmla v29.4s, v7.4s, v5.s[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "fmla v22.4s, v6.4s, v3.s[2]\n" + "fmla v26.4s, v6.4s, v4.s[2]\n" + "fmla v30.4s, v6.4s, v5.s[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[2]\n" + "fmla v23.4s, v7.4s, v3.s[2]\n" + "fmla v27.4s, v7.4s, v4.s[2]\n" + "fmla v31.4s, v7.4s, v5.s[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[3]\n" + "fmla v20.4s, v6.4s, v3.s[3]\n" + "fmla v24.4s, v6.4s, v4.s[3]\n" + "fmla v28.4s, v6.4s, v5.s[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "fmla v21.4s, v7.4s, v3.s[3]\n" + "fmla v25.4s, v7.4s, v4.s[3]\n" + "fmla v29.4s, v7.4s, v5.s[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "add x15, x15, #0x100\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla v22.4s, v6.4s, v3.s[3]\n" + "fmla v26.4s, v6.4s, v4.s[3]\n" + "fmla v30.4s, v6.4s, v5.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v23.4s, v7.4s, v3.s[3]\n" + "fmla v27.4s, v7.4s, v4.s[3]\n" + "fmla v31.4s, v7.4s, v5.s[3]\n" + "bge 189b\n" + "190:" // Height 6: Multiply loop: Single iteration only + "sub x11, x11, #0x4\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "add x28, x28, #0x10\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v24.4s, v6.4s, v4.s[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla v28.4s, v6.4s, v5.s[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "add x22, x22, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "add x20, x20, #0x10\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "fmla v25.4s, v7.4s, v4.s[0]\n" + "fmla v29.4s, v7.4s, v5.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "fmla v26.4s, v6.4s, v4.s[0]\n" + "fmla v30.4s, v6.4s, v5.s[0]\n" + "ldr q6, [x15, #0x40]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "fmla v27.4s, v7.4s, v4.s[0]\n" + "fmla v31.4s, v7.4s, v5.s[0]\n" + "ldr q7, [x15, #0x50]\n" + "fmla v8.4s, v6.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v1.s[1]\n" + "fmla v16.4s, v6.4s, v2.s[1]\n" + "fmla v20.4s, v6.4s, v3.s[1]\n" + "fmla v24.4s, v6.4s, v4.s[1]\n" + "fmla v28.4s, v6.4s, v5.s[1]\n" + "ldr q6, [x15, #0x60]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + "fmla v21.4s, v7.4s, v3.s[1]\n" + "fmla v25.4s, v7.4s, 
v4.s[1]\n" + "fmla v29.4s, v7.4s, v5.s[1]\n" + "ldr q7, [x15, #0x70]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v14.4s, v6.4s, v1.s[1]\n" + "fmla v18.4s, v6.4s, v2.s[1]\n" + "fmla v22.4s, v6.4s, v3.s[1]\n" + "fmla v26.4s, v6.4s, v4.s[1]\n" + "fmla v30.4s, v6.4s, v5.s[1]\n" + "ldr q6, [x15, #0x80]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v15.4s, v7.4s, v1.s[1]\n" + "fmla v19.4s, v7.4s, v2.s[1]\n" + "fmla v23.4s, v7.4s, v3.s[1]\n" + "fmla v27.4s, v7.4s, v4.s[1]\n" + "fmla v31.4s, v7.4s, v5.s[1]\n" + "ldr q7, [x15, #0x90]\n" + "fmla v8.4s, v6.4s, v0.s[2]\n" + "fmla v12.4s, v6.4s, v1.s[2]\n" + "fmla v16.4s, v6.4s, v2.s[2]\n" + "fmla v20.4s, v6.4s, v3.s[2]\n" + "fmla v24.4s, v6.4s, v4.s[2]\n" + "fmla v28.4s, v6.4s, v5.s[2]\n" + "ldr q6, [x15, #0xa0]\n" + "fmla v9.4s, v7.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v1.s[2]\n" + "fmla v17.4s, v7.4s, v2.s[2]\n" + "fmla v21.4s, v7.4s, v3.s[2]\n" + "fmla v25.4s, v7.4s, v4.s[2]\n" + "fmla v29.4s, v7.4s, v5.s[2]\n" + "ldr q7, [x15, #0xb0]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "fmla v22.4s, v6.4s, v3.s[2]\n" + "fmla v26.4s, v6.4s, v4.s[2]\n" + "fmla v30.4s, v6.4s, v5.s[2]\n" + "ldr q6, [x15, #0xc0]\n" + "fmla v11.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[2]\n" + "fmla v23.4s, v7.4s, v3.s[2]\n" + "fmla v27.4s, v7.4s, v4.s[2]\n" + "fmla v31.4s, v7.4s, v5.s[2]\n" + "ldr q7, [x15, #0xd0]\n" + "fmla v8.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[3]\n" + "fmla v20.4s, v6.4s, v3.s[3]\n" + "fmla v24.4s, v6.4s, v4.s[3]\n" + "fmla v28.4s, v6.4s, v5.s[3]\n" + "ldr q6, [x15, #0xe0]\n" + "fmla v9.4s, v7.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[3]\n" + "fmla v17.4s, v7.4s, v2.s[3]\n" + "fmla v21.4s, v7.4s, v3.s[3]\n" + "fmla v25.4s, v7.4s, v4.s[3]\n" + "fmla v29.4s, v7.4s, v5.s[3]\n" + "ldr q7, [x15, #0xf0]\n" + "fmla v10.4s, v6.4s, v0.s[3]\n" + "add x15, x15, #0x100\n" + "fmla v14.4s, v6.4s, v1.s[3]\n" + "fmla v18.4s, v6.4s, v2.s[3]\n" + "fmla v22.4s, v6.4s, v3.s[3]\n" + "fmla v26.4s, v6.4s, v4.s[3]\n" + "fmla v30.4s, v6.4s, v5.s[3]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "fmla v23.4s, v7.4s, v3.s[3]\n" + "fmla v27.4s, v7.4s, v4.s[3]\n" + "fmla v31.4s, v7.4s, v5.s[3]\n" + "191:" // Height 6: Multiply loop: Main loop skip + "cbz x11, 193f\n" + "192:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x20], #0x4\n" + "ldr q6, [x15, #0x0]\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "ldr q7, [x15, #0x10]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "sub x11, x11, #0x1\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "fmla v24.4s, v6.4s, v4.s[0]\n" + "fmla v28.4s, v6.4s, v5.s[0]\n" + "ldr q6, [x15, #0x20]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "fmla v17.4s, v7.4s, v2.s[0]\n" + "fmla v21.4s, v7.4s, v3.s[0]\n" + "fmla v25.4s, v7.4s, v4.s[0]\n" + "fmla v29.4s, v7.4s, v5.s[0]\n" + "ldr q7, [x15, #0x30]\n" + "fmla v10.4s, v6.4s, v0.s[0]\n" + "add x15, x15, #0x40\n" + "fmla v14.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v2.s[0]\n" + "fmla v22.4s, v6.4s, v3.s[0]\n" + "fmla v26.4s, v6.4s, v4.s[0]\n" + "fmla v30.4s, v6.4s, v5.s[0]\n" + "fmla v11.4s, v7.4s, v0.s[0]\n" + "fmla v15.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v2.s[0]\n" + "fmla v23.4s, v7.4s, v3.s[0]\n" + "fmla v27.4s, 
v7.4s, v4.s[0]\n" + "fmla v31.4s, v7.4s, v5.s[0]\n" + "cbnz x11, 192b\n" + "193:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 186b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 194f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v0.4s }, [x19]\n" + "fmin v8.4s, v8.4s, v0.4s\n" + "fmin v9.4s, v9.4s, v0.4s\n" + "fmin v10.4s, v10.4s, v0.4s\n" + "fmin v11.4s, v11.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v1.4s\n" + "fmax v9.4s, v9.4s, v1.4s\n" + "fmax v10.4s, v10.4s, v1.4s\n" + "fmax v11.4s, v11.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v0.4s\n" + "fmin v13.4s, v13.4s, v0.4s\n" + "fmin v14.4s, v14.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v1.4s\n" + "fmax v13.4s, v13.4s, v1.4s\n" + "fmax v14.4s, v14.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v23.4s, v23.4s, v1.4s\n" + "fmin v24.4s, v24.4s, v0.4s\n" + "fmin v25.4s, v25.4s, v0.4s\n" + "fmin v26.4s, v26.4s, v0.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmin v27.4s, v27.4s, v0.4s\n" + "fmin v28.4s, v28.4s, v0.4s\n" + "fmin v29.4s, v29.4s, v0.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "fmax v28.4s, v28.4s, v1.4s\n" + "fmax v29.4s, v29.4s, v1.4s\n" + "fmin v30.4s, v30.4s, v0.4s\n" + "fmin v31.4s, v31.4s, v0.4s\n" + "fmax v30.4s, v30.4s, v1.4s\n" + "fmax v31.4s, v31.4s, v1.4s\n" + "194:" // Height 6: No activation + "cmp x16, #0x10\n" + "bge 203f\n" + "tbz x16, #3, 198f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v25.4s }, [x23], #0x10\n" + "st1 { v28.4s }, [x21], #0x10\n" + "st1 { v29.4s }, [x21], #0x10\n" + "tbz x16, #2, 196f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "st1 { v22.4s }, [x25], #0x10\n" + "st1 { v26.4s }, [x23], #0x10\n" + "st1 { v30.4s }, [x21], #0x10\n" + "tbz x16, #1, 195f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "str d27, [x23], #0x8\n" + "str d31, [x21], #0x8\n" + "tbz x16, #0, 202f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "st1 { v23.s }[2], [x25]\n" + "st1 { v27.s }[2], [x23]\n" + "st1 { v31.s }[2], [x21]\n" + "b 202f\n" + "195:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x16, #0, 202f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "str s23, [x25, #0x0]\n" + "str s27, [x23, 
#0x0]\n" + "str s31, [x21, #0x0]\n" + "b 202f\n" + "196:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x16, #1, 197f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "str d26, [x23], #0x8\n" + "str d30, [x21], #0x8\n" + "tbz x16, #0, 202f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "st1 { v22.s }[2], [x25]\n" + "st1 { v26.s }[2], [x23]\n" + "st1 { v30.s }[2], [x21]\n" + "b 202f\n" + "197:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x16, #0, 202f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "str s22, [x25, #0x0]\n" + "str s26, [x23, #0x0]\n" + "str s30, [x21, #0x0]\n" + "b 202f\n" + "198:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x16, #2, 200f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v28.4s }, [x21], #0x10\n" + "tbz x16, #1, 199f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "str d25, [x23], #0x8\n" + "str d29, [x21], #0x8\n" + "tbz x16, #0, 202f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "st1 { v21.s }[2], [x25]\n" + "st1 { v25.s }[2], [x23]\n" + "st1 { v29.s }[2], [x21]\n" + "b 202f\n" + "199:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x16, #0, 202f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "str s21, [x25, #0x0]\n" + "str s25, [x23, #0x0]\n" + "str s29, [x21, #0x0]\n" + "b 202f\n" + "200:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x16, #1, 201f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "str d28, [x21], #0x8\n" + "tbz x16, #0, 202f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "st1 { v20.s }[2], [x25]\n" + "st1 { v24.s }[2], [x23]\n" + "st1 { v28.s }[2], [x21]\n" + "b 202f\n" + "201:" // Height 6: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "str s20, [x25, #0x0]\n" + "str s24, [x23, #0x0]\n" + "str s28, [x21, #0x0]\n" + "202:" // Height 6: Partial direct writeback: Done + "b 204f\n" + "203:" // Height 6: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "str q24, [x23, #0x0]\n" + "str q25, [x23, #0x10]\n" + "str q26, [x23, #0x20]\n" + "str q27, [x23, #0x30]\n" + "str q28, [x21, #0x0]\n" + "str q29, [x21, #0x10]\n" + "str q30, [x21, #0x20]\n" + "str q31, [x21, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "add x23, x23, #0x40\n" + "add x21, x21, #0x40\n" + "204:" // Height 6: Writeback done + "subs x16, x16, #0x10\n" + "bgt 173b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 206f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 205f\n" + "add x20, x20, #0x6\n" + "str 
x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "205:" // Update direct input + "mov x19, #0x18\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "206:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp new file mode 100644 index 0000000000..043d0643f0 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#pragma once +#ifdef __aarch64__ + +#include "../std_transforms_fixed.hpp" + +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg<float>, \ + size_t, size_t, \ + const float *, \ + IndirectOutputArg<float>, \ + const float *, Activation, bool + +namespace arm_gemm +{ + +// Actual kernel implementations +void a64_hybrid_fp32_mla_8x4( ARGLIST ); + +class cls_a64_hybrid_fp32_mla_8x4 +{ +public: + typedef float operand_type; + typedef float result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 8; + } + + static unsigned int out_width() + { + return 4; + } + + static constexpr unsigned int k_unroll() + { + return 1; + } + + static constexpr bool supports_accumulate() + { + return true; + } + + StdTransformsFixed<operand_type, result_type, 8, 4, 1> transforms = {}; + + // Default to the generic kernel + kern_type kernel=a64_hybrid_fp32_mla_8x4; + + cls_a64_hybrid_fp32_mla_8x4(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp new file mode 100644 index 0000000000..3ab6cad368 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp @@ -0,0 +1,2195 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE.
+ */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> + +namespace arm_gemm { + +void a64_hybrid_fp32_mla_8x4 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg, + size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg, + const float *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); + float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const float *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast<float>(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x8\n" + "bge 155f\n" + "cmp %x[M], #0x6\n" + "bgt 133f\n" + "beq 111f\n" + "cmp %x[M], #0x4\n" + "bgt 89f\n" + "beq 67f\n" + "cmp %x[M], #0x2\n" + "bgt 45f\n" + "beq 23f\n" + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x8, %x[bias]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x17, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "cbz x8, 4f\n" + "ldr q24, [x8, #0x0]\n" + "add x8, x8, #0x10\n" + "b 9f\n" + "4:" // Height 1: no bias + "tbz %x[flags], #0, 8f\n" + "cmp x6, #0x4\n" + "bge 7f\n" + "tbz x6, #1, 5f\n" + "ldr d24, [x17], #0x8\n" + "mov x19, #0x8\n" + "tbz x6, #0, 6f\n" + "ld1 { v24.s }[2], [x17]\n" + "b 6f\n" + "5:" // Height 1: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s24, [x17, #0x0]\n" + "6:" // Height 1: Partial accumulate: Done + "sub x17, x17, x19\n" + "b 9f\n" + "7:" // Height 1: full accumulate + "ldr q24, [x17, #0x0]\n" + "b 9f\n" + "8:" // Height 1: no accumulate + "movi v24.16b, #0x0\n" + "9:" // Height 1: setup done + "mov x16, #0x0\n" + "10:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 11f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "cbnz x16, 12f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "b 12f\n" + "11:" // Height 1: setup direct input + "mov x14, %x[input_ptr]\n" + "12:" // Height 1:
input setup done + "cmp x15, #0x4\n" + "blt 15f\n" + "cmp x15, #0x8\n" + "blt 14f\n" + "13:" // Height 1: Multiply loop: Main loop head + "ldr q0, [x14, #0x0]\n" + "ldr q8, [x7, #0x0]\n" + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x7, #0x10]\n" + "ldr q10, [x7, #0x20]\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "ldr q11, [x7, #0x30]\n" + "add x14, x14, #0x10\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "prfm pldl1keep, [x14, #0x80]\n" + "sub x15, x15, #0x4\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "cmp x15, #0x8\n" + "add x7, x7, #0x40\n" + "bge 13b\n" + "14:" // Height 1: Multiply loop: Single iteration only + "sub x15, x15, #0x4\n" + "ldr q0, [x14, #0x0]\n" + "ldr q12, [x7, #0x0]\n" + "fmla v24.4s, v12.4s, v0.s[0]\n" + "ldr q13, [x7, #0x10]\n" + "ldr q14, [x7, #0x20]\n" + "fmla v24.4s, v13.4s, v0.s[1]\n" + "ldr q15, [x7, #0x30]\n" + "add x14, x14, #0x10\n" + "fmla v24.4s, v14.4s, v0.s[2]\n" + "prfm pldl1keep, [x14, #0x80]\n" + "add x7, x7, #0x40\n" + "fmla v24.4s, v15.4s, v0.s[3]\n" + "15:" // Height 1: Multiply loop: Main loop skip + "cbz x15, 17f\n" + "16:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x14], #0x4\n" + "ldr q16, [x7, #0x0]\n" + "fmla v24.4s, v16.4s, v0.s[0]\n" + "sub x15, x15, #0x1\n" + "add x7, x7, #0x10\n" + "cbnz x15, 16b\n" + "17:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x16, x16, #0x1\n" + "cmp x16, x19\n" + "bne 10b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "tbz %x[flags], #1, 18f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "18:" // Height 1: No activation + "cmp x6, #0x4\n" + "bge 21f\n" + "tbz x6, #1, 19f\n" + "str d24, [x17], #0x8\n" + "tbz x6, #0, 20f\n" + "st1 { v24.s }[2], [x17]\n" + "b 20f\n" + "19:" // Height 1: Partial direct writeback: partial_1_0 + "str s24, [x17, #0x0]\n" + "20:" // Height 1: Partial direct writeback: Done + "b 22f\n" + "21:" // Height 1: Full writeback + "str q24, [x17, #0x0]\n" + "add x17, x17, #0x10\n" + "22:" // Height 1: Writeback done + "subs x6, x6, #0x4\n" + "bgt 3b\n" + "b 178f\n" + "23:" // Height 2 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 24f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "add x13, x13, x19, LSL #2\n" + "b 25f\n" + "24:" // Height 2: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "25:" // Height 2: Column loop + "cbz x8, 26f\n" + "ldr q24, [x8, #0x0]\n" + "mov v25.16b, v24.16b\n" + "add x8, x8, #0x10\n" + "b 31f\n" + "26:" // Height 2: no bias + "tbz %x[flags], #0, 30f\n" + "cmp x6, #0x4\n" + "bge 29f\n" + "tbz x6, #1, 27f\n" + "ldr d24, [x17], #0x8\n" + "ldr d25, [x13], #0x8\n" + "mov x19, #0x8\n" + "tbz x6, #0, 28f\n" + "ld1 { v24.s }[2], [x17]\n" + "ld1 { v25.s }[2], [x13]\n" + "b 28f\n" + "27:" // Height 2: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s24, [x17, #0x0]\n" + "ldr s25, [x13, #0x0]\n" + "28:" // Height 2: Partial accumulate: Done + "sub x17, x17, x19\n" + "sub x13, x13, x19\n" + "b 31f\n" + "29:" // Height 2: full accumulate + "ldr q24, [x17, #0x0]\n" + "ldr q25, [x13, #0x0]\n" + "b 31f\n" + "30:" // Height 2: no accumulate + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "31:" // Height 2: 
setup done + "mov x16, #0x0\n" + "32:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 33f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "cbnz x16, 34f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "b 34f\n" + "33:" // Height 2: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "34:" // Height 2: input setup done + "cmp x15, #0x4\n" + "blt 37f\n" + "cmp x15, #0x8\n" + "blt 36f\n" + "35:" // Height 2: Multiply loop: Main loop head + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q8, [x7, #0x0]\n" + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x7, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x7, #0x20]\n" + "ldr q11, [x7, #0x30]\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "add x14, x14, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "add x12, x12, #0x10\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "sub x15, x15, #0x4\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "cmp x15, #0x8\n" + "add x7, x7, #0x40\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "bge 35b\n" + "36:" // Height 2: Multiply loop: Single iteration only + "sub x15, x15, #0x4\n" + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q12, [x7, #0x0]\n" + "fmla v24.4s, v12.4s, v0.s[0]\n" + "ldr q13, [x7, #0x10]\n" + "fmla v25.4s, v12.4s, v1.s[0]\n" + "ldr q14, [x7, #0x20]\n" + "ldr q15, [x7, #0x30]\n" + "fmla v24.4s, v13.4s, v0.s[1]\n" + "add x14, x14, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "fmla v25.4s, v13.4s, v1.s[1]\n" + "add x12, x12, #0x10\n" + "fmla v24.4s, v14.4s, v0.s[2]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "add x7, x7, #0x40\n" + "fmla v25.4s, v14.4s, v1.s[2]\n" + "fmla v24.4s, v15.4s, v0.s[3]\n" + "fmla v25.4s, v15.4s, v1.s[3]\n" + "37:" // Height 2: Multiply loop: Main loop skip + "cbz x15, 39f\n" + "38:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x14], #0x4\n" + "ldr s1, [x12], #0x4\n" + "ldr q16, [x7, #0x0]\n" + "fmla v24.4s, v16.4s, v0.s[0]\n" + "sub x15, x15, #0x1\n" + "fmla v25.4s, v16.4s, v1.s[0]\n" + "add x7, x7, #0x10\n" + "cbnz x15, 38b\n" + "39:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x16, x16, #0x1\n" + "cmp x16, x19\n" + "bne 32b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "tbz %x[flags], #1, 40f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "40:" // Height 2: No activation + "cmp x6, #0x4\n" + "bge 43f\n" + "tbz x6, #1, 41f\n" + "str d24, [x17], #0x8\n" + "str d25, [x13], #0x8\n" + "tbz x6, #0, 42f\n" + "st1 { v24.s }[2], [x17]\n" + "st1 { v25.s }[2], [x13]\n" + "b 42f\n" + "41:" // Height 2: Partial direct writeback: partial_1_0 + "str s24, [x17, #0x0]\n" + "str s25, [x13, #0x0]\n" + "42:" // Height 2: Partial direct writeback: Done + "b 44f\n" + "43:" // Height 2: Full writeback + "str q24, [x17, #0x0]\n" + "str q25, [x13, #0x0]\n" + "add x17, x17, #0x10\n" + "add x13, x13, #0x10\n" + "44:" // Height 
2: Writeback done + "subs x6, x6, #0x4\n" + "bgt 25b\n" + "b 178f\n" + "45:" // Height 3 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 46f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "ldr x11, [%x[output_ptr], #0x10]\n" + "add x13, x13, x19, LSL #2\n" + "add x11, x11, x19, LSL #2\n" + "b 47f\n" + "46:" // Height 3: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "add x11, x13, x19, LSL #2\n" + "47:" // Height 3: Column loop + "cbz x8, 48f\n" + "ldr q24, [x8, #0x0]\n" + "mov v25.16b, v24.16b\n" + "add x8, x8, #0x10\n" + "mov v26.16b, v24.16b\n" + "b 53f\n" + "48:" // Height 3: no bias + "tbz %x[flags], #0, 52f\n" + "cmp x6, #0x4\n" + "bge 51f\n" + "tbz x6, #1, 49f\n" + "ldr d24, [x17], #0x8\n" + "ldr d25, [x13], #0x8\n" + "ldr d26, [x11], #0x8\n" + "mov x19, #0x8\n" + "tbz x6, #0, 50f\n" + "ld1 { v24.s }[2], [x17]\n" + "ld1 { v25.s }[2], [x13]\n" + "ld1 { v26.s }[2], [x11]\n" + "b 50f\n" + "49:" // Height 3: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s24, [x17, #0x0]\n" + "ldr s25, [x13, #0x0]\n" + "ldr s26, [x11, #0x0]\n" + "50:" // Height 3: Partial accumulate: Done + "sub x17, x17, x19\n" + "sub x13, x13, x19\n" + "sub x11, x11, x19\n" + "b 53f\n" + "51:" // Height 3: full accumulate + "ldr q24, [x17, #0x0]\n" + "ldr q25, [x13, #0x0]\n" + "ldr q26, [x11, #0x0]\n" + "b 53f\n" + "52:" // Height 3: no accumulate + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "53:" // Height 3: setup done + "mov x16, #0x0\n" + "54:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 55f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "cbnz x16, 56f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "add x10, x10, x19, LSL #2\n" + "b 56f\n" + "55:" // Height 3: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "add x10, x12, x19, LSL #2\n" + "56:" // Height 3: input setup done + "cmp x15, #0x4\n" + "blt 59f\n" + "cmp x15, #0x8\n" + "blt 58f\n" + "57:" // Height 3: Multiply loop: Main loop head + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q8, [x7, #0x0]\n" + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x7, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x7, #0x20]\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr q11, [x7, #0x30]\n" + "add x14, x14, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "prfm pldl1keep, [x14, #0x80]\n" + "add x12, x12, #0x10\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "add x10, x10, #0x10\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "sub x15, x15, #0x4\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "cmp x15, #0x8\n" + "add x7, x7, #0x40\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "bge 57b\n" + "58:" // Height 3: Multiply loop: Single iteration only + "sub x15, x15, #0x4\n" + "ldr q0, [x14, #0x0]\n" + 
"ldr q1, [x12, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q12, [x7, #0x0]\n" + "fmla v24.4s, v12.4s, v0.s[0]\n" + "ldr q13, [x7, #0x10]\n" + "fmla v25.4s, v12.4s, v1.s[0]\n" + "ldr q14, [x7, #0x20]\n" + "fmla v26.4s, v12.4s, v2.s[0]\n" + "ldr q15, [x7, #0x30]\n" + "add x14, x14, #0x10\n" + "fmla v24.4s, v13.4s, v0.s[1]\n" + "prfm pldl1keep, [x14, #0x80]\n" + "add x12, x12, #0x10\n" + "fmla v25.4s, v13.4s, v1.s[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "add x10, x10, #0x10\n" + "fmla v26.4s, v13.4s, v2.s[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x7, x7, #0x40\n" + "fmla v24.4s, v14.4s, v0.s[2]\n" + "fmla v25.4s, v14.4s, v1.s[2]\n" + "fmla v26.4s, v14.4s, v2.s[2]\n" + "fmla v24.4s, v15.4s, v0.s[3]\n" + "fmla v25.4s, v15.4s, v1.s[3]\n" + "fmla v26.4s, v15.4s, v2.s[3]\n" + "59:" // Height 3: Multiply loop: Main loop skip + "cbz x15, 61f\n" + "60:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x14], #0x4\n" + "ldr s1, [x12], #0x4\n" + "ldr s2, [x10], #0x4\n" + "ldr q16, [x7, #0x0]\n" + "fmla v24.4s, v16.4s, v0.s[0]\n" + "sub x15, x15, #0x1\n" + "fmla v25.4s, v16.4s, v1.s[0]\n" + "add x7, x7, #0x10\n" + "fmla v26.4s, v16.4s, v2.s[0]\n" + "cbnz x15, 60b\n" + "61:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x16, x16, #0x1\n" + "cmp x16, x19\n" + "bne 54b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "tbz %x[flags], #1, 62f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "62:" // Height 3: No activation + "cmp x6, #0x4\n" + "bge 65f\n" + "tbz x6, #1, 63f\n" + "str d24, [x17], #0x8\n" + "str d25, [x13], #0x8\n" + "str d26, [x11], #0x8\n" + "tbz x6, #0, 64f\n" + "st1 { v24.s }[2], [x17]\n" + "st1 { v25.s }[2], [x13]\n" + "st1 { v26.s }[2], [x11]\n" + "b 64f\n" + "63:" // Height 3: Partial direct writeback: partial_1_0 + "str s24, [x17, #0x0]\n" + "str s25, [x13, #0x0]\n" + "str s26, [x11, #0x0]\n" + "64:" // Height 3: Partial direct writeback: Done + "b 66f\n" + "65:" // Height 3: Full writeback + "str q24, [x17, #0x0]\n" + "str q25, [x13, #0x0]\n" + "str q26, [x11, #0x0]\n" + "add x17, x17, #0x10\n" + "add x13, x13, #0x10\n" + "add x11, x11, #0x10\n" + "66:" // Height 3: Writeback done + "subs x6, x6, #0x4\n" + "bgt 47b\n" + "b 178f\n" + "67:" // Height 4 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 68f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "ldr x11, [%x[output_ptr], #0x10]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x18]\n" + "add x11, x11, x19, LSL #2\n" + "add x9, x9, x19, LSL #2\n" + "b 69f\n" + "68:" // Height 4: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "add x11, x13, x19, LSL #2\n" + "add x9, x11, x19, LSL #2\n" + "69:" // Height 4: Column loop + "cbz x8, 70f\n" + "ldr q24, [x8, #0x0]\n" + "mov v25.16b, v24.16b\n" + "add x8, x8, #0x10\n" + "mov v26.16b, v24.16b\n" + "mov v27.16b, v24.16b\n" + "b 75f\n" + "70:" // Height 4: no bias + "tbz %x[flags], #0, 74f\n" + "cmp x6, 
#0x4\n" + "bge 73f\n" + "tbz x6, #1, 71f\n" + "ldr d24, [x17], #0x8\n" + "ldr d25, [x13], #0x8\n" + "ldr d26, [x11], #0x8\n" + "ldr d27, [x9], #0x8\n" + "mov x19, #0x8\n" + "tbz x6, #0, 72f\n" + "ld1 { v24.s }[2], [x17]\n" + "ld1 { v25.s }[2], [x13]\n" + "ld1 { v26.s }[2], [x11]\n" + "ld1 { v27.s }[2], [x9]\n" + "b 72f\n" + "71:" // Height 4: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s24, [x17, #0x0]\n" + "ldr s25, [x13, #0x0]\n" + "ldr s26, [x11, #0x0]\n" + "ldr s27, [x9, #0x0]\n" + "72:" // Height 4: Partial accumulate: Done + "sub x17, x17, x19\n" + "sub x13, x13, x19\n" + "sub x11, x11, x19\n" + "sub x9, x9, x19\n" + "b 75f\n" + "73:" // Height 4: full accumulate + "ldr q24, [x17, #0x0]\n" + "ldr q25, [x13, #0x0]\n" + "ldr q26, [x11, #0x0]\n" + "ldr q27, [x9, #0x0]\n" + "b 75f\n" + "74:" // Height 4: no accumulate + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "75:" // Height 4: setup done + "mov x16, #0x0\n" + "76:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 77f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "ldr x28, [x20, #0x18]\n" + "cbnz x16, 78f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "b 78f\n" + "77:" // Height 4: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "add x10, x12, x19, LSL #2\n" + "add x28, x10, x19, LSL #2\n" + "78:" // Height 4: input setup done + "cmp x15, #0x4\n" + "blt 81f\n" + "cmp x15, #0x8\n" + "blt 80f\n" + "79:" // Height 4: Multiply loop: Main loop head + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q3, [x28, #0x0]\n" + "ldr q8, [x7, #0x0]\n" + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x7, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x7, #0x20]\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr q11, [x7, #0x30]\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "add x14, x14, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "add x12, x12, #0x10\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "add x10, x10, #0x10\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "sub x15, x15, #0x4\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "cmp x15, #0x8\n" + "add x7, x7, #0x40\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "bge 79b\n" + "80:" // Height 4: Multiply loop: Single iteration only + "sub x15, x15, #0x4\n" + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q3, [x28, #0x0]\n" + "ldr q12, [x7, #0x0]\n" + "fmla v24.4s, v12.4s, v0.s[0]\n" + "ldr q13, [x7, #0x10]\n" + "fmla v25.4s, v12.4s, v1.s[0]\n" + "ldr q14, [x7, #0x20]\n" + "fmla v26.4s, v12.4s, v2.s[0]\n" + "ldr q15, [x7, #0x30]\n" + "fmla v27.4s, v12.4s, v3.s[0]\n" + "add x14, x14, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "fmla v24.4s, v13.4s, 
v0.s[1]\n" + "add x12, x12, #0x10\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla v25.4s, v13.4s, v1.s[1]\n" + "add x10, x10, #0x10\n" + "fmla v26.4s, v13.4s, v2.s[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v27.4s, v13.4s, v3.s[1]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x7, x7, #0x40\n" + "fmla v24.4s, v14.4s, v0.s[2]\n" + "fmla v25.4s, v14.4s, v1.s[2]\n" + "fmla v26.4s, v14.4s, v2.s[2]\n" + "fmla v27.4s, v14.4s, v3.s[2]\n" + "fmla v24.4s, v15.4s, v0.s[3]\n" + "fmla v25.4s, v15.4s, v1.s[3]\n" + "fmla v26.4s, v15.4s, v2.s[3]\n" + "fmla v27.4s, v15.4s, v3.s[3]\n" + "81:" // Height 4: Multiply loop: Main loop skip + "cbz x15, 83f\n" + "82:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x14], #0x4\n" + "ldr s1, [x12], #0x4\n" + "ldr s2, [x10], #0x4\n" + "ldr s3, [x28], #0x4\n" + "ldr q16, [x7, #0x0]\n" + "fmla v24.4s, v16.4s, v0.s[0]\n" + "sub x15, x15, #0x1\n" + "fmla v25.4s, v16.4s, v1.s[0]\n" + "add x7, x7, #0x10\n" + "fmla v26.4s, v16.4s, v2.s[0]\n" + "fmla v27.4s, v16.4s, v3.s[0]\n" + "cbnz x15, 82b\n" + "83:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x16, x16, #0x1\n" + "cmp x16, x19\n" + "bne 76b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "tbz %x[flags], #1, 84f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v17.4s\n" + "84:" // Height 4: No activation + "cmp x6, #0x4\n" + "bge 87f\n" + "tbz x6, #1, 85f\n" + "str d24, [x17], #0x8\n" + "str d25, [x13], #0x8\n" + "str d26, [x11], #0x8\n" + "str d27, [x9], #0x8\n" + "tbz x6, #0, 86f\n" + "st1 { v24.s }[2], [x17]\n" + "st1 { v25.s }[2], [x13]\n" + "st1 { v26.s }[2], [x11]\n" + "st1 { v27.s }[2], [x9]\n" + "b 86f\n" + "85:" // Height 4: Partial direct writeback: partial_1_0 + "str s24, [x17, #0x0]\n" + "str s25, [x13, #0x0]\n" + "str s26, [x11, #0x0]\n" + "str s27, [x9, #0x0]\n" + "86:" // Height 4: Partial direct writeback: Done + "b 88f\n" + "87:" // Height 4: Full writeback + "str q24, [x17, #0x0]\n" + "str q25, [x13, #0x0]\n" + "str q26, [x11, #0x0]\n" + "str q27, [x9, #0x0]\n" + "add x17, x17, #0x10\n" + "add x13, x13, #0x10\n" + "add x11, x11, #0x10\n" + "add x9, x9, #0x10\n" + "88:" // Height 4: Writeback done + "subs x6, x6, #0x4\n" + "bgt 69b\n" + "b 178f\n" + "89:" // Height 5 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 90f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "ldr x11, [%x[output_ptr], #0x10]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x18]\n" + "ldr x27, [%x[output_ptr], #0x20]\n" + "add x11, x11, x19, LSL #2\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "b 91f\n" + "90:" // Height 5: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "add x11, x13, x19, LSL #2\n" + "add x9, x11, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "91:" // Height 5: Column loop + "cbz x8, 92f\n" + 
"ldr q24, [x8, #0x0]\n" + "mov v25.16b, v24.16b\n" + "add x8, x8, #0x10\n" + "mov v26.16b, v24.16b\n" + "mov v27.16b, v24.16b\n" + "mov v28.16b, v24.16b\n" + "b 97f\n" + "92:" // Height 5: no bias + "tbz %x[flags], #0, 96f\n" + "cmp x6, #0x4\n" + "bge 95f\n" + "tbz x6, #1, 93f\n" + "ldr d24, [x17], #0x8\n" + "ldr d25, [x13], #0x8\n" + "ldr d26, [x11], #0x8\n" + "ldr d27, [x9], #0x8\n" + "ldr d28, [x27], #0x8\n" + "mov x19, #0x8\n" + "tbz x6, #0, 94f\n" + "ld1 { v24.s }[2], [x17]\n" + "ld1 { v25.s }[2], [x13]\n" + "ld1 { v26.s }[2], [x11]\n" + "ld1 { v27.s }[2], [x9]\n" + "ld1 { v28.s }[2], [x27]\n" + "b 94f\n" + "93:" // Height 5: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s24, [x17, #0x0]\n" + "ldr s25, [x13, #0x0]\n" + "ldr s26, [x11, #0x0]\n" + "ldr s27, [x9, #0x0]\n" + "ldr s28, [x27, #0x0]\n" + "94:" // Height 5: Partial accumulate: Done + "sub x17, x17, x19\n" + "sub x13, x13, x19\n" + "sub x11, x11, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "b 97f\n" + "95:" // Height 5: full accumulate + "ldr q24, [x17, #0x0]\n" + "ldr q25, [x13, #0x0]\n" + "ldr q26, [x11, #0x0]\n" + "ldr q27, [x9, #0x0]\n" + "ldr q28, [x27, #0x0]\n" + "b 97f\n" + "96:" // Height 5: no accumulate + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "97:" // Height 5: setup done + "mov x16, #0x0\n" + "98:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 99f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "ldr x28, [x20, #0x18]\n" + "ldr x26, [x20, #0x20]\n" + "cbnz x16, 100f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "b 100f\n" + "99:" // Height 5: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "add x10, x12, x19, LSL #2\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "100:" // Height 5: input setup done + "cmp x15, #0x4\n" + "blt 103f\n" + "cmp x15, #0x8\n" + "blt 102f\n" + "101:" // Height 5: Multiply loop: Main loop head + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q3, [x28, #0x0]\n" + "ldr q4, [x26, #0x0]\n" + "ldr q8, [x7, #0x0]\n" + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x7, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x7, #0x20]\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr q11, [x7, #0x30]\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "add x14, x14, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "fmla v28.4s, v8.4s, v4.s[0]\n" + "add x12, x12, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "add x10, x10, #0x10\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "sub x15, x15, #0x4\n" + "fmla v28.4s, v9.4s, v4.s[1]\n" + "cmp x15, #0x8\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "add x7, x7, #0x40\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "fmla v28.4s, v10.4s, 
v4.s[2]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "fmla v28.4s, v11.4s, v4.s[3]\n" + "bge 101b\n" + "102:" // Height 5: Multiply loop: Single iteration only + "sub x15, x15, #0x4\n" + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q3, [x28, #0x0]\n" + "ldr q4, [x26, #0x0]\n" + "ldr q12, [x7, #0x0]\n" + "fmla v24.4s, v12.4s, v0.s[0]\n" + "ldr q13, [x7, #0x10]\n" + "fmla v25.4s, v12.4s, v1.s[0]\n" + "ldr q14, [x7, #0x20]\n" + "fmla v26.4s, v12.4s, v2.s[0]\n" + "ldr q15, [x7, #0x30]\n" + "fmla v27.4s, v12.4s, v3.s[0]\n" + "add x14, x14, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "fmla v28.4s, v12.4s, v4.s[0]\n" + "add x12, x12, #0x10\n" + "fmla v24.4s, v13.4s, v0.s[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "add x10, x10, #0x10\n" + "fmla v25.4s, v13.4s, v1.s[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v26.4s, v13.4s, v2.s[1]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v27.4s, v13.4s, v3.s[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x7, x7, #0x40\n" + "fmla v28.4s, v13.4s, v4.s[1]\n" + "fmla v24.4s, v14.4s, v0.s[2]\n" + "fmla v25.4s, v14.4s, v1.s[2]\n" + "fmla v26.4s, v14.4s, v2.s[2]\n" + "fmla v27.4s, v14.4s, v3.s[2]\n" + "fmla v28.4s, v14.4s, v4.s[2]\n" + "fmla v24.4s, v15.4s, v0.s[3]\n" + "fmla v25.4s, v15.4s, v1.s[3]\n" + "fmla v26.4s, v15.4s, v2.s[3]\n" + "fmla v27.4s, v15.4s, v3.s[3]\n" + "fmla v28.4s, v15.4s, v4.s[3]\n" + "103:" // Height 5: Multiply loop: Main loop skip + "cbz x15, 105f\n" + "104:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x14], #0x4\n" + "ldr s1, [x12], #0x4\n" + "ldr s2, [x10], #0x4\n" + "ldr s3, [x28], #0x4\n" + "ldr s4, [x26], #0x4\n" + "ldr q16, [x7, #0x0]\n" + "fmla v24.4s, v16.4s, v0.s[0]\n" + "sub x15, x15, #0x1\n" + "fmla v25.4s, v16.4s, v1.s[0]\n" + "add x7, x7, #0x10\n" + "fmla v26.4s, v16.4s, v2.s[0]\n" + "fmla v27.4s, v16.4s, v3.s[0]\n" + "fmla v28.4s, v16.4s, v4.s[0]\n" + "cbnz x15, 104b\n" + "105:" // Height 5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x16, x16, #0x1\n" + "cmp x16, x19\n" + "bne 98b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "tbz %x[flags], #1, 106f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v17.4s\n" + "fmin v28.4s, v28.4s, v16.4s\n" + "fmax v28.4s, v28.4s, v17.4s\n" + "106:" // Height 5: No activation + "cmp x6, #0x4\n" + "bge 109f\n" + "tbz x6, #1, 107f\n" + "str d24, [x17], #0x8\n" + "str d25, [x13], #0x8\n" + "str d26, [x11], #0x8\n" + "str d27, [x9], #0x8\n" + "str d28, [x27], #0x8\n" + "tbz x6, #0, 108f\n" + "st1 { v24.s }[2], [x17]\n" + "st1 { v25.s }[2], [x13]\n" + "st1 { v26.s }[2], [x11]\n" + "st1 { v27.s }[2], [x9]\n" + "st1 { v28.s }[2], [x27]\n" + "b 108f\n" + "107:" // Height 5: Partial direct writeback: partial_1_0 + "str s24, [x17, #0x0]\n" + "str s25, [x13, #0x0]\n" + "str s26, [x11, #0x0]\n" + "str s27, [x9, #0x0]\n" + "str s28, [x27, #0x0]\n" + "108:" // Height 5: 
Partial direct writeback: Done + "b 110f\n" + "109:" // Height 5: Full writeback + "str q24, [x17, #0x0]\n" + "str q25, [x13, #0x0]\n" + "str q26, [x11, #0x0]\n" + "str q27, [x9, #0x0]\n" + "str q28, [x27, #0x0]\n" + "add x17, x17, #0x10\n" + "add x13, x13, #0x10\n" + "add x11, x11, #0x10\n" + "add x9, x9, #0x10\n" + "add x27, x27, #0x10\n" + "110:" // Height 5: Writeback done + "subs x6, x6, #0x4\n" + "bgt 91b\n" + "b 178f\n" + "111:" // Height 6 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 112f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "ldr x11, [%x[output_ptr], #0x10]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x18]\n" + "ldr x27, [%x[output_ptr], #0x20]\n" + "add x11, x11, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x28]\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "b 113f\n" + "112:" // Height 6: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "add x11, x13, x19, LSL #2\n" + "add x9, x11, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "113:" // Height 6: Column loop + "cbz x8, 114f\n" + "ldr q24, [x8, #0x0]\n" + "mov v25.16b, v24.16b\n" + "add x8, x8, #0x10\n" + "mov v26.16b, v24.16b\n" + "mov v27.16b, v24.16b\n" + "mov v28.16b, v24.16b\n" + "mov v29.16b, v24.16b\n" + "b 119f\n" + "114:" // Height 6: no bias + "tbz %x[flags], #0, 118f\n" + "cmp x6, #0x4\n" + "bge 117f\n" + "tbz x6, #1, 115f\n" + "ldr d24, [x17], #0x8\n" + "ldr d25, [x13], #0x8\n" + "ldr d26, [x11], #0x8\n" + "ldr d27, [x9], #0x8\n" + "ldr d28, [x27], #0x8\n" + "ldr d29, [x25], #0x8\n" + "mov x19, #0x8\n" + "tbz x6, #0, 116f\n" + "ld1 { v24.s }[2], [x17]\n" + "ld1 { v25.s }[2], [x13]\n" + "ld1 { v26.s }[2], [x11]\n" + "ld1 { v27.s }[2], [x9]\n" + "ld1 { v28.s }[2], [x27]\n" + "ld1 { v29.s }[2], [x25]\n" + "b 116f\n" + "115:" // Height 6: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s24, [x17, #0x0]\n" + "ldr s25, [x13, #0x0]\n" + "ldr s26, [x11, #0x0]\n" + "ldr s27, [x9, #0x0]\n" + "ldr s28, [x27, #0x0]\n" + "ldr s29, [x25, #0x0]\n" + "116:" // Height 6: Partial accumulate: Done + "sub x17, x17, x19\n" + "sub x13, x13, x19\n" + "sub x11, x11, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "b 119f\n" + "117:" // Height 6: full accumulate + "ldr q24, [x17, #0x0]\n" + "ldr q25, [x13, #0x0]\n" + "ldr q26, [x11, #0x0]\n" + "ldr q27, [x9, #0x0]\n" + "ldr q28, [x27, #0x0]\n" + "ldr q29, [x25, #0x0]\n" + "b 119f\n" + "118:" // Height 6: no accumulate + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "119:" // Height 6: setup done + "mov x16, #0x0\n" + "120:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 121f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "ldr x28, [x20, #0x18]\n" + "ldr x26, [x20, #0x20]\n" + "ldr x24, [x20, #0x28]\n" + "cbnz x16, 122f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, 
x12, x19, LSL #2\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "b 122f\n" + "121:" // Height 6: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "add x10, x12, x19, LSL #2\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "122:" // Height 6: input setup done + "cmp x15, #0x4\n" + "blt 125f\n" + "cmp x15, #0x8\n" + "blt 124f\n" + "123:" // Height 6: Multiply loop: Main loop head + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q3, [x28, #0x0]\n" + "ldr q4, [x26, #0x0]\n" + "ldr q5, [x24, #0x0]\n" + "ldr q8, [x7, #0x0]\n" + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x7, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x7, #0x20]\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr q11, [x7, #0x30]\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "add x14, x14, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "fmla v28.4s, v8.4s, v4.s[0]\n" + "add x12, x12, #0x10\n" + "fmla v29.4s, v8.4s, v5.s[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "add x10, x10, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sub x15, x15, #0x4\n" + "fmla v28.4s, v9.4s, v4.s[1]\n" + "cmp x15, #0x8\n" + "fmla v29.4s, v9.4s, v5.s[1]\n" + "add x7, x7, #0x40\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "fmla v28.4s, v10.4s, v4.s[2]\n" + "fmla v29.4s, v10.4s, v5.s[2]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "fmla v28.4s, v11.4s, v4.s[3]\n" + "fmla v29.4s, v11.4s, v5.s[3]\n" + "bge 123b\n" + "124:" // Height 6: Multiply loop: Single iteration only + "sub x15, x15, #0x4\n" + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q3, [x28, #0x0]\n" + "ldr q4, [x26, #0x0]\n" + "ldr q5, [x24, #0x0]\n" + "ldr q12, [x7, #0x0]\n" + "fmla v24.4s, v12.4s, v0.s[0]\n" + "ldr q13, [x7, #0x10]\n" + "fmla v25.4s, v12.4s, v1.s[0]\n" + "ldr q14, [x7, #0x20]\n" + "fmla v26.4s, v12.4s, v2.s[0]\n" + "ldr q15, [x7, #0x30]\n" + "fmla v27.4s, v12.4s, v3.s[0]\n" + "add x14, x14, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "fmla v28.4s, v12.4s, v4.s[0]\n" + "add x12, x12, #0x10\n" + "fmla v29.4s, v12.4s, v5.s[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "add x10, x10, #0x10\n" + "fmla v24.4s, v13.4s, v0.s[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v25.4s, v13.4s, v1.s[1]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v26.4s, v13.4s, v2.s[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla v27.4s, v13.4s, v3.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x7, x7, #0x40\n" + "fmla v28.4s, v13.4s, v4.s[1]\n" + "fmla v29.4s, v13.4s, v5.s[1]\n" + "fmla v24.4s, v14.4s, v0.s[2]\n" + "fmla v25.4s, v14.4s, v1.s[2]\n" + "fmla v26.4s, v14.4s, v2.s[2]\n" + "fmla v27.4s, v14.4s, v3.s[2]\n" + "fmla v28.4s, v14.4s, v4.s[2]\n" + "fmla v29.4s, v14.4s, v5.s[2]\n" + "fmla v24.4s, v15.4s, v0.s[3]\n" + "fmla v25.4s, v15.4s, v1.s[3]\n" + "fmla v26.4s, v15.4s, 
v2.s[3]\n" + "fmla v27.4s, v15.4s, v3.s[3]\n" + "fmla v28.4s, v15.4s, v4.s[3]\n" + "fmla v29.4s, v15.4s, v5.s[3]\n" + "125:" // Height 6: Multiply loop: Main loop skip + "cbz x15, 127f\n" + "126:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x14], #0x4\n" + "ldr s1, [x12], #0x4\n" + "ldr s2, [x10], #0x4\n" + "ldr s3, [x28], #0x4\n" + "ldr s4, [x26], #0x4\n" + "ldr s5, [x24], #0x4\n" + "ldr q16, [x7, #0x0]\n" + "fmla v24.4s, v16.4s, v0.s[0]\n" + "sub x15, x15, #0x1\n" + "fmla v25.4s, v16.4s, v1.s[0]\n" + "add x7, x7, #0x10\n" + "fmla v26.4s, v16.4s, v2.s[0]\n" + "fmla v27.4s, v16.4s, v3.s[0]\n" + "fmla v28.4s, v16.4s, v4.s[0]\n" + "fmla v29.4s, v16.4s, v5.s[0]\n" + "cbnz x15, 126b\n" + "127:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x16, x16, #0x1\n" + "cmp x16, x19\n" + "bne 120b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbz %x[flags], #1, 128f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v17.4s\n" + "fmin v28.4s, v28.4s, v16.4s\n" + "fmin v29.4s, v29.4s, v16.4s\n" + "fmax v28.4s, v28.4s, v17.4s\n" + "fmax v29.4s, v29.4s, v17.4s\n" + "128:" // Height 6: No activation + "cmp x6, #0x4\n" + "bge 131f\n" + "tbz x6, #1, 129f\n" + "str d24, [x17], #0x8\n" + "str d25, [x13], #0x8\n" + "str d26, [x11], #0x8\n" + "str d27, [x9], #0x8\n" + "str d28, [x27], #0x8\n" + "str d29, [x25], #0x8\n" + "tbz x6, #0, 130f\n" + "st1 { v24.s }[2], [x17]\n" + "st1 { v25.s }[2], [x13]\n" + "st1 { v26.s }[2], [x11]\n" + "st1 { v27.s }[2], [x9]\n" + "st1 { v28.s }[2], [x27]\n" + "st1 { v29.s }[2], [x25]\n" + "b 130f\n" + "129:" // Height 6: Partial direct writeback: partial_1_0 + "str s24, [x17, #0x0]\n" + "str s25, [x13, #0x0]\n" + "str s26, [x11, #0x0]\n" + "str s27, [x9, #0x0]\n" + "str s28, [x27, #0x0]\n" + "str s29, [x25, #0x0]\n" + "130:" // Height 6: Partial direct writeback: Done + "b 132f\n" + "131:" // Height 6: Full writeback + "str q24, [x17, #0x0]\n" + "str q25, [x13, #0x0]\n" + "str q26, [x11, #0x0]\n" + "str q27, [x9, #0x0]\n" + "str q28, [x27, #0x0]\n" + "str q29, [x25, #0x0]\n" + "add x17, x17, #0x10\n" + "add x13, x13, #0x10\n" + "add x11, x11, #0x10\n" + "add x9, x9, #0x10\n" + "add x27, x27, #0x10\n" + "add x25, x25, #0x10\n" + "132:" // Height 6: Writeback done + "subs x6, x6, #0x4\n" + "bgt 113b\n" + "b 178f\n" + "133:" // Height 7 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 134f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "ldr x11, [%x[output_ptr], #0x10]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x18]\n" + "ldr x27, [%x[output_ptr], #0x20]\n" + "add x11, x11, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x28]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x23, [%x[output_ptr], #0x30]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, 
x19, LSL #2\n" + "b 135f\n" + "134:" // Height 7: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "add x11, x13, x19, LSL #2\n" + "add x9, x11, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "135:" // Height 7: Column loop + "cbz x8, 136f\n" + "ldr q24, [x8, #0x0]\n" + "mov v25.16b, v24.16b\n" + "add x8, x8, #0x10\n" + "mov v26.16b, v24.16b\n" + "mov v27.16b, v24.16b\n" + "mov v28.16b, v24.16b\n" + "mov v29.16b, v24.16b\n" + "mov v30.16b, v24.16b\n" + "b 141f\n" + "136:" // Height 7: no bias + "tbz %x[flags], #0, 140f\n" + "cmp x6, #0x4\n" + "bge 139f\n" + "tbz x6, #1, 137f\n" + "ldr d24, [x17], #0x8\n" + "ldr d25, [x13], #0x8\n" + "ldr d26, [x11], #0x8\n" + "ldr d27, [x9], #0x8\n" + "ldr d28, [x27], #0x8\n" + "ldr d29, [x25], #0x8\n" + "ldr d30, [x23], #0x8\n" + "mov x19, #0x8\n" + "tbz x6, #0, 138f\n" + "ld1 { v24.s }[2], [x17]\n" + "ld1 { v25.s }[2], [x13]\n" + "ld1 { v26.s }[2], [x11]\n" + "ld1 { v27.s }[2], [x9]\n" + "ld1 { v28.s }[2], [x27]\n" + "ld1 { v29.s }[2], [x25]\n" + "ld1 { v30.s }[2], [x23]\n" + "b 138f\n" + "137:" // Height 7: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s24, [x17, #0x0]\n" + "ldr s25, [x13, #0x0]\n" + "ldr s26, [x11, #0x0]\n" + "ldr s27, [x9, #0x0]\n" + "ldr s28, [x27, #0x0]\n" + "ldr s29, [x25, #0x0]\n" + "ldr s30, [x23, #0x0]\n" + "138:" // Height 7: Partial accumulate: Done + "sub x17, x17, x19\n" + "sub x13, x13, x19\n" + "sub x11, x11, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "sub x23, x23, x19\n" + "b 141f\n" + "139:" // Height 7: full accumulate + "ldr q24, [x17, #0x0]\n" + "ldr q25, [x13, #0x0]\n" + "ldr q26, [x11, #0x0]\n" + "ldr q27, [x9, #0x0]\n" + "ldr q28, [x27, #0x0]\n" + "ldr q29, [x25, #0x0]\n" + "ldr q30, [x23, #0x0]\n" + "b 141f\n" + "140:" // Height 7: no accumulate + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "141:" // Height 7: setup done + "mov x16, #0x0\n" + "142:" // Height 7: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 143f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "ldr x28, [x20, #0x18]\n" + "ldr x26, [x20, #0x20]\n" + "ldr x24, [x20, #0x28]\n" + "ldr x22, [x20, #0x30]\n" + "cbnz x16, 144f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "b 144f\n" + "143:" // Height 7: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "add x10, x12, x19, LSL #2\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "add x22, x24, x19, LSL #2\n" + "144:" // Height 7: input setup done + "cmp x15, #0x4\n" + "blt 147f\n" + "cmp x15, #0x8\n" + "blt 146f\n" + "145:" // Height 7: Multiply loop: Main loop head + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q3, [x28, #0x0]\n" + "ldr q4, [x26, #0x0]\n" + "ldr q5, [x24, #0x0]\n" + "ldr q6, [x22, #0x0]\n" + "ldr q8, [x7, 
#0x0]\n" + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x7, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x7, #0x20]\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr q11, [x7, #0x30]\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "add x14, x14, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "fmla v28.4s, v8.4s, v4.s[0]\n" + "add x12, x12, #0x10\n" + "fmla v29.4s, v8.4s, v5.s[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "add x10, x10, #0x10\n" + "fmla v30.4s, v8.4s, v6.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x22, x22, #0x10\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sub x15, x15, #0x4\n" + "fmla v28.4s, v9.4s, v4.s[1]\n" + "cmp x15, #0x8\n" + "fmla v29.4s, v9.4s, v5.s[1]\n" + "add x7, x7, #0x40\n" + "fmla v30.4s, v9.4s, v6.s[1]\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "fmla v28.4s, v10.4s, v4.s[2]\n" + "fmla v29.4s, v10.4s, v5.s[2]\n" + "fmla v30.4s, v10.4s, v6.s[2]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "fmla v28.4s, v11.4s, v4.s[3]\n" + "fmla v29.4s, v11.4s, v5.s[3]\n" + "fmla v30.4s, v11.4s, v6.s[3]\n" + "bge 145b\n" + "146:" // Height 7: Multiply loop: Single iteration only + "sub x15, x15, #0x4\n" + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q3, [x28, #0x0]\n" + "ldr q4, [x26, #0x0]\n" + "ldr q5, [x24, #0x0]\n" + "ldr q6, [x22, #0x0]\n" + "ldr q12, [x7, #0x0]\n" + "fmla v24.4s, v12.4s, v0.s[0]\n" + "ldr q13, [x7, #0x10]\n" + "fmla v25.4s, v12.4s, v1.s[0]\n" + "ldr q14, [x7, #0x20]\n" + "fmla v26.4s, v12.4s, v2.s[0]\n" + "ldr q15, [x7, #0x30]\n" + "fmla v27.4s, v12.4s, v3.s[0]\n" + "add x14, x14, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "fmla v28.4s, v12.4s, v4.s[0]\n" + "add x12, x12, #0x10\n" + "fmla v29.4s, v12.4s, v5.s[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "add x10, x10, #0x10\n" + "fmla v30.4s, v12.4s, v6.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v24.4s, v13.4s, v0.s[1]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v25.4s, v13.4s, v1.s[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla v26.4s, v13.4s, v2.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x22, x22, #0x10\n" + "fmla v27.4s, v13.4s, v3.s[1]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "add x7, x7, #0x40\n" + "fmla v28.4s, v13.4s, v4.s[1]\n" + "fmla v29.4s, v13.4s, v5.s[1]\n" + "fmla v30.4s, v13.4s, v6.s[1]\n" + "fmla v24.4s, v14.4s, v0.s[2]\n" + "fmla v25.4s, v14.4s, v1.s[2]\n" + "fmla v26.4s, v14.4s, v2.s[2]\n" + "fmla v27.4s, v14.4s, v3.s[2]\n" + "fmla v28.4s, v14.4s, v4.s[2]\n" + "fmla v29.4s, v14.4s, v5.s[2]\n" + "fmla v30.4s, v14.4s, v6.s[2]\n" + "fmla v24.4s, v15.4s, v0.s[3]\n" + "fmla v25.4s, v15.4s, v1.s[3]\n" + "fmla v26.4s, v15.4s, v2.s[3]\n" + "fmla v27.4s, v15.4s, v3.s[3]\n" + "fmla v28.4s, v15.4s, v4.s[3]\n" + "fmla v29.4s, v15.4s, v5.s[3]\n" + "fmla v30.4s, v15.4s, v6.s[3]\n" + "147:" // Height 7: Multiply loop: Main loop skip + "cbz x15, 149f\n" + "148:" // Height 7: Multiply loop: Odd block loop + "ldr s0, [x14], #0x4\n" + "ldr s1, [x12], #0x4\n" + "ldr 
s2, [x10], #0x4\n" + "ldr s3, [x28], #0x4\n" + "ldr s4, [x26], #0x4\n" + "ldr s5, [x24], #0x4\n" + "ldr s6, [x22], #0x4\n" + "ldr q16, [x7, #0x0]\n" + "fmla v24.4s, v16.4s, v0.s[0]\n" + "sub x15, x15, #0x1\n" + "fmla v25.4s, v16.4s, v1.s[0]\n" + "add x7, x7, #0x10\n" + "fmla v26.4s, v16.4s, v2.s[0]\n" + "fmla v27.4s, v16.4s, v3.s[0]\n" + "fmla v28.4s, v16.4s, v4.s[0]\n" + "fmla v29.4s, v16.4s, v5.s[0]\n" + "fmla v30.4s, v16.4s, v6.s[0]\n" + "cbnz x15, 148b\n" + "149:" // Height 7: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x16, x16, #0x1\n" + "cmp x16, x19\n" + "bne 142b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 150f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v17.4s\n" + "fmin v28.4s, v28.4s, v16.4s\n" + "fmin v29.4s, v29.4s, v16.4s\n" + "fmin v30.4s, v30.4s, v16.4s\n" + "fmax v28.4s, v28.4s, v17.4s\n" + "fmax v29.4s, v29.4s, v17.4s\n" + "fmax v30.4s, v30.4s, v17.4s\n" + "150:" // Height 7: No activation + "cmp x6, #0x4\n" + "bge 153f\n" + "tbz x6, #1, 151f\n" + "str d24, [x17], #0x8\n" + "str d25, [x13], #0x8\n" + "str d26, [x11], #0x8\n" + "str d27, [x9], #0x8\n" + "str d28, [x27], #0x8\n" + "str d29, [x25], #0x8\n" + "str d30, [x23], #0x8\n" + "tbz x6, #0, 152f\n" + "st1 { v24.s }[2], [x17]\n" + "st1 { v25.s }[2], [x13]\n" + "st1 { v26.s }[2], [x11]\n" + "st1 { v27.s }[2], [x9]\n" + "st1 { v28.s }[2], [x27]\n" + "st1 { v29.s }[2], [x25]\n" + "st1 { v30.s }[2], [x23]\n" + "b 152f\n" + "151:" // Height 7: Partial direct writeback: partial_1_0 + "str s24, [x17, #0x0]\n" + "str s25, [x13, #0x0]\n" + "str s26, [x11, #0x0]\n" + "str s27, [x9, #0x0]\n" + "str s28, [x27, #0x0]\n" + "str s29, [x25, #0x0]\n" + "str s30, [x23, #0x0]\n" + "152:" // Height 7: Partial direct writeback: Done + "b 154f\n" + "153:" // Height 7: Full writeback + "str q24, [x17, #0x0]\n" + "str q25, [x13, #0x0]\n" + "str q26, [x11, #0x0]\n" + "str q27, [x9, #0x0]\n" + "str q28, [x27, #0x0]\n" + "str q29, [x25, #0x0]\n" + "str q30, [x23, #0x0]\n" + "add x17, x17, #0x10\n" + "add x13, x13, #0x10\n" + "add x11, x11, #0x10\n" + "add x9, x9, #0x10\n" + "add x27, x27, #0x10\n" + "add x25, x25, #0x10\n" + "add x23, x23, #0x10\n" + "154:" // Height 7: Writeback done + "subs x6, x6, #0x4\n" + "bgt 135b\n" + "b 178f\n" + "155:" // Height 8 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 156f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "ldr x11, [%x[output_ptr], #0x10]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x18]\n" + "ldr x27, [%x[output_ptr], #0x20]\n" + "add x11, x11, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x28]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x23, [%x[output_ptr], #0x30]\n" + "ldr x21, [%x[output_ptr], #0x38]\n" + "add x27, x27, x19, LSL #2\n" + "add 
x25, x25, x19, LSL #2\n" + "add %x[output_ptr], %x[output_ptr], #0x40\n" + "add x23, x23, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 157f\n" + "156:" // Height 8: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "add x11, x13, x19, LSL #2\n" + "add x9, x11, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "add x21, x23, x19, LSL #2\n" + "add %x[output_ptr], x21, x19, LSL #2\n" + "157:" // Height 8: Column loop + "cbz x8, 158f\n" + "ldr q24, [x8, #0x0]\n" + "mov v25.16b, v24.16b\n" + "add x8, x8, #0x10\n" + "mov v26.16b, v24.16b\n" + "mov v27.16b, v24.16b\n" + "mov v28.16b, v24.16b\n" + "mov v29.16b, v24.16b\n" + "mov v30.16b, v24.16b\n" + "mov v31.16b, v24.16b\n" + "b 163f\n" + "158:" // Height 8: no bias + "tbz %x[flags], #0, 162f\n" + "cmp x6, #0x4\n" + "bge 161f\n" + "tbz x6, #1, 159f\n" + "ldr d24, [x17], #0x8\n" + "ldr d25, [x13], #0x8\n" + "ldr d26, [x11], #0x8\n" + "ldr d27, [x9], #0x8\n" + "ldr d28, [x27], #0x8\n" + "ldr d29, [x25], #0x8\n" + "ldr d30, [x23], #0x8\n" + "ldr d31, [x21], #0x8\n" + "mov x19, #0x8\n" + "tbz x6, #0, 160f\n" + "ld1 { v24.s }[2], [x17]\n" + "ld1 { v25.s }[2], [x13]\n" + "ld1 { v26.s }[2], [x11]\n" + "ld1 { v27.s }[2], [x9]\n" + "ld1 { v28.s }[2], [x27]\n" + "ld1 { v29.s }[2], [x25]\n" + "ld1 { v30.s }[2], [x23]\n" + "ld1 { v31.s }[2], [x21]\n" + "b 160f\n" + "159:" // Height 8: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s24, [x17, #0x0]\n" + "ldr s25, [x13, #0x0]\n" + "ldr s26, [x11, #0x0]\n" + "ldr s27, [x9, #0x0]\n" + "ldr s28, [x27, #0x0]\n" + "ldr s29, [x25, #0x0]\n" + "ldr s30, [x23, #0x0]\n" + "ldr s31, [x21, #0x0]\n" + "160:" // Height 8: Partial accumulate: Done + "sub x17, x17, x19\n" + "sub x13, x13, x19\n" + "sub x11, x11, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "sub x23, x23, x19\n" + "sub x21, x21, x19\n" + "b 163f\n" + "161:" // Height 8: full accumulate + "ldr q24, [x17, #0x0]\n" + "ldr q25, [x13, #0x0]\n" + "ldr q26, [x11, #0x0]\n" + "ldr q27, [x9, #0x0]\n" + "ldr q28, [x27, #0x0]\n" + "ldr q29, [x25, #0x0]\n" + "ldr q30, [x23, #0x0]\n" + "ldr q31, [x21, #0x0]\n" + "b 163f\n" + "162:" // Height 8: no accumulate + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "163:" // Height 8: setup done + "mov x16, #0x0\n" + "164:" // Height 8: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 165f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "ldr x28, [x20, #0x18]\n" + "ldr x26, [x20, #0x20]\n" + "ldr x24, [x20, #0x28]\n" + "ldr x22, [x20, #0x30]\n" + "ldr x20, [x20, #0x38]\n" + "cbnz x16, 166f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x20, x20, x19, LSL #2\n" + "b 166f\n" + "165:" // Height 8: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "add x10, x12, x19, LSL #2\n" + "add x28, x10, x19, LSL #2\n" + "add 
x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "add x22, x24, x19, LSL #2\n" + "add x20, x22, x19, LSL #2\n" + "166:" // Height 8: input setup done + "cmp x15, #0x4\n" + "blt 169f\n" + "cmp x15, #0x8\n" + "blt 168f\n" + "167:" // Height 8: Multiply loop: Main loop head + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q3, [x28, #0x0]\n" + "ldr q4, [x26, #0x0]\n" + "ldr q5, [x24, #0x0]\n" + "ldr q6, [x22, #0x0]\n" + "ldr q7, [x20, #0x0]\n" + "ldr q8, [x7, #0x0]\n" + "fmla v24.4s, v8.4s, v0.s[0]\n" + "ldr q9, [x7, #0x10]\n" + "fmla v25.4s, v8.4s, v1.s[0]\n" + "ldr q10, [x7, #0x20]\n" + "fmla v26.4s, v8.4s, v2.s[0]\n" + "ldr q11, [x7, #0x30]\n" + "fmla v27.4s, v8.4s, v3.s[0]\n" + "add x14, x14, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "fmla v28.4s, v8.4s, v4.s[0]\n" + "add x12, x12, #0x10\n" + "fmla v29.4s, v8.4s, v5.s[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "add x10, x10, #0x10\n" + "fmla v30.4s, v8.4s, v6.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v31.4s, v8.4s, v7.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v24.4s, v9.4s, v0.s[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla v25.4s, v9.4s, v1.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x22, x22, #0x10\n" + "fmla v26.4s, v9.4s, v2.s[1]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "add x20, x20, #0x10\n" + "fmla v27.4s, v9.4s, v3.s[1]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "sub x15, x15, #0x4\n" + "fmla v28.4s, v9.4s, v4.s[1]\n" + "cmp x15, #0x8\n" + "fmla v29.4s, v9.4s, v5.s[1]\n" + "add x7, x7, #0x40\n" + "fmla v30.4s, v9.4s, v6.s[1]\n" + "fmla v31.4s, v9.4s, v7.s[1]\n" + "fmla v24.4s, v10.4s, v0.s[2]\n" + "fmla v25.4s, v10.4s, v1.s[2]\n" + "fmla v26.4s, v10.4s, v2.s[2]\n" + "fmla v27.4s, v10.4s, v3.s[2]\n" + "fmla v28.4s, v10.4s, v4.s[2]\n" + "fmla v29.4s, v10.4s, v5.s[2]\n" + "fmla v30.4s, v10.4s, v6.s[2]\n" + "fmla v31.4s, v10.4s, v7.s[2]\n" + "fmla v24.4s, v11.4s, v0.s[3]\n" + "fmla v25.4s, v11.4s, v1.s[3]\n" + "fmla v26.4s, v11.4s, v2.s[3]\n" + "fmla v27.4s, v11.4s, v3.s[3]\n" + "fmla v28.4s, v11.4s, v4.s[3]\n" + "fmla v29.4s, v11.4s, v5.s[3]\n" + "fmla v30.4s, v11.4s, v6.s[3]\n" + "fmla v31.4s, v11.4s, v7.s[3]\n" + "bge 167b\n" + "168:" // Height 8: Multiply loop: Single iteration only + "sub x15, x15, #0x4\n" + "ldr q0, [x14, #0x0]\n" + "ldr q1, [x12, #0x0]\n" + "ldr q2, [x10, #0x0]\n" + "ldr q3, [x28, #0x0]\n" + "ldr q4, [x26, #0x0]\n" + "ldr q5, [x24, #0x0]\n" + "ldr q6, [x22, #0x0]\n" + "ldr q7, [x20, #0x0]\n" + "ldr q12, [x7, #0x0]\n" + "fmla v24.4s, v12.4s, v0.s[0]\n" + "ldr q13, [x7, #0x10]\n" + "fmla v25.4s, v12.4s, v1.s[0]\n" + "ldr q14, [x7, #0x20]\n" + "fmla v26.4s, v12.4s, v2.s[0]\n" + "ldr q15, [x7, #0x30]\n" + "fmla v27.4s, v12.4s, v3.s[0]\n" + "add x14, x14, #0x10\n" + "prfm pldl1keep, [x14, #0x80]\n" + "fmla v28.4s, v12.4s, v4.s[0]\n" + "add x12, x12, #0x10\n" + "fmla v29.4s, v12.4s, v5.s[0]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "add x10, x10, #0x10\n" + "fmla v30.4s, v12.4s, v6.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla v31.4s, v12.4s, v7.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla v24.4s, v13.4s, v0.s[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla v25.4s, v13.4s, v1.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "add x22, x22, #0x10\n" + "fmla v26.4s, v13.4s, v2.s[1]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "add x20, x20, #0x10\n" + "fmla v27.4s, v13.4s, v3.s[1]\n" + "prfm 
pldl1keep, [x20, #0x80]\n" + "add x7, x7, #0x40\n" + "fmla v28.4s, v13.4s, v4.s[1]\n" + "fmla v29.4s, v13.4s, v5.s[1]\n" + "fmla v30.4s, v13.4s, v6.s[1]\n" + "fmla v31.4s, v13.4s, v7.s[1]\n" + "fmla v24.4s, v14.4s, v0.s[2]\n" + "fmla v25.4s, v14.4s, v1.s[2]\n" + "fmla v26.4s, v14.4s, v2.s[2]\n" + "fmla v27.4s, v14.4s, v3.s[2]\n" + "fmla v28.4s, v14.4s, v4.s[2]\n" + "fmla v29.4s, v14.4s, v5.s[2]\n" + "fmla v30.4s, v14.4s, v6.s[2]\n" + "fmla v31.4s, v14.4s, v7.s[2]\n" + "fmla v24.4s, v15.4s, v0.s[3]\n" + "fmla v25.4s, v15.4s, v1.s[3]\n" + "fmla v26.4s, v15.4s, v2.s[3]\n" + "fmla v27.4s, v15.4s, v3.s[3]\n" + "fmla v28.4s, v15.4s, v4.s[3]\n" + "fmla v29.4s, v15.4s, v5.s[3]\n" + "fmla v30.4s, v15.4s, v6.s[3]\n" + "fmla v31.4s, v15.4s, v7.s[3]\n" + "169:" // Height 8: Multiply loop: Main loop skip + "cbz x15, 171f\n" + "170:" // Height 8: Multiply loop: Odd block loop + "ldr s0, [x14], #0x4\n" + "ldr s1, [x12], #0x4\n" + "ldr s2, [x10], #0x4\n" + "ldr s3, [x28], #0x4\n" + "ldr s4, [x26], #0x4\n" + "ldr s5, [x24], #0x4\n" + "ldr s6, [x22], #0x4\n" + "ldr s7, [x20], #0x4\n" + "ldr q16, [x7, #0x0]\n" + "fmla v24.4s, v16.4s, v0.s[0]\n" + "sub x15, x15, #0x1\n" + "fmla v25.4s, v16.4s, v1.s[0]\n" + "add x7, x7, #0x10\n" + "fmla v26.4s, v16.4s, v2.s[0]\n" + "fmla v27.4s, v16.4s, v3.s[0]\n" + "fmla v28.4s, v16.4s, v4.s[0]\n" + "fmla v29.4s, v16.4s, v5.s[0]\n" + "fmla v30.4s, v16.4s, v6.s[0]\n" + "fmla v31.4s, v16.4s, v7.s[0]\n" + "cbnz x15, 170b\n" + "171:" // Height 8: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x16, x16, #0x1\n" + "cmp x16, x19\n" + "bne 164b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 172f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.4s }, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1r { v16.4s }, [x19]\n" + "fmin v24.4s, v24.4s, v16.4s\n" + "fmin v25.4s, v25.4s, v16.4s\n" + "fmin v26.4s, v26.4s, v16.4s\n" + "fmin v27.4s, v27.4s, v16.4s\n" + "fmax v24.4s, v24.4s, v17.4s\n" + "fmax v25.4s, v25.4s, v17.4s\n" + "fmax v26.4s, v26.4s, v17.4s\n" + "fmax v27.4s, v27.4s, v17.4s\n" + "fmin v28.4s, v28.4s, v16.4s\n" + "fmin v29.4s, v29.4s, v16.4s\n" + "fmin v30.4s, v30.4s, v16.4s\n" + "fmax v28.4s, v28.4s, v17.4s\n" + "fmax v29.4s, v29.4s, v17.4s\n" + "fmax v30.4s, v30.4s, v17.4s\n" + "fmin v31.4s, v31.4s, v16.4s\n" + "fmax v31.4s, v31.4s, v17.4s\n" + "172:" // Height 8: No activation + "cmp x6, #0x4\n" + "bge 175f\n" + "tbz x6, #1, 173f\n" + "str d24, [x17], #0x8\n" + "str d25, [x13], #0x8\n" + "str d26, [x11], #0x8\n" + "str d27, [x9], #0x8\n" + "str d28, [x27], #0x8\n" + "str d29, [x25], #0x8\n" + "str d30, [x23], #0x8\n" + "str d31, [x21], #0x8\n" + "tbz x6, #0, 174f\n" + "st1 { v24.s }[2], [x17]\n" + "st1 { v25.s }[2], [x13]\n" + "st1 { v26.s }[2], [x11]\n" + "st1 { v27.s }[2], [x9]\n" + "st1 { v28.s }[2], [x27]\n" + "st1 { v29.s }[2], [x25]\n" + "st1 { v30.s }[2], [x23]\n" + "st1 { v31.s }[2], [x21]\n" + "b 174f\n" + "173:" // Height 8: Partial direct writeback: partial_1_0 + "str s24, [x17, #0x0]\n" + "str s25, [x13, #0x0]\n" + "str s26, [x11, #0x0]\n" + "str s27, [x9, #0x0]\n" + "str s28, [x27, #0x0]\n" + "str s29, [x25, #0x0]\n" + "str s30, [x23, #0x0]\n" + "str s31, [x21, #0x0]\n" + "174:" // Height 8: Partial direct writeback: Done + "b 176f\n" + "175:" // 
Height 8: Full writeback + "str q24, [x17, #0x0]\n" + "str q25, [x13, #0x0]\n" + "str q26, [x11, #0x0]\n" + "str q27, [x9, #0x0]\n" + "str q28, [x27, #0x0]\n" + "str q29, [x25, #0x0]\n" + "str q30, [x23, #0x0]\n" + "str q31, [x21, #0x0]\n" + "add x17, x17, #0x10\n" + "add x13, x13, #0x10\n" + "add x11, x11, #0x10\n" + "add x9, x9, #0x10\n" + "add x27, x27, #0x10\n" + "add x25, x25, #0x10\n" + "add x23, x23, #0x10\n" + "add x21, x21, #0x10\n" + "176:" // Height 8: Writeback done + "subs x6, x6, #0x4\n" + "bgt 157b\n" + "subs %x[M], %x[M], #0x8\n" + "beq 178f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 177f\n" + "add x20, x20, #0x8\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "177:" // Update direct input + "mov x19, #0x20\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "178:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp similarity index 60% rename from src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp rename to src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp index a23101a7ce..4bb7a1e0eb 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,38 +10,43 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. */ #pragma once - #ifdef __aarch64__ -#include <cstdint> #include "../std_transforms_fixed.hpp" +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg<int8_t>, \ + size_t, size_t, \ + const int8_t *, \ + IndirectOutputArg<int8_t>, \ + const Requantize32 *, const int32_t *, unsigned int + namespace arm_gemm { // Actual kernel implementations -void a64_hybrid_s8s32_dot_16x4(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); -void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); +void a64_hybrid_s8qa_dot_4x16( ARGLIST ); -class hybrid_s8s32_dot_16x4 +class cls_a64_hybrid_s8qa_dot_4x16 { public: typedef int8_t operand_type; - typedef int32_t result_type; + typedef int8_t result_type; - typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); + typedef void (*kern_type)( ARGLIST ); /* Kernel blocking parameters */ static constexpr unsigned int out_height() @@ -60,16 +65,6 @@ class hybrid_s8s32_dot_16x4 } static constexpr bool supports_accumulate() - { - return true; - } - - static constexpr bool supports_bias() - { - return false; - } - - static constexpr bool supports_activation() { return false; } @@ -77,16 +72,14 @@ class hybrid_s8s32_dot_16x4 StdTransformsFixed<operand_type, result_type, 4, 16, 4> transforms = {}; // Default to the generic kernel - kern_type kernel=a64_hybrid_s8s32_dot_16x4; + kern_type kernel=a64_hybrid_s8qa_dot_4x16; - hybrid_s8s32_dot_16x4(const CPUInfo *ci) + cls_a64_hybrid_s8qa_dot_4x16(const CPUInfo *) { - if (ci->get_cpu_model() == CPUModel::A55r1) { - kernel = a64_hybrid_s8s32_dot_16x4_a55; - } } }; } // namespace arm_gemm +#undef ARGLIST #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp new file mode 100644 index 0000000000..3fb365bc1e --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp @@ -0,0 +1,2072 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE.
+ */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> +#include <cstdint> + +namespace arm_gemm { + +void a64_hybrid_s8qa_dot_4x16 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg, + size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg, + const Requantize32 *qp, const int32_t *col_bias, unsigned int +) +{ + struct KernelArgs { + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const int8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; /* flags bit 2: output rows reached through a pointer array */ + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; /* flags bit 3: input rows reached through a pointer array */ + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + if (qp->c_offset > qp->minval) { + flags |= 0x20; /* flags bit 5: apply the rounding correction before srshl */ + } + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x4\n" + "bge 94f\n" + "cmp %x[M], #0x2\n" + "bgt 63f\n" + "beq 32f\n" + "movi v11.4s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "movi v12.4s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[col_bias]\n" + "movi v13.4s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "movi v14.4s, #0x0\n" + "movi v15.16b, #0x1\n" + "tbz %x[flags], #2, 2f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "add x9, x9, x19\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x9, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "4:" // Height 1: setup done + "mov x28, #0x0\n" + "5:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 6f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "cbnz x28, 7f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "b 7f\n" + "6:" // Height 1: setup direct input + "mov x26, %x[input_ptr]\n" + "7:" // Height 1: input setup done + "cmp x27, #0x10\n" + "blt 12f\n" + "cmp x27, #0x20\n" + "blt 10f\n" + "8:" // Height 1: Multiply loop: Main loop head + "ldr q0, [x26, #0x0]\n" + "ldr q4, [x11, #0x0]\n" + ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x10]\n" + "ldr q6, [x11, #0x20]\n" + ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x30]\n" + ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" + "ldr q8, [x11, #0x40]\n" + "ldr q9, [x11, #0x50]\n" + ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" + "ldr q10, [x11, #0x60]\n" + "ldr q4, [x11, #0x70]\n" + ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" + "ldr q5, [x11, #0x80]\n" + "ldr q6, [x11, #0x90]\n" + ".inst 0x4fa0e152 // sdot v18.4s,
v10.16b, v0.4b[1]\n" + "ldr q7, [x11, #0xa0]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + "ldr q8, [x11, #0xb0]\n" + "ldr q9, [x11, #0xc0]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + "ldr q10, [x11, #0xd0]\n" + "ldr q4, [x11, #0xe0]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + "ldr q5, [x11, #0xf0]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + "add x11, x11, #0x100\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + "tbnz %x[flags], #31, 9f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + "9:" // Height 1: Multiply loop: unique 1: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 8b\n" + "10:" // Height 1: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q0, [x26, #0x0]\n" + "ldr q6, [x11, #0x0]\n" + ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x10]\n" + "ldr q8, [x11, #0x20]\n" + ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" + "ldr q9, [x11, #0x30]\n" + ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" + "ldr q10, [x11, #0x40]\n" + "ldr q4, [x11, #0x50]\n" + ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x60]\n" + "ldr q6, [x11, #0x70]\n" + ".inst 0x4fa0e150 // sdot v16.4s, v10.16b, v0.4b[1]\n" + ".inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]\n" + "ldr q7, [x11, #0x80]\n" + "ldr q8, [x11, #0x90]\n" + ".inst 0x4fa0e0b2 // sdot v18.4s, v5.16b, v0.4b[1]\n" + "ldr q9, [x11, #0xa0]\n" + ".inst 0x4fa0e0d3 // sdot v19.4s, v6.16b, v0.4b[1]\n" + "ldr q10, [x11, #0xb0]\n" + "ldr q4, [x11, #0xc0]\n" + ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f80e911 // sdot v17.4s, v8.16b, v0.4b[2]\n" + "ldr q5, [x11, #0xd0]\n" + "ldr q6, [x11, #0xe0]\n" + ".inst 0x4f80e932 // sdot v18.4s, v9.16b, v0.4b[2]\n" + "ldr q7, [x11, #0xf0]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f80e953 // sdot v19.4s, v10.16b, v0.4b[2]\n" + ".inst 0x4fa0e890 // sdot v16.4s, v4.16b, v0.4b[3]\n" + "add x11, x11, #0x100\n" + ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n" + "tbnz %x[flags], #31, 11f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + "11:" // Height 1: Multiply loop: unique 2: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "12:" // Height 1: Multiply loop: Main loop skip + "cbz x27, 19f\n" + "cmp x27, #0x4\n" + "blt 15f\n" + "13:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x26], #0x4\n" + "tbnz %x[flags], #31, 14f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + "14:" // Height 1: Multiply loop: unique 3: skip row sum + "ldr q8, [x11, #0x0]\n" + ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q9, [x11, #0x10]\n" + "ldr q10, [x11, #0x20]\n" + ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q4, [x11, #0x30]\n" + ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + "sub x27, x27, #0x4\n" + "add x11, x11, #0x40\n" + ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n" + "cmp x27, #0x4\n" + "bge 13b\n" + "cbz x27, 19f\n" + "15:" // Height 1: Multiply loop: Skip odd blocks + "tbz x27, #1, 16f\n" + "ldr h0, [x26], #0x2\n" + "tbz x27, #0, 17f\n" + "ld1 { v0.b 
}[2], [x26]\n" + "b 17f\n" + "16:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x26, #0x0]\n" + "17:" // Height 1: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 18f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + "18:" // Height 1: Multiply loop: unique 4: skip row sum + "ldr q5, [x11, #0x0]\n" + ".inst 0x4f80e0b0 // sdot v16.4s, v5.16b, v0.4b[0]\n" + "ldr q6, [x11, #0x10]\n" + "ldr q7, [x11, #0x20]\n" + ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n" + "ldr q8, [x11, #0x30]\n" + ".inst 0x4f80e0f2 // sdot v18.4s, v7.16b, v0.4b[0]\n" + "add x11, x11, #0x40\n" + ".inst 0x4f80e113 // sdot v19.4s, v8.16b, v0.4b[0]\n" + "19:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x19\n" + "bne 5b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "tbnz %x[flags], #31, 20f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x19, %x[qp], %[b_offset]\n" + "addp v11.4s, v11.4s, v11.4s\n" + "ld1r { v1.4s }, [x19]\n" + "neg v1.4s, v1.4s\n" + "mul v11.4s, v11.4s, v1.4s\n" + "20:" // Height 1: skip row sum fixup + "add v16.4s, v16.4s, v11.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v17.4s, v17.4s, v11.4s\n" + "ldr q0, [x10, #0x0]\n" + "add v18.4s, v18.4s, v11.4s\n" + "ldr q1, [x10, #0x10]\n" + "add v19.4s, v19.4s, v11.4s\n" + "ldr q2, [x10, #0x20]\n" + "ldr q3, [x10, #0x30]\n" + "add v16.4s, v16.4s, v0.4s\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add v17.4s, v17.4s, v1.4s\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "add v18.4s, v18.4s, v2.4s\n" + "ld1r { v4.4s }, [x19]\n" + "add x10, x10, #0x40\n" + "add v19.4s, v19.4s, v3.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "tbz %x[flags], #5, 21f\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v17.16b, v0.16b\n" + "and v6.16b, v18.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v19.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "21:" // Height 1: no shift correction + "srshl v16.4s, v16.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x12, #0x10\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "smax v19.4s, v19.4s, v5.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "bge 30f\n" + "tbz x12, #3, 25f\n" + "str d16, [x9], #0x8\n" + "tbz x12, #2, 23f\n" + "st1 { v16.s }[2], [x9], #0x4\n" + "tbz x12, #1, 22f\n" + "st1 { v16.h }[6], [x9], #0x2\n" + "tbz x12, #0, 29f\n" + "st1 { v16.b }[14], [x9]\n" + "b 29f\n" + "22:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x12, #0, 29f\n" + "st1 { v16.b }[12], [x9]\n" + "b 29f\n" + "23:" 
// Height 1: Partial direct writeback: partial_2_8 + "tbz x12, #1, 24f\n" + "st1 { v16.h }[4], [x9], #0x2\n" + "tbz x12, #0, 29f\n" + "st1 { v16.b }[10], [x9]\n" + "b 29f\n" + "24:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x12, #0, 29f\n" + "st1 { v16.b }[8], [x9]\n" + "b 29f\n" + "25:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x12, #2, 27f\n" + "str s16, [x9], #0x4\n" + "tbz x12, #1, 26f\n" + "st1 { v16.h }[2], [x9], #0x2\n" + "tbz x12, #0, 29f\n" + "st1 { v16.b }[6], [x9]\n" + "b 29f\n" + "26:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x12, #0, 29f\n" + "st1 { v16.b }[4], [x9]\n" + "b 29f\n" + "27:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x12, #1, 28f\n" + "str h16, [x9], #0x2\n" + "tbz x12, #0, 29f\n" + "st1 { v16.b }[2], [x9]\n" + "b 29f\n" + "28:" // Height 1: Partial direct writeback: partial_1_0 + "str b16, [x9, #0x0]\n" + "29:" // Height 1: Partial direct writeback: Done + "b 31f\n" + "30:" // Height 1: Full writeback + "str q16, [x9, #0x0]\n" + "add x9, x9, #0x10\n" + "31:" // Height 1: Writeback done + "subs x12, x12, #0x10\n" + "bgt 3b\n" + "b 126f\n" + "32:" // Height 2 + "movi v11.4s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov x10, %x[col_bias]\n" + "movi v12.4s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "movi v13.4s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "movi v14.4s, #0x0\n" + "movi v15.16b, #0x1\n" + "tbz %x[flags], #2, 33f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "ldr x25, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "add x25, x25, x19\n" + "b 34f\n" + "33:" // Height 2: setup direct output + "mov x9, %x[output_ptr]\n" + "add x25, x9, x19\n" + "34:" // Height 2: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "35:" // Height 2: setup done + "mov x28, #0x0\n" + "36:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 37f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x28, 38f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "b 38f\n" + "37:" // Height 2: setup direct input + "mov x26, %x[input_ptr]\n" + "add x24, x26, x19\n" + "38:" // Height 2: input setup done + "cmp x27, #0x10\n" + "blt 43f\n" + "cmp x27, #0x20\n" + "blt 41f\n" + "39:" // Height 2: Multiply loop: Main loop head + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "ldr q4, [x11, #0x0]\n" + ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x10]\n" + ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" + "ldr q6, [x11, #0x20]\n" + "ldr q7, [x11, #0x30]\n" + ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q8, [x11, #0x40]\n" + ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" + "ldr q9, [x11, #0x50]\n" + ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" + "ldr q10, [x11, #0x60]\n" + ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" + "ldr q4, [x11, #0x70]\n" + ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x80]\n" + ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" + "ldr q6, [x11, #0x90]\n" + 
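/* K lane 1: v8-v10 and v4 hold the B panel quads at x11+0x40..0x70; accumulate byte 1 of each A word into the row accumulators */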
".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" + "ldr q7, [x11, #0xa0]\n" + ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" + "ldr q8, [x11, #0xb0]\n" + ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" + "add x26, x26, #0x10\n" + ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" + "ldr q9, [x11, #0xc0]\n" + ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" + "add x24, x24, #0x10\n" + ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" + "ldr q10, [x11, #0xd0]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + "ldr q4, [x11, #0xe0]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" + "ldr q5, [x11, #0xf0]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + "add x11, x11, #0x100\n" + ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + "tbnz %x[flags], #31, 40f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + "40:" // Height 2: Multiply loop: unique 5: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x27, #0x20\n" + "bge 39b\n" + "41:" // Height 2: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "ldr q6, [x11, #0x0]\n" + ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x10]\n" + ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n" + "ldr q8, [x11, #0x20]\n" + "ldr q9, [x11, #0x30]\n" + ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" + "ldr q10, [x11, #0x40]\n" + ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n" + "ldr q4, [x11, #0x50]\n" + ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x60]\n" + ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" + "ldr q6, [x11, #0x70]\n" + ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x80]\n" + ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n" + "ldr q8, [x11, #0x90]\n" + ".inst 0x4fa0e150 // sdot v16.4s, v10.16b, v0.4b[1]\n" + "ldr q9, [x11, #0xa0]\n" + ".inst 0x4fa1e154 // sdot v20.4s, v10.16b, v1.4b[1]\n" + "ldr q10, [x11, #0xb0]\n" + ".inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]\n" + "add x26, x26, #0x10\n" + ".inst 0x4fa1e095 // sdot v21.4s, v4.16b, v1.4b[1]\n" + "ldr q4, [x11, #0xc0]\n" + ".inst 0x4fa0e0b2 // sdot v18.4s, v5.16b, v0.4b[1]\n" + "add x24, x24, #0x10\n" + ".inst 0x4fa1e0b6 // sdot v22.4s, v5.16b, v1.4b[1]\n" + "ldr q5, [x11, #0xd0]\n" + ".inst 0x4fa0e0d3 // sdot v19.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0d7 // sdot v23.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x11, #0xe0]\n" + ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f4 // sdot v20.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x11, #0xf0]\n" + 
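/* last B quad of this 16-byte K block is loaded; finish the lane-2 and lane-3 products before x11 advances by 0x100 to the next panel */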
".inst 0x4f80e911 // sdot v17.4s, v8.16b, v0.4b[2]\n" + "add x11, x11, #0x100\n" + ".inst 0x4f81e915 // sdot v21.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4f80e932 // sdot v18.4s, v9.16b, v0.4b[2]\n" + ".inst 0x4f81e936 // sdot v22.4s, v9.16b, v1.4b[2]\n" + ".inst 0x4f80e953 // sdot v19.4s, v10.16b, v0.4b[2]\n" + ".inst 0x4f81e957 // sdot v23.4s, v10.16b, v1.4b[2]\n" + ".inst 0x4fa0e890 // sdot v16.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e894 // sdot v20.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b5 // sdot v21.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8d6 // sdot v22.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8f7 // sdot v23.4s, v7.16b, v1.4b[3]\n" + "tbnz %x[flags], #31, 42f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + "42:" // Height 2: Multiply loop: unique 6: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "43:" // Height 2: Multiply loop: Main loop skip + "cbz x27, 50f\n" + "cmp x27, #0x4\n" + "blt 46f\n" + "44:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x26], #0x4\n" + "ldr s1, [x24], #0x4\n" + "tbnz %x[flags], #31, 45f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + "45:" // Height 2: Multiply loop: unique 7: skip row sum + "ldr q8, [x11, #0x0]\n" + ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q9, [x11, #0x10]\n" + ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q10, [x11, #0x20]\n" + "ldr q4, [x11, #0x30]\n" + ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "sub x27, x27, #0x4\n" + ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" + "cmp x27, #0x4\n" + ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + "add x11, x11, #0x40\n" + ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" + ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n" + "bge 44b\n" + "cbz x27, 50f\n" + "46:" // Height 2: Multiply loop: Skip odd blocks + "tbz x27, #1, 47f\n" + "ldr h0, [x26], #0x2\n" + "ldr h1, [x24], #0x2\n" + "tbz x27, #0, 48f\n" + "ld1 { v0.b }[2], [x26]\n" + "ld1 { v1.b }[2], [x24]\n" + "b 48f\n" + "47:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x26, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "48:" // Height 2: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 49f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + "49:" // Height 2: Multiply loop: unique 8: skip row sum + "ldr q5, [x11, #0x0]\n" + ".inst 0x4f80e0b0 // sdot v16.4s, v5.16b, v0.4b[0]\n" + "ldr q6, [x11, #0x10]\n" + ".inst 0x4f81e0b4 // sdot v20.4s, v5.16b, v1.4b[0]\n" + "ldr q7, [x11, #0x20]\n" + "ldr q8, [x11, #0x30]\n" + ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n" + "add x11, x11, #0x40\n" + ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0f2 // sdot v18.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f6 // sdot v22.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f80e113 // sdot v19.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4f81e117 // sdot v23.4s, v8.16b, v1.4b[0]\n" + "50:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x19\n" + "bne 36b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, 
[x25, #0x0]\n" + "tbnz %x[flags], #31, 51f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1r { v2.4s }, [x19]\n" + "addp v12.4s, v12.4s, v12.4s\n" + "addp v11.4s, v11.4s, v11.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" + "neg v2.4s, v2.4s\n" + "mul v11.4s, v11.4s, v2.4s\n" + "mul v12.4s, v12.4s, v2.4s\n" + "51:" // Height 2: skip row sum fixup + "add v16.4s, v16.4s, v11.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v17.4s, v17.4s, v11.4s\n" + "ldr q0, [x10, #0x0]\n" + "add v18.4s, v18.4s, v11.4s\n" + "ldr q1, [x10, #0x10]\n" + "add v19.4s, v19.4s, v11.4s\n" + "ldr q2, [x10, #0x20]\n" + "add v20.4s, v20.4s, v12.4s\n" + "ldr q3, [x10, #0x30]\n" + "add v21.4s, v21.4s, v12.4s\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add v22.4s, v22.4s, v12.4s\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x19]\n" + "add v23.4s, v23.4s, v12.4s\n" + "add x10, x10, #0x40\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "ld1r { v0.4s }, [x20]\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "sqrdmulh v22.4s, v22.4s, v4.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "tbz %x[flags], #5, 52f\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v17.16b, v0.16b\n" + "and v6.16b, v18.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v19.16b, v0.16b\n" + "and v8.16b, v20.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "and v9.16b, v21.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v10.16b, v22.16b, v0.16b\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "and v4.16b, v23.16b, v0.16b\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "sqadd v20.4s, v20.4s, v8.4s\n" + "sqadd v21.4s, v21.4s, v9.4s\n" + "sqadd v22.4s, v22.4s, v10.4s\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "52:" // Height 2: no shift correction + "srshl v16.4s, v16.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x12, #0x10\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "srshl v21.4s, v21.4s, v0.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "smin v22.4s, 
v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "bge 61f\n" + "tbz x12, #3, 56f\n" + "str d16, [x9], #0x8\n" + "str d20, [x25], #0x8\n" + "tbz x12, #2, 54f\n" + "st1 { v16.s }[2], [x9], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "tbz x12, #1, 53f\n" + "st1 { v16.h }[6], [x9], #0x2\n" + "st1 { v20.h }[6], [x25], #0x2\n" + "tbz x12, #0, 60f\n" + "st1 { v16.b }[14], [x9]\n" + "st1 { v20.b }[14], [x25]\n" + "b 60f\n" + "53:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x12, #0, 60f\n" + "st1 { v16.b }[12], [x9]\n" + "st1 { v20.b }[12], [x25]\n" + "b 60f\n" + "54:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x12, #1, 55f\n" + "st1 { v16.h }[4], [x9], #0x2\n" + "st1 { v20.h }[4], [x25], #0x2\n" + "tbz x12, #0, 60f\n" + "st1 { v16.b }[10], [x9]\n" + "st1 { v20.b }[10], [x25]\n" + "b 60f\n" + "55:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x12, #0, 60f\n" + "st1 { v16.b }[8], [x9]\n" + "st1 { v20.b }[8], [x25]\n" + "b 60f\n" + "56:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x12, #2, 58f\n" + "str s16, [x9], #0x4\n" + "str s20, [x25], #0x4\n" + "tbz x12, #1, 57f\n" + "st1 { v16.h }[2], [x9], #0x2\n" + "st1 { v20.h }[2], [x25], #0x2\n" + "tbz x12, #0, 60f\n" + "st1 { v16.b }[6], [x9]\n" + "st1 { v20.b }[6], [x25]\n" + "b 60f\n" + "57:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x12, #0, 60f\n" + "st1 { v16.b }[4], [x9]\n" + "st1 { v20.b }[4], [x25]\n" + "b 60f\n" + "58:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x12, #1, 59f\n" + "str h16, [x9], #0x2\n" + "str h20, [x25], #0x2\n" + "tbz x12, #0, 60f\n" + "st1 { v16.b }[2], [x9]\n" + "st1 { v20.b }[2], [x25]\n" + "b 60f\n" + "59:" // Height 2: Partial direct writeback: partial_1_0 + "str b16, [x9, #0x0]\n" + "str b20, [x25, #0x0]\n" + "60:" // Height 2: Partial direct writeback: Done + "b 62f\n" + "61:" // Height 2: Full writeback + "str q16, [x9, #0x0]\n" + "str q20, [x25, #0x0]\n" + "add x9, x9, #0x10\n" + "add x25, x25, #0x10\n" + "62:" // Height 2: Writeback done + "subs x12, x12, #0x10\n" + "bgt 34b\n" + "b 126f\n" + "63:" // Height 3 + "movi v11.4s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov x10, %x[col_bias]\n" + "movi v12.4s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "movi v13.4s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "movi v14.4s, #0x0\n" + "movi v15.16b, #0x1\n" + "tbz %x[flags], #2, 64f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "ldr x25, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "ldr x23, [%x[output_ptr], #0x10]\n" + "add x25, x25, x19\n" + "add x23, x23, x19\n" + "b 65f\n" + "64:" // Height 3: setup direct output + "mov x9, %x[output_ptr]\n" + "add x25, x9, x19\n" + "add x23, x25, x19\n" + "65:" // Height 3: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "66:" // Height 3: setup done + "mov x28, #0x0\n" + "67:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], 
%[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 68f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "cbnz x28, 69f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "b 69f\n" + "68:" // Height 3: setup direct input + "mov x26, %x[input_ptr]\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "69:" // Height 3: input setup done + "cmp x27, #0x10\n" + "blt 74f\n" + "cmp x27, #0x20\n" + "blt 72f\n" + "70:" // Height 3: Multiply loop: Main loop head + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "ldr q4, [x11, #0x0]\n" + ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x10]\n" + ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" + "ldr q6, [x11, #0x20]\n" + ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" + "ldr q7, [x11, #0x30]\n" + "ldr q8, [x11, #0x40]\n" + ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q9, [x11, #0x50]\n" + ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" + "ldr q10, [x11, #0x60]\n" + ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" + "ldr q4, [x11, #0x70]\n" + ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x80]\n" + ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x11, #0x90]\n" + ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x11, #0xa0]\n" + ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" + ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" + "ldr q8, [x11, #0xb0]\n" + ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" + ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" + ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" + "ldr q9, [x11, #0xc0]\n" + ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" + ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" + "ldr q10, [x11, #0xd0]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" + "ldr q4, [x11, #0xe0]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" + "ldr q5, [x11, #0xf0]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + "add x11, x11, #0x100\n" + ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" 
+ ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n" + "tbnz %x[flags], #31, 71f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + "71:" // Height 3: Multiply loop: unique 9: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x27, #0x20\n" + "prfm pldl1keep, [x22, #0x80]\n" + "bge 70b\n" + "72:" // Height 3: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "ldr q6, [x11, #0x0]\n" + ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x10]\n" + ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n" + "ldr q8, [x11, #0x20]\n" + ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n" + "ldr q9, [x11, #0x30]\n" + "ldr q10, [x11, #0x40]\n" + ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" + "ldr q4, [x11, #0x50]\n" + ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n" + "ldr q5, [x11, #0x60]\n" + ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n" + "ldr q6, [x11, #0x70]\n" + ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x80]\n" + ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n" + "ldr q8, [x11, #0x90]\n" + ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n" + "ldr q9, [x11, #0xa0]\n" + ".inst 0x4fa0e150 // sdot v16.4s, v10.16b, v0.4b[1]\n" + ".inst 0x4fa1e154 // sdot v20.4s, v10.16b, v1.4b[1]\n" + ".inst 0x4fa2e158 // sdot v24.4s, v10.16b, v2.4b[1]\n" + "ldr q10, [x11, #0xb0]\n" + ".inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e095 // sdot v21.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4fa2e099 // sdot v25.4s, v4.16b, v2.4b[1]\n" + "ldr q4, [x11, #0xc0]\n" + ".inst 0x4fa0e0b2 // sdot v18.4s, v5.16b, v0.4b[1]\n" + ".inst 0x4fa1e0b6 // sdot v22.4s, v5.16b, v1.4b[1]\n" + ".inst 0x4fa2e0ba // sdot v26.4s, v5.16b, v2.4b[1]\n" + "ldr q5, [x11, #0xd0]\n" + ".inst 0x4fa0e0d3 // sdot v19.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0d7 // sdot v23.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0db // sdot v27.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x11, #0xe0]\n" + ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f4 // sdot v20.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f8 // sdot v24.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x11, #0xf0]\n" + ".inst 0x4f80e911 // sdot v17.4s, v8.16b, v0.4b[2]\n" + "add x11, x11, #0x100\n" + ".inst 0x4f81e915 // sdot v21.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4f82e919 // sdot v25.4s, v8.16b, v2.4b[2]\n" + ".inst 0x4f80e932 // sdot v18.4s, v9.16b, v0.4b[2]\n" + ".inst 0x4f81e936 // sdot v22.4s, v9.16b, v1.4b[2]\n" + ".inst 0x4f82e93a // sdot v26.4s, v9.16b, v2.4b[2]\n" + ".inst 0x4f80e953 // sdot v19.4s, v10.16b, v0.4b[2]\n" + ".inst 0x4f81e957 // sdot v23.4s, v10.16b, v1.4b[2]\n" + ".inst 
0x4f82e95b // sdot v27.4s, v10.16b, v2.4b[2]\n" + ".inst 0x4fa0e890 // sdot v16.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e894 // sdot v20.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa2e898 // sdot v24.4s, v4.16b, v2.4b[3]\n" + ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b5 // sdot v21.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4fa2e8b9 // sdot v25.4s, v5.16b, v2.4b[3]\n" + ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8d6 // sdot v22.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8da // sdot v26.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8f7 // sdot v23.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n" + "tbnz %x[flags], #31, 73f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + "73:" // Height 3: Multiply loop: unique 10: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "74:" // Height 3: Multiply loop: Main loop skip + "cbz x27, 81f\n" + "cmp x27, #0x4\n" + "blt 77f\n" + "75:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x26], #0x4\n" + "ldr s1, [x24], #0x4\n" + "ldr s2, [x22], #0x4\n" + "tbnz %x[flags], #31, 76f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + "76:" // Height 3: Multiply loop: unique 11: skip row sum + "ldr q8, [x11, #0x0]\n" + ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q9, [x11, #0x10]\n" + ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q10, [x11, #0x20]\n" + ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" + "ldr q4, [x11, #0x30]\n" + "sub x27, x27, #0x4\n" + ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "cmp x27, #0x4\n" + ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" + "add x11, x11, #0x40\n" + ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" + ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" + ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" + ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n" + "bge 75b\n" + "cbz x27, 81f\n" + "77:" // Height 3: Multiply loop: Skip odd blocks + "tbz x27, #1, 78f\n" + "ldr h0, [x26], #0x2\n" + "ldr h1, [x24], #0x2\n" + "ldr h2, [x22], #0x2\n" + "tbz x27, #0, 79f\n" + "ld1 { v0.b }[2], [x26]\n" + "ld1 { v1.b }[2], [x24]\n" + "ld1 { v2.b }[2], [x22]\n" + "b 79f\n" + "78:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x26, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "ldr b2, [x22, #0x0]\n" + "79:" // Height 3: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 80f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + "80:" // Height 3: Multiply loop: unique 12: skip row sum + "ldr q5, [x11, #0x0]\n" + ".inst 0x4f80e0b0 // sdot v16.4s, v5.16b, v0.4b[0]\n" + "ldr q6, [x11, #0x10]\n" + ".inst 0x4f81e0b4 // sdot v20.4s, v5.16b, v1.4b[0]\n" + "ldr q7, [x11, #0x20]\n" + ".inst 0x4f82e0b8 // sdot v24.4s, v5.16b, v2.4b[0]\n" + "ldr q8, [x11, #0x30]\n" + "add x11, x11, #0x40\n" + ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n" 
+ ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0f2 // sdot v18.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f6 // sdot v22.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0fa // sdot v26.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f80e113 // sdot v19.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4f81e117 // sdot v23.4s, v8.16b, v1.4b[0]\n" + ".inst 0x4f82e11b // sdot v27.4s, v8.16b, v2.4b[0]\n" + "81:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x19\n" + "bne 67b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbnz %x[flags], #31, 82f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1r { v3.4s }, [x19]\n" + "addp v12.4s, v12.4s, v12.4s\n" + "addp v13.4s, v13.4s, v13.4s\n" + "addp v11.4s, v11.4s, v11.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" + "addp v13.4s, v13.4s, v13.4s\n" + "neg v3.4s, v3.4s\n" + "mul v11.4s, v11.4s, v3.4s\n" + "mul v12.4s, v12.4s, v3.4s\n" + "mul v13.4s, v13.4s, v3.4s\n" + "82:" // Height 3: skip row sum fixup + "add v16.4s, v16.4s, v11.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v17.4s, v17.4s, v11.4s\n" + "ldr q0, [x10, #0x0]\n" + "add v18.4s, v18.4s, v11.4s\n" + "ldr q1, [x10, #0x10]\n" + "add v19.4s, v19.4s, v11.4s\n" + "ldr q2, [x10, #0x20]\n" + "add v20.4s, v20.4s, v12.4s\n" + "ldr q3, [x10, #0x30]\n" + "add v21.4s, v21.4s, v12.4s\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add v22.4s, v22.4s, v12.4s\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x19]\n" + "add v23.4s, v23.4s, v12.4s\n" + "add x10, x10, #0x40\n" + "add v24.4s, v24.4s, v13.4s\n" + "add v25.4s, v25.4s, v13.4s\n" + "add v26.4s, v26.4s, v13.4s\n" + "add v27.4s, v27.4s, v13.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "ld1r { v0.4s }, [x20]\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "sqrdmulh v22.4s, v22.4s, v4.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "sqrdmulh v24.4s, v24.4s, v4.4s\n" + "sqrdmulh v25.4s, v25.4s, v4.4s\n" + "sqrdmulh v26.4s, v26.4s, v4.4s\n" + "sqrdmulh v27.4s, v27.4s, v4.4s\n" + "tbz %x[flags], #5, 83f\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v17.16b, v0.16b\n" + "and v6.16b, v18.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v19.16b, v0.16b\n" + "and v8.16b, v20.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "and v9.16b, v21.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v10.16b, v22.16b, v0.16b\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "and v4.16b, v23.16b, v0.16b\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v24.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "sqadd v20.4s, v20.4s, v8.4s\n" + "sqadd v21.4s, v21.4s, v9.4s\n" + "sqadd 
v22.4s, v22.4s, v10.4s\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "and v6.16b, v25.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v24.4s, v24.4s, v5.4s\n" + "and v7.16b, v26.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v8.16b, v27.16b, v0.16b\n" + "sqadd v25.4s, v25.4s, v6.4s\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sqadd v26.4s, v26.4s, v7.4s\n" + "sqadd v27.4s, v27.4s, v8.4s\n" + "83:" // Height 3: no shift correction + "srshl v16.4s, v16.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x12, #0x10\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "srshl v21.4s, v21.4s, v0.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "srshl v25.4s, v25.4s, v0.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "srshl v26.4s, v26.4s, v0.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "srshl v27.4s, v27.4s, v0.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v24.8h, v24.8h, v25.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 92f\n" + "tbz x12, #3, 87f\n" + "str d16, [x9], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "tbz x12, #2, 85f\n" + "st1 { v16.s }[2], [x9], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "st1 { v24.s }[2], [x23], #0x4\n" + "tbz x12, #1, 84f\n" + "st1 { v16.h }[6], [x9], #0x2\n" + "st1 { v20.h }[6], [x25], #0x2\n" + "st1 { v24.h }[6], [x23], #0x2\n" + "tbz x12, #0, 91f\n" + "st1 { v16.b }[14], [x9]\n" + "st1 { v20.b }[14], [x25]\n" + "st1 { v24.b }[14], [x23]\n" + "b 91f\n" + "84:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x12, #0, 91f\n" + "st1 { v16.b }[12], [x9]\n" + "st1 { v20.b }[12], [x25]\n" + "st1 { v24.b }[12], [x23]\n" + "b 91f\n" + "85:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x12, #1, 86f\n" + "st1 { v16.h }[4], [x9], #0x2\n" + "st1 { v20.h }[4], [x25], #0x2\n" + "st1 { v24.h }[4], [x23], #0x2\n" + "tbz x12, #0, 91f\n" + "st1 { v16.b }[10], [x9]\n" + "st1 { v20.b }[10], [x25]\n" + "st1 { v24.b }[10], [x23]\n" + "b 91f\n" + "86:" // 
Height 3: Partial direct writeback: partial_1_8 + "tbz x12, #0, 91f\n" + "st1 { v16.b }[8], [x9]\n" + "st1 { v20.b }[8], [x25]\n" + "st1 { v24.b }[8], [x23]\n" + "b 91f\n" + "87:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x12, #2, 89f\n" + "str s16, [x9], #0x4\n" + "str s20, [x25], #0x4\n" + "str s24, [x23], #0x4\n" + "tbz x12, #1, 88f\n" + "st1 { v16.h }[2], [x9], #0x2\n" + "st1 { v20.h }[2], [x25], #0x2\n" + "st1 { v24.h }[2], [x23], #0x2\n" + "tbz x12, #0, 91f\n" + "st1 { v16.b }[6], [x9]\n" + "st1 { v20.b }[6], [x25]\n" + "st1 { v24.b }[6], [x23]\n" + "b 91f\n" + "88:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x12, #0, 91f\n" + "st1 { v16.b }[4], [x9]\n" + "st1 { v20.b }[4], [x25]\n" + "st1 { v24.b }[4], [x23]\n" + "b 91f\n" + "89:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x12, #1, 90f\n" + "str h16, [x9], #0x2\n" + "str h20, [x25], #0x2\n" + "str h24, [x23], #0x2\n" + "tbz x12, #0, 91f\n" + "st1 { v16.b }[2], [x9]\n" + "st1 { v20.b }[2], [x25]\n" + "st1 { v24.b }[2], [x23]\n" + "b 91f\n" + "90:" // Height 3: Partial direct writeback: partial_1_0 + "str b16, [x9, #0x0]\n" + "str b20, [x25, #0x0]\n" + "str b24, [x23, #0x0]\n" + "91:" // Height 3: Partial direct writeback: Done + "b 93f\n" + "92:" // Height 3: Full writeback + "str q16, [x9, #0x0]\n" + "str q20, [x25, #0x0]\n" + "str q24, [x23, #0x0]\n" + "add x9, x9, #0x10\n" + "add x25, x25, #0x10\n" + "add x23, x23, #0x10\n" + "93:" // Height 3: Writeback done + "subs x12, x12, #0x10\n" + "bgt 65b\n" + "b 126f\n" + "94:" // Height 4 + "movi v11.4s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov x10, %x[col_bias]\n" + "movi v12.4s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "movi v13.4s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "movi v14.4s, #0x0\n" + "movi v15.16b, #0x1\n" + "tbz %x[flags], #2, 95f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "ldr x25, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "ldr x23, [%x[output_ptr], #0x10]\n" + "ldr x21, [%x[output_ptr], #0x18]\n" + "add x25, x25, x19\n" + "add %x[output_ptr], %x[output_ptr], #0x20\n" + "add x23, x23, x19\n" + "add x21, x21, x19\n" + "b 96f\n" + "95:" // Height 4: setup direct output + "mov x9, %x[output_ptr]\n" + "add x25, x9, x19\n" + "add x23, x25, x19\n" + "add x21, x23, x19\n" + "add %x[output_ptr], x21, x19\n" + "96:" // Height 4: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "97:" // Height 4: setup done + "mov x28, #0x0\n" + "98:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 99f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "ldr x20, [x20, #0x18]\n" + "cbnz x28, 100f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 100f\n" + "99:" // Height 4: setup direct input + "mov x26, %x[input_ptr]\n" + "add 
x24, x26, x19\n" + "add x22, x24, x19\n" + "add x20, x22, x19\n" + "100:" // Height 4: input setup done + "cmp x27, #0x10\n" + "blt 105f\n" + "cmp x27, #0x20\n" + "blt 103f\n" + "101:" // Height 4: Multiply loop: Main loop head + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "ldr q3, [x20, #0x0]\n" + "ldr q4, [x11, #0x0]\n" + ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x10]\n" + ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n" + "ldr q6, [x11, #0x20]\n" + ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n" + "ldr q7, [x11, #0x30]\n" + ".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n" + "ldr q8, [x11, #0x40]\n" + "ldr q9, [x11, #0x50]\n" + ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q10, [x11, #0x60]\n" + ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n" + "ldr q4, [x11, #0x70]\n" + ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n" + "ldr q5, [x11, #0x80]\n" + ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x4f83e0de // sdot v30.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x11, #0x90]\n" + ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0ff // sdot v31.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x11, #0xa0]\n" + ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n" + ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n" + ".inst 0x4fa3e11c // sdot v28.4s, v8.16b, v3.4b[1]\n" + "ldr q8, [x11, #0xb0]\n" + ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n" + ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n" + ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n" + ".inst 0x4fa3e13d // sdot v29.4s, v9.16b, v3.4b[1]\n" + "ldr q9, [x11, #0xc0]\n" + ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n" + ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n" + ".inst 0x4fa3e15e // sdot v30.4s, v10.16b, v3.4b[1]\n" + "ldr q10, [x11, #0xd0]\n" + ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n" + ".inst 0x4fa3e09f // sdot v31.4s, v4.16b, v3.4b[1]\n" + "ldr q4, [x11, #0xe0]\n" + ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n" + ".inst 0x4f83e8bc // sdot v28.4s, v5.16b, v3.4b[2]\n" + "ldr q5, [x11, #0xf0]\n" + ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n" + "add x11, x11, #0x100\n" + ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8dd // sdot v29.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8fe // sdot v30.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x4f83e91f // 
sdot v31.4s, v8.16b, v3.4b[2]\n" + ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x4fa3e93c // sdot v28.4s, v9.16b, v3.4b[3]\n" + ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x4fa3e95d // sdot v29.4s, v10.16b, v3.4b[3]\n" + ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x4fa3e89e // sdot v30.4s, v4.16b, v3.4b[3]\n" + ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n" + ".inst 0x4fa3e8bf // sdot v31.4s, v5.16b, v3.4b[3]\n" + "tbnz %x[flags], #31, 102f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" + "102:" // Height 4: Multiply loop: unique 13: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x27, #0x20\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "bge 101b\n" + "103:" // Height 4: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "ldr q3, [x20, #0x0]\n" + "ldr q6, [x11, #0x0]\n" + ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x10]\n" + ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n" + "ldr q8, [x11, #0x20]\n" + ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n" + "ldr q9, [x11, #0x30]\n" + ".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n" + "ldr q10, [x11, #0x40]\n" + "ldr q4, [x11, #0x50]\n" + ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x60]\n" + ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n" + "ldr q6, [x11, #0x70]\n" + ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f83e0fd // sdot v29.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x11, #0x80]\n" + ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n" + "ldr q8, [x11, #0x90]\n" + ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n" + ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n" + ".inst 0x4f83e13f // sdot v31.4s, v9.16b, v3.4b[0]\n" + "ldr q9, [x11, #0xa0]\n" + ".inst 0x4fa0e150 // sdot v16.4s, v10.16b, v0.4b[1]\n" + ".inst 0x4fa1e154 // sdot v20.4s, v10.16b, v1.4b[1]\n" + ".inst 0x4fa2e158 // sdot v24.4s, v10.16b, v2.4b[1]\n" + ".inst 0x4fa3e15c // sdot v28.4s, v10.16b, v3.4b[1]\n" + "ldr q10, [x11, #0xb0]\n" + ".inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]\n" + ".inst 0x4fa1e095 // sdot v21.4s, v4.16b, v1.4b[1]\n" + ".inst 0x4fa2e099 // sdot v25.4s, v4.16b, v2.4b[1]\n" + ".inst 0x4fa3e09d // sdot v29.4s, v4.16b, v3.4b[1]\n" + "ldr q4, [x11, #0xc0]\n" + ".inst 0x4fa0e0b2 // sdot v18.4s, v5.16b, v0.4b[1]\n" + ".inst 0x4fa1e0b6 // sdot v22.4s, v5.16b, v1.4b[1]\n" + 
".inst 0x4fa2e0ba // sdot v26.4s, v5.16b, v2.4b[1]\n" + ".inst 0x4fa3e0be // sdot v30.4s, v5.16b, v3.4b[1]\n" + "ldr q5, [x11, #0xd0]\n" + ".inst 0x4fa0e0d3 // sdot v19.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0d7 // sdot v23.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0db // sdot v27.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0df // sdot v31.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x11, #0xe0]\n" + ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8f4 // sdot v20.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f8 // sdot v24.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8fc // sdot v28.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x11, #0xf0]\n" + ".inst 0x4f80e911 // sdot v17.4s, v8.16b, v0.4b[2]\n" + "add x11, x11, #0x100\n" + ".inst 0x4f81e915 // sdot v21.4s, v8.16b, v1.4b[2]\n" + ".inst 0x4f82e919 // sdot v25.4s, v8.16b, v2.4b[2]\n" + ".inst 0x4f83e91d // sdot v29.4s, v8.16b, v3.4b[2]\n" + ".inst 0x4f80e932 // sdot v18.4s, v9.16b, v0.4b[2]\n" + ".inst 0x4f81e936 // sdot v22.4s, v9.16b, v1.4b[2]\n" + ".inst 0x4f82e93a // sdot v26.4s, v9.16b, v2.4b[2]\n" + ".inst 0x4f83e93e // sdot v30.4s, v9.16b, v3.4b[2]\n" + ".inst 0x4f80e953 // sdot v19.4s, v10.16b, v0.4b[2]\n" + ".inst 0x4f81e957 // sdot v23.4s, v10.16b, v1.4b[2]\n" + ".inst 0x4f82e95b // sdot v27.4s, v10.16b, v2.4b[2]\n" + ".inst 0x4f83e95f // sdot v31.4s, v10.16b, v3.4b[2]\n" + ".inst 0x4fa0e890 // sdot v16.4s, v4.16b, v0.4b[3]\n" + ".inst 0x4fa1e894 // sdot v20.4s, v4.16b, v1.4b[3]\n" + ".inst 0x4fa2e898 // sdot v24.4s, v4.16b, v2.4b[3]\n" + ".inst 0x4fa3e89c // sdot v28.4s, v4.16b, v3.4b[3]\n" + ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n" + ".inst 0x4fa1e8b5 // sdot v21.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4fa2e8b9 // sdot v25.4s, v5.16b, v2.4b[3]\n" + ".inst 0x4fa3e8bd // sdot v29.4s, v5.16b, v3.4b[3]\n" + ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8d6 // sdot v22.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8da // sdot v26.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8de // sdot v30.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8f7 // sdot v23.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8ff // sdot v31.4s, v7.16b, v3.4b[3]\n" + "tbnz %x[flags], #31, 104f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" + "104:" // Height 4: Multiply loop: unique 14: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "105:" // Height 4: Multiply loop: Main loop skip + "cbz x27, 112f\n" + "cmp x27, #0x4\n" + "blt 108f\n" + "106:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x26], #0x4\n" + "ldr s1, [x24], #0x4\n" + "ldr s2, [x22], #0x4\n" + "ldr s3, [x20], #0x4\n" + "tbnz %x[flags], #31, 107f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" + "107:" // Height 4: Multiply loop: unique 15: skip row sum + "ldr q8, [x11, #0x0]\n" + ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q9, [x11, #0x10]\n" + ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q10, [x11, #0x20]\n" + ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" + "ldr q4, [x11, #0x30]\n" + ".inst 
0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" + "sub x27, x27, #0x4\n" + "add x11, x11, #0x40\n" + ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" + "cmp x27, #0x4\n" + ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" + ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" + ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" + ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" + ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" + ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" + ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" + ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n" + ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n" + ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n" + ".inst 0x4f83e09f // sdot v31.4s, v4.16b, v3.4b[0]\n" + "bge 106b\n" + "cbz x27, 112f\n" + "108:" // Height 4: Multiply loop: Skip odd blocks + "tbz x27, #1, 109f\n" + "ldr h0, [x26], #0x2\n" + "ldr h1, [x24], #0x2\n" + "ldr h2, [x22], #0x2\n" + "ldr h3, [x20], #0x2\n" + "tbz x27, #0, 110f\n" + "ld1 { v0.b }[2], [x26]\n" + "ld1 { v1.b }[2], [x24]\n" + "ld1 { v2.b }[2], [x22]\n" + "ld1 { v3.b }[2], [x20]\n" + "b 110f\n" + "109:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x26, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "ldr b2, [x22, #0x0]\n" + "ldr b3, [x20, #0x0]\n" + "110:" // Height 4: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 111f\n" + ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n" + ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n" + ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n" + ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n" + "111:" // Height 4: Multiply loop: unique 16: skip row sum + "ldr q5, [x11, #0x0]\n" + ".inst 0x4f80e0b0 // sdot v16.4s, v5.16b, v0.4b[0]\n" + "ldr q6, [x11, #0x10]\n" + ".inst 0x4f81e0b4 // sdot v20.4s, v5.16b, v1.4b[0]\n" + "ldr q7, [x11, #0x20]\n" + ".inst 0x4f82e0b8 // sdot v24.4s, v5.16b, v2.4b[0]\n" + "ldr q8, [x11, #0x30]\n" + ".inst 0x4f83e0bc // sdot v28.4s, v5.16b, v3.4b[0]\n" + "add x11, x11, #0x40\n" + ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0dd // sdot v29.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0f2 // sdot v18.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0f6 // sdot v22.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0fa // sdot v26.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0fe // sdot v30.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f80e113 // sdot v19.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4f81e117 // sdot v23.4s, v8.16b, v1.4b[0]\n" + ".inst 0x4f82e11b // sdot v27.4s, v8.16b, v2.4b[0]\n" + ".inst 0x4f83e11f // sdot v31.4s, v8.16b, v3.4b[0]\n" + "112:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x19\n" + "bne 98b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbnz %x[flags], #31, 113f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "addp v12.4s, v12.4s, v12.4s\n" + "addp v13.4s, v13.4s, v13.4s\n" + "addp v14.4s, v14.4s, v14.4s\n" + "addp v11.4s, v11.4s, v11.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" + "addp v13.4s, v13.4s, v13.4s\n" + "addp v14.4s, v14.4s, v14.4s\n" + "neg v4.4s, v4.4s\n" + "mul v11.4s, v11.4s, v4.4s\n" + "mul v12.4s, v12.4s, v4.4s\n" + "mul v13.4s, v13.4s, v4.4s\n" + "mul v14.4s, v14.4s, 
v4.4s\n" + "113:" // Height 4: skip row sum fixup + "add v16.4s, v16.4s, v11.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v17.4s, v17.4s, v11.4s\n" + "ldr q0, [x10, #0x0]\n" + "add v18.4s, v18.4s, v11.4s\n" + "ldr q1, [x10, #0x10]\n" + "add v19.4s, v19.4s, v11.4s\n" + "ldr q2, [x10, #0x20]\n" + "add v20.4s, v20.4s, v12.4s\n" + "ldr q3, [x10, #0x30]\n" + "add v21.4s, v21.4s, v12.4s\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add v22.4s, v22.4s, v12.4s\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x19]\n" + "add v23.4s, v23.4s, v12.4s\n" + "add x10, x10, #0x40\n" + "add v24.4s, v24.4s, v13.4s\n" + "add v25.4s, v25.4s, v13.4s\n" + "add v26.4s, v26.4s, v13.4s\n" + "add v27.4s, v27.4s, v13.4s\n" + "add v28.4s, v28.4s, v14.4s\n" + "add v29.4s, v29.4s, v14.4s\n" + "add v30.4s, v30.4s, v14.4s\n" + "add v31.4s, v31.4s, v14.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "add v28.4s, v28.4s, v0.4s\n" + "ld1r { v0.4s }, [x20]\n" + "add v29.4s, v29.4s, v1.4s\n" + "add v30.4s, v30.4s, v2.4s\n" + "add v31.4s, v31.4s, v3.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "sqrdmulh v22.4s, v22.4s, v4.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "sqrdmulh v24.4s, v24.4s, v4.4s\n" + "sqrdmulh v25.4s, v25.4s, v4.4s\n" + "sqrdmulh v26.4s, v26.4s, v4.4s\n" + "sqrdmulh v27.4s, v27.4s, v4.4s\n" + "sqrdmulh v28.4s, v28.4s, v4.4s\n" + "sqrdmulh v29.4s, v29.4s, v4.4s\n" + "sqrdmulh v30.4s, v30.4s, v4.4s\n" + "sqrdmulh v31.4s, v31.4s, v4.4s\n" + "tbz %x[flags], #5, 114f\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v17.16b, v0.16b\n" + "and v6.16b, v18.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v19.16b, v0.16b\n" + "and v8.16b, v20.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "and v9.16b, v21.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v10.16b, v22.16b, v0.16b\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "and v4.16b, v23.16b, v0.16b\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v24.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "sqadd v20.4s, v20.4s, v8.4s\n" + "sqadd v21.4s, v21.4s, v9.4s\n" + "sqadd v22.4s, v22.4s, v10.4s\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "and v6.16b, v25.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v24.4s, v24.4s, v5.4s\n" + "and v7.16b, v26.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v8.16b, v27.16b, v0.16b\n" + "and v9.16b, v28.16b, v0.16b\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sqadd v25.4s, v25.4s, v6.4s\n" + "and v10.16b, v29.16b, v0.16b\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "and v4.16b, v30.16b, v0.16b\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v26.4s, v26.4s, v7.4s\n" + "and v5.16b, v31.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v27.4s, v27.4s, v8.4s\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v28.4s, v28.4s, v9.4s\n" + "sqadd v29.4s, v29.4s, 
v10.4s\n" + "sqadd v30.4s, v30.4s, v4.4s\n" + "sqadd v31.4s, v31.4s, v5.4s\n" + "114:" // Height 4: no shift correction + "srshl v16.4s, v16.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x12, #0x10\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "srshl v21.4s, v21.4s, v0.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "srshl v25.4s, v25.4s, v0.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "srshl v26.4s, v26.4s, v0.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "srshl v27.4s, v27.4s, v0.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "srshl v28.4s, v28.4s, v0.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "srshl v29.4s, v29.4s, v0.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "srshl v30.4s, v30.4s, v0.4s\n" + "smin v28.4s, v28.4s, v6.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" + "srshl v31.4s, v31.4s, v0.4s\n" + "smax v28.4s, v28.4s, v5.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" + "add v30.4s, v30.4s, v4.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "smin v30.4s, v30.4s, v6.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "smax v30.4s, v30.4s, v5.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v24.8h, v24.8h, v25.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v28.8h, v28.8h, v29.8h\n" + "uzp1 v29.8h, v30.8h, v31.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v28.16b, v28.16b, v29.16b\n" + "bge 123f\n" + "tbz x12, #3, 118f\n" + "str d16, [x9], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "str d28, [x21], #0x8\n" + "tbz x12, #2, 116f\n" + "st1 { v16.s }[2], [x9], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "st1 { v24.s }[2], [x23], #0x4\n" + "st1 { v28.s }[2], [x21], #0x4\n" + "tbz x12, #1, 115f\n" + "st1 { v16.h }[6], [x9], #0x2\n" + "st1 { v20.h }[6], [x25], #0x2\n" + "st1 { v24.h }[6], [x23], #0x2\n" + "st1 { v28.h }[6], [x21], #0x2\n" + "tbz x12, #0, 122f\n" + "st1 { v16.b }[14], [x9]\n" + "st1 { v20.b }[14], [x25]\n" + "st1 { v24.b }[14], [x23]\n" + "st1 { v28.b }[14], [x21]\n" + "b 122f\n" + "115:" // Height 4: Partial direct writeback: 
partial_1_12 + "tbz x12, #0, 122f\n" + "st1 { v16.b }[12], [x9]\n" + "st1 { v20.b }[12], [x25]\n" + "st1 { v24.b }[12], [x23]\n" + "st1 { v28.b }[12], [x21]\n" + "b 122f\n" + "116:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x12, #1, 117f\n" + "st1 { v16.h }[4], [x9], #0x2\n" + "st1 { v20.h }[4], [x25], #0x2\n" + "st1 { v24.h }[4], [x23], #0x2\n" + "st1 { v28.h }[4], [x21], #0x2\n" + "tbz x12, #0, 122f\n" + "st1 { v16.b }[10], [x9]\n" + "st1 { v20.b }[10], [x25]\n" + "st1 { v24.b }[10], [x23]\n" + "st1 { v28.b }[10], [x21]\n" + "b 122f\n" + "117:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x12, #0, 122f\n" + "st1 { v16.b }[8], [x9]\n" + "st1 { v20.b }[8], [x25]\n" + "st1 { v24.b }[8], [x23]\n" + "st1 { v28.b }[8], [x21]\n" + "b 122f\n" + "118:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x12, #2, 120f\n" + "str s16, [x9], #0x4\n" + "str s20, [x25], #0x4\n" + "str s24, [x23], #0x4\n" + "str s28, [x21], #0x4\n" + "tbz x12, #1, 119f\n" + "st1 { v16.h }[2], [x9], #0x2\n" + "st1 { v20.h }[2], [x25], #0x2\n" + "st1 { v24.h }[2], [x23], #0x2\n" + "st1 { v28.h }[2], [x21], #0x2\n" + "tbz x12, #0, 122f\n" + "st1 { v16.b }[6], [x9]\n" + "st1 { v20.b }[6], [x25]\n" + "st1 { v24.b }[6], [x23]\n" + "st1 { v28.b }[6], [x21]\n" + "b 122f\n" + "119:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x12, #0, 122f\n" + "st1 { v16.b }[4], [x9]\n" + "st1 { v20.b }[4], [x25]\n" + "st1 { v24.b }[4], [x23]\n" + "st1 { v28.b }[4], [x21]\n" + "b 122f\n" + "120:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x12, #1, 121f\n" + "str h16, [x9], #0x2\n" + "str h20, [x25], #0x2\n" + "str h24, [x23], #0x2\n" + "str h28, [x21], #0x2\n" + "tbz x12, #0, 122f\n" + "st1 { v16.b }[2], [x9]\n" + "st1 { v20.b }[2], [x25]\n" + "st1 { v24.b }[2], [x23]\n" + "st1 { v28.b }[2], [x21]\n" + "b 122f\n" + "121:" // Height 4: Partial direct writeback: partial_1_0 + "str b16, [x9, #0x0]\n" + "str b20, [x25, #0x0]\n" + "str b24, [x23, #0x0]\n" + "str b28, [x21, #0x0]\n" + "122:" // Height 4: Partial direct writeback: Done + "b 124f\n" + "123:" // Height 4: Full writeback + "str q16, [x9, #0x0]\n" + "str q20, [x25, #0x0]\n" + "str q24, [x23, #0x0]\n" + "str q28, [x21, #0x0]\n" + "add x9, x9, #0x10\n" + "add x25, x25, #0x10\n" + "add x23, x23, #0x10\n" + "add x21, x21, #0x10\n" + "124:" // Height 4: Writeback done + "subs x12, x12, #0x10\n" + "bgt 96b\n" + "subs %x[M], %x[M], #0x4\n" + "beq 126f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 125f\n" + "add x20, x20, #0x4\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "125:" // Update direct input + "mov x19, #0x4\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "126:" // Exit + + : [M] "+r" (M), [flags] "+r" (flags), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), 
[per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp new file mode 100644 index 0000000000..6d4f3b2efe --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#pragma once +#ifdef __aarch64__ + +#include "../std_transforms_fixed.hpp" + +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg, \ + size_t, size_t, \ + const int8_t *, \ + IndirectOutputArg, \ + const Requantize32 *, const int32_t *, unsigned int + +namespace arm_gemm +{ + +// Actual kernel implementations +void a64_hybrid_s8qs_dot_6x16( ARGLIST ); + +class cls_a64_hybrid_s8qs_dot_6x16 +{ +public: + typedef int8_t operand_type; + typedef int8_t result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 6; + } + + static unsigned int out_width() + { + return 16; + } + + static constexpr unsigned int k_unroll() + { + return 4; + } + + static constexpr bool supports_accumulate() + { + return false; + } + + StdTransformsFixed transforms = {}; + + // Default to the generic kernel + kern_type kernel=a64_hybrid_s8qs_dot_6x16; + + cls_a64_hybrid_s8qs_dot_6x16(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp new file mode 100644 index 0000000000..0e98ab8347 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp @@ -0,0 +1,3613 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp new file mode 100644 index 0000000000..0e98ab8347 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp @@ -0,0 +1,3613 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> +#include <cstdint> + +namespace arm_gemm { + +void a64_hybrid_s8qs_dot_6x16 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg, + size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg, + const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base +) +{ + struct KernelArgs { + const int32_t *multiplier_ptr = {}; + const int32_t *shift_ptr = {}; + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const int8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; // bit 2: output pointers come from an indirect table + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; // bit 3: input rows come from an indirect table + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + if (qp->per_channel_requant) { + flags |= 0x10; // bit 4: per-channel multipliers/shifts instead of per-layer values + ka.multiplier_ptr=qp->per_channel_muls + col_base; + ka.shift_ptr=qp->per_channel_right_shifts + col_base; + } + if (qp->c_offset > qp->minval) { + flags |= 0x20; // bit 5: enable the rounding (shift) correction before SRSHL + } + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 141f\n" + "cmp %x[M], #0x4\n" + "bgt 113f\n" + "beq 85f\n" + "cmp %x[M], #0x2\n" + "bgt 57f\n" + "beq 29f\n" + "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x16, %x[col_bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x13, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + 
"4:" // Height 1: setup done + "mov x12, #0x0\n" + "5:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 6f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "cbnz x12, 7f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "b 7f\n" + "6:" // Height 1: setup direct input + "mov x10, %x[input_ptr]\n" + "7:" // Height 1: input setup done + "cmp x11, #0x10\n" + "blt 10f\n" + "cmp x11, #0x20\n" + "blt 9f\n" + "8:" // Height 1: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + "sub x11, x11, #0x10\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + "cmp x11, #0x20\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "bge 8b\n" + "9:" // Height 1: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + "ldr 
q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "10:" // Height 1: Multiply loop: Main loop skip + "cbz x11, 15f\n" + "cmp x11, #0x4\n" + "blt 12f\n" + "11:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "sub x11, x11, #0x4\n" + "add x14, x14, #0x40\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "cmp x11, #0x4\n" + "bge 11b\n" + "cbz x11, 15f\n" + "12:" // Height 1: Multiply loop: Skip odd blocks + "tbz x11, #1, 13f\n" + "ldr h0, [x10], #0x2\n" + "tbz x11, #0, 14f\n" + "ld1 { v0.b }[2], [x10]\n" + "b 14f\n" + "13:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "14:" // Height 1: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "15:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 5b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "ldr q0, [x16, #0x0]\n" + "add v8.4s, v8.4s, v0.4s\n" + "ldr q1, [x16, #0x10]\n" + "ldr q2, [x16, #0x20]\n" + "add v9.4s, v9.4s, v1.4s\n" + "ldr q3, [x16, #0x30]\n" + "add v10.4s, v10.4s, v2.4s\n" + "add x16, x16, #0x40\n" + "add v11.4s, v11.4s, v3.4s\n" + "tbz %x[flags], #4, 16f\n" + "ldr q0, [x17, #0x0]\n" + "ldr q4, [x8, #0x0]\n" + "ldr q1, [x17, #0x10]\n" + "ldr q5, [x8, #0x10]\n" + "ldr q2, [x17, #0x20]\n" + "ldr q6, [x8, #0x20]\n" + "ldr q3, [x17, #0x30]\n" + "ldr q7, [x8, #0x30]\n" + "add x17, x17, #0x40\n" + "add x8, x8, #0x40\n" + "b 17f\n" + "16:" // Height 1: per layer parameters + "add x19, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x19]\n" + "mov v1.16b, v0.16b\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x19]\n" + "mov v2.16b, v0.16b\n" + "mov v3.16b, v0.16b\n" + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v7.16b, v4.16b\n" + "17:" // Height 1: parameters loaded + "sqrdmulh v8.4s, v8.4s, v4.4s\n" + "sqrdmulh v9.4s, v9.4s, v5.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sqrdmulh v11.4s, v11.4s, v7.4s\n" + "tbz %x[flags], #5, 18f\n" + "and v4.16b, v8.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v9.16b, v1.16b\n" + "and v6.16b, v10.16b, v2.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v11.16b, v3.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v4.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v9.4s, v9.4s, v5.4s\n" + "sqadd v10.4s, v10.4s, v6.4s\n" + "sqadd v11.4s, v11.4s, v7.4s\n" + "18:" // Height 1: no shift correction + "srshl v8.4s, v8.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v9.4s, v9.4s, v1.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v10.4s, v10.4s, v2.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x15, #0x10\n" + "add v8.4s, v8.4s, v4.4s\n" 
+ "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "smax v11.4s, v11.4s, v5.4s\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "bge 27f\n" + "tbz x15, #3, 22f\n" + "str d8, [x13], #0x8\n" + "tbz x15, #2, 20f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "tbz x15, #1, 19f\n" + "st1 { v8.h }[6], [x13], #0x2\n" + "tbz x15, #0, 26f\n" + "st1 { v8.b }[14], [x13]\n" + "b 26f\n" + "19:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x15, #0, 26f\n" + "st1 { v8.b }[12], [x13]\n" + "b 26f\n" + "20:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x15, #1, 21f\n" + "st1 { v8.h }[4], [x13], #0x2\n" + "tbz x15, #0, 26f\n" + "st1 { v8.b }[10], [x13]\n" + "b 26f\n" + "21:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x15, #0, 26f\n" + "st1 { v8.b }[8], [x13]\n" + "b 26f\n" + "22:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x15, #2, 24f\n" + "str s8, [x13], #0x4\n" + "tbz x15, #1, 23f\n" + "st1 { v8.h }[2], [x13], #0x2\n" + "tbz x15, #0, 26f\n" + "st1 { v8.b }[6], [x13]\n" + "b 26f\n" + "23:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x15, #0, 26f\n" + "st1 { v8.b }[4], [x13]\n" + "b 26f\n" + "24:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x15, #1, 25f\n" + "str h8, [x13], #0x2\n" + "tbz x15, #0, 26f\n" + "st1 { v8.b }[2], [x13]\n" + "b 26f\n" + "25:" // Height 1: Partial direct writeback: partial_1_0 + "str b8, [x13, #0x0]\n" + "26:" // Height 1: Partial direct writeback: Done + "b 28f\n" + "27:" // Height 1: Full writeback + "str q8, [x13, #0x0]\n" + "add x13, x13, #0x10\n" + "28:" // Height 1: Writeback done + "subs x15, x15, #0x10\n" + "bgt 3b\n" + "b 170f\n" + "29:" // Height 2 + "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x16, %x[col_bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 30f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "b 31f\n" + "30:" // Height 2: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19\n" + "31:" // Height 2: Column loop + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "32:" // Height 2: setup done + "mov x12, #0x0\n" + "33:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 34f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x12, 35f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "b 35f\n" + "34:" // Height 2: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "35:" // Height 2: input setup done + "cmp x11, #0x10\n" + "blt 38f\n" + "cmp x11, #0x20\n" + "blt 37f\n" + "36:" // Height 2: Multiply loop: Main loop head + 
"ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "sub x11, x11, #0x10\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + "cmp x11, #0x20\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "bge 36b\n" + "37:" // Height 2: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed 
// sdot v13.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "38:" // Height 2: Multiply loop: Main loop skip + "cbz x11, 43f\n" + "cmp x11, #0x4\n" + "blt 40f\n" + "39:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + "sub x11, x11, #0x4\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "cmp x11, #0x4\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "bge 39b\n" + "cbz x11, 43f\n" + "40:" // Height 2: Multiply loop: Skip odd blocks + "tbz x11, #1, 41f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "tbz x11, #0, 42f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "b 42f\n" + "41:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "42:" // Height 2: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "43:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 33b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "ldr q0, [x16, #0x0]\n" + "add v8.4s, v8.4s, 
v0.4s\n" + "ldr q1, [x16, #0x10]\n" + "add v12.4s, v12.4s, v0.4s\n" + "ldr q2, [x16, #0x20]\n" + "ldr q3, [x16, #0x30]\n" + "add v9.4s, v9.4s, v1.4s\n" + "add x16, x16, #0x40\n" + "add v13.4s, v13.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v14.4s, v14.4s, v2.4s\n" + "add v15.4s, v15.4s, v3.4s\n" + "tbz %x[flags], #4, 44f\n" + "ldr q0, [x17, #0x0]\n" + "ldr q4, [x8, #0x0]\n" + "ldr q1, [x17, #0x10]\n" + "ldr q5, [x8, #0x10]\n" + "ldr q2, [x17, #0x20]\n" + "ldr q6, [x8, #0x20]\n" + "ldr q3, [x17, #0x30]\n" + "ldr q7, [x8, #0x30]\n" + "add x17, x17, #0x40\n" + "add x8, x8, #0x40\n" + "b 45f\n" + "44:" // Height 2: per layer parameters + "add x19, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x19]\n" + "mov v1.16b, v0.16b\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x19]\n" + "mov v2.16b, v0.16b\n" + "mov v3.16b, v0.16b\n" + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v7.16b, v4.16b\n" + "45:" // Height 2: parameters loaded + "sqrdmulh v8.4s, v8.4s, v4.4s\n" + "sqrdmulh v9.4s, v9.4s, v5.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sqrdmulh v11.4s, v11.4s, v7.4s\n" + "sqrdmulh v12.4s, v12.4s, v4.4s\n" + "sqrdmulh v13.4s, v13.4s, v5.4s\n" + "sqrdmulh v14.4s, v14.4s, v6.4s\n" + "sqrdmulh v15.4s, v15.4s, v7.4s\n" + "tbz %x[flags], #5, 46f\n" + "and v4.16b, v8.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v9.16b, v1.16b\n" + "and v6.16b, v10.16b, v2.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v11.16b, v3.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v4.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v4.16b, v12.16b, v0.16b\n" + "sqadd v9.4s, v9.4s, v5.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v10.4s, v10.4s, v6.4s\n" + "and v5.16b, v13.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v11.4s, v11.4s, v7.4s\n" + "and v6.16b, v14.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v12.4s, v12.4s, v4.4s\n" + "and v7.16b, v15.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v13.4s, v13.4s, v5.4s\n" + "sqadd v14.4s, v14.4s, v6.4s\n" + "sqadd v15.4s, v15.4s, v7.4s\n" + "46:" // Height 2: no shift correction + "srshl v8.4s, v8.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v9.4s, v9.4s, v1.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v10.4s, v10.4s, v2.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x15, #0x10\n" + "srshl v12.4s, v12.4s, v0.4s\n" + "srshl v13.4s, v13.4s, v1.4s\n" + "srshl v14.4s, v14.4s, v2.4s\n" + "srshl v15.4s, v15.4s, v3.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "uzp1 v12.8h, v12.8h, v13.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 
v13.8h, v14.8h, v15.8h\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" + "bge 55f\n" + "tbz x15, #3, 50f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "tbz x15, #2, 48f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], [x9], #0x4\n" + "tbz x15, #1, 47f\n" + "st1 { v8.h }[6], [x13], #0x2\n" + "st1 { v12.h }[6], [x9], #0x2\n" + "tbz x15, #0, 54f\n" + "st1 { v8.b }[14], [x13]\n" + "st1 { v12.b }[14], [x9]\n" + "b 54f\n" + "47:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x15, #0, 54f\n" + "st1 { v8.b }[12], [x13]\n" + "st1 { v12.b }[12], [x9]\n" + "b 54f\n" + "48:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x15, #1, 49f\n" + "st1 { v8.h }[4], [x13], #0x2\n" + "st1 { v12.h }[4], [x9], #0x2\n" + "tbz x15, #0, 54f\n" + "st1 { v8.b }[10], [x13]\n" + "st1 { v12.b }[10], [x9]\n" + "b 54f\n" + "49:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x15, #0, 54f\n" + "st1 { v8.b }[8], [x13]\n" + "st1 { v12.b }[8], [x9]\n" + "b 54f\n" + "50:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x15, #2, 52f\n" + "str s8, [x13], #0x4\n" + "str s12, [x9], #0x4\n" + "tbz x15, #1, 51f\n" + "st1 { v8.h }[2], [x13], #0x2\n" + "st1 { v12.h }[2], [x9], #0x2\n" + "tbz x15, #0, 54f\n" + "st1 { v8.b }[6], [x13]\n" + "st1 { v12.b }[6], [x9]\n" + "b 54f\n" + "51:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x15, #0, 54f\n" + "st1 { v8.b }[4], [x13]\n" + "st1 { v12.b }[4], [x9]\n" + "b 54f\n" + "52:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x15, #1, 53f\n" + "str h8, [x13], #0x2\n" + "str h12, [x9], #0x2\n" + "tbz x15, #0, 54f\n" + "st1 { v8.b }[2], [x13]\n" + "st1 { v12.b }[2], [x9]\n" + "b 54f\n" + "53:" // Height 2: Partial direct writeback: partial_1_0 + "str b8, [x13, #0x0]\n" + "str b12, [x9, #0x0]\n" + "54:" // Height 2: Partial direct writeback: Done + "b 56f\n" + "55:" // Height 2: Full writeback + "str q8, [x13, #0x0]\n" + "str q12, [x9, #0x0]\n" + "add x13, x13, #0x10\n" + "add x9, x9, #0x10\n" + "56:" // Height 2: Writeback done + "subs x15, x15, #0x10\n" + "bgt 31b\n" + "b 170f\n" + "57:" // Height 3 + "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x16, %x[col_bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 58f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19\n" + "add x27, x27, x19\n" + "b 59f\n" + "58:" // Height 3: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19\n" + "add x27, x9, x19\n" + "59:" // Height 3: Column loop + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "60:" // Height 3: setup done + "mov x12, #0x0\n" + "61:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 62f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x12, 63f\n" + "ldr x19, [%x[args_ptr], 
%[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "b 63f\n" + "62:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "63:" // Height 3: input setup done + "cmp x11, #0x10\n" + "blt 66f\n" + "cmp x11, #0x20\n" + "blt 65f\n" + "64:" // Height 3: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x28, x28, #0x10\n" + "prfm pldl1keep, [x28, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "sub x11, x11, #0x10\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + "cmp x11, #0x20\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, 
v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + "bge 64b\n" + "65:" // Height 3: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x28, x28, #0x10\n" + "prfm pldl1keep, [x28, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, 
v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + "66:" // Height 3: Multiply loop: Main loop skip + "cbz x11, 71f\n" + "cmp x11, #0x4\n" + "blt 68f\n" + "67:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "sub x11, x11, #0x4\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + "cmp x11, #0x4\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "bge 67b\n" + "cbz x11, 71f\n" + "68:" // Height 3: Multiply loop: Skip odd blocks + "tbz x11, #1, 69f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "tbz x11, #0, 70f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x26]\n" + "b 70f\n" + "69:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x26, #0x0]\n" + "70:" // Height 3: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "71:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 61b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "ldr q0, [x16, #0x0]\n" + "add v8.4s, v8.4s, v0.4s\n" + "prfm pstl1keep, [x27, #0x0]\n" + "ldr q1, [x16, #0x10]\n" + "add v12.4s, v12.4s, v0.4s\n" + "ldr q2, [x16, #0x20]\n" + "add v16.4s, v16.4s, v0.4s\n" + "ldr q3, [x16, #0x30]\n" + "add x16, x16, #0x40\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v13.4s, v13.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v14.4s, v14.4s, v2.4s\n" + "add v15.4s, v15.4s, v3.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "tbz %x[flags], #4, 72f\n" + "ldr q0, [x17, #0x0]\n" + "ldr q4, [x8, #0x0]\n" + "ldr q1, [x17, #0x10]\n" + "ldr q5, [x8, #0x10]\n" + 
"ldr q2, [x17, #0x20]\n" + "ldr q6, [x8, #0x20]\n" + "ldr q3, [x17, #0x30]\n" + "ldr q7, [x8, #0x30]\n" + "add x17, x17, #0x40\n" + "add x8, x8, #0x40\n" + "b 73f\n" + "72:" // Height 3: per layer parameters + "add x19, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x19]\n" + "mov v1.16b, v0.16b\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x19]\n" + "mov v2.16b, v0.16b\n" + "mov v3.16b, v0.16b\n" + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v7.16b, v4.16b\n" + "73:" // Height 3: parameters loaded + "sqrdmulh v8.4s, v8.4s, v4.4s\n" + "sqrdmulh v9.4s, v9.4s, v5.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sqrdmulh v11.4s, v11.4s, v7.4s\n" + "sqrdmulh v12.4s, v12.4s, v4.4s\n" + "sqrdmulh v13.4s, v13.4s, v5.4s\n" + "sqrdmulh v14.4s, v14.4s, v6.4s\n" + "sqrdmulh v15.4s, v15.4s, v7.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v5.4s\n" + "sqrdmulh v18.4s, v18.4s, v6.4s\n" + "sqrdmulh v19.4s, v19.4s, v7.4s\n" + "tbz %x[flags], #5, 74f\n" + "and v4.16b, v8.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v9.16b, v1.16b\n" + "and v6.16b, v10.16b, v2.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v11.16b, v3.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v4.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v4.16b, v12.16b, v0.16b\n" + "sqadd v9.4s, v9.4s, v5.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v10.4s, v10.4s, v6.4s\n" + "and v5.16b, v13.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v11.4s, v11.4s, v7.4s\n" + "and v6.16b, v14.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v12.4s, v12.4s, v4.4s\n" + "and v7.16b, v15.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v13.4s, v13.4s, v5.4s\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v14.4s, v14.4s, v6.4s\n" + "and v5.16b, v17.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v15.4s, v15.4s, v7.4s\n" + "and v6.16b, v18.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v7.16b, v19.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "74:" // Height 3: no shift correction + "srshl v8.4s, v8.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v9.4s, v9.4s, v1.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v10.4s, v10.4s, v2.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x15, #0x10\n" + "srshl v12.4s, v12.4s, v0.4s\n" + "srshl v13.4s, v13.4s, v1.4s\n" + "srshl v14.4s, v14.4s, v2.4s\n" + "srshl v15.4s, v15.4s, v3.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "srshl v17.4s, v17.4s, v1.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "add 
v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "srshl v18.4s, v18.4s, v2.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "srshl v19.4s, v19.4s, v3.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "uzp1 v12.8h, v12.8h, v13.8h\n" + "uzp1 v13.8h, v14.8h, v15.8h\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "bge 83f\n" + "tbz x15, #3, 78f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "tbz x15, #2, 76f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], [x9], #0x4\n" + "st1 { v16.s }[2], [x27], #0x4\n" + "tbz x15, #1, 75f\n" + "st1 { v8.h }[6], [x13], #0x2\n" + "st1 { v12.h }[6], [x9], #0x2\n" + "st1 { v16.h }[6], [x27], #0x2\n" + "tbz x15, #0, 82f\n" + "st1 { v8.b }[14], [x13]\n" + "st1 { v12.b }[14], [x9]\n" + "st1 { v16.b }[14], [x27]\n" + "b 82f\n" + "75:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x15, #0, 82f\n" + "st1 { v8.b }[12], [x13]\n" + "st1 { v12.b }[12], [x9]\n" + "st1 { v16.b }[12], [x27]\n" + "b 82f\n" + "76:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x15, #1, 77f\n" + "st1 { v8.h }[4], [x13], #0x2\n" + "st1 { v12.h }[4], [x9], #0x2\n" + "st1 { v16.h }[4], [x27], #0x2\n" + "tbz x15, #0, 82f\n" + "st1 { v8.b }[10], [x13]\n" + "st1 { v12.b }[10], [x9]\n" + "st1 { v16.b }[10], [x27]\n" + "b 82f\n" + "77:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x15, #0, 82f\n" + "st1 { v8.b }[8], [x13]\n" + "st1 { v12.b }[8], [x9]\n" + "st1 { v16.b }[8], [x27]\n" + "b 82f\n" + "78:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x15, #2, 80f\n" + "str s8, [x13], #0x4\n" + "str s12, [x9], #0x4\n" + "str s16, [x27], #0x4\n" + "tbz x15, #1, 79f\n" + "st1 { v8.h }[2], [x13], #0x2\n" + "st1 { v12.h }[2], [x9], #0x2\n" + "st1 { v16.h }[2], [x27], #0x2\n" + "tbz x15, #0, 82f\n" + "st1 { v8.b }[6], [x13]\n" + "st1 { v12.b }[6], [x9]\n" + "st1 { v16.b }[6], [x27]\n" + "b 82f\n" + "79:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x15, #0, 82f\n" + "st1 { v8.b }[4], [x13]\n" + "st1 { v12.b }[4], [x9]\n" + "st1 { v16.b }[4], [x27]\n" + "b 82f\n" + "80:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x15, #1, 81f\n" + "str h8, [x13], #0x2\n" + "str h12, [x9], #0x2\n" + "str h16, [x27], #0x2\n" + "tbz x15, #0, 82f\n" + "st1 { v8.b }[2], [x13]\n" + "st1 { v12.b }[2], [x9]\n" + "st1 { v16.b }[2], [x27]\n" + "b 82f\n" + "81:" // Height 3: Partial direct writeback: partial_1_0 + "str b8, [x13, #0x0]\n" + "str b12, [x9, #0x0]\n" + "str b16, [x27, #0x0]\n" + "82:" // Height 3: Partial direct writeback: Done + "b 84f\n" + "83:" // Height 3: Full writeback + "str q8, [x13, #0x0]\n" + "str q12, [x9, #0x0]\n" + "str q16, [x27, #0x0]\n" + "add x13, x13, #0x10\n" + "add x9, x9, #0x10\n" + "add x27, x27, #0x10\n" + "84:" // Height 3: Writeback done + "subs x15, x15, #0x10\n" + "bgt 59b\n" + "b 170f\n" + "85:" // Height 4 + "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x16, %x[col_bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], 
%[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 86f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "add x27, x27, x19\n" + "add x25, x25, x19\n" + "b 87f\n" + "86:" // Height 4: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19\n" + "add x27, x9, x19\n" + "add x25, x27, x19\n" + "87:" // Height 4: Column loop + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "88:" // Height 4: setup done + "mov x12, #0x0\n" + "89:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 90f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x12, 91f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "b 91f\n" + "90:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "91:" // Height 4: input setup done + "cmp x11, #0x10\n" + "blt 94f\n" + "cmp x11, #0x20\n" + "blt 93f\n" + "92:" // Height 4: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sub x11, x11, #0x10\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "cmp x11, #0x20\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x14, 
#0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + "bge 92b\n" + "93:" // Height 4: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x24, x24, 
#0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + 
".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + "94:" // Height 4: Multiply loop: Main loop skip + "cbz x11, 99f\n" + "cmp x11, #0x4\n" + "blt 96f\n" + "95:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "sub x11, x11, #0x4\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "cmp x11, #0x4\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "bge 95b\n" + "cbz x11, 99f\n" + "96:" // Height 4: Multiply loop: Skip odd blocks + "tbz x11, #1, 97f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "tbz x11, #0, 98f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x26]\n" + "ld1 { v3.b }[2], [x24]\n" + "b 98f\n" + "97:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x26, #0x0]\n" + "ldr b3, [x24, #0x0]\n" + "98:" // Height 4: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "99:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 89b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "ldr q0, [x16, #0x0]\n" + "add v8.4s, v8.4s, v0.4s\n" + "prfm pstl1keep, [x27, #0x0]\n" + "ldr q1, [x16, #0x10]\n" + "add v12.4s, v12.4s, v0.4s\n" + "prfm pstl1keep, [x25, #0x0]\n" + "ldr q2, [x16, #0x20]\n" + "add v16.4s, v16.4s, v0.4s\n" + "ldr q3, 
[x16, #0x30]\n" + "add v20.4s, v20.4s, v0.4s\n" + "add x16, x16, #0x40\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v13.4s, v13.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v14.4s, v14.4s, v2.4s\n" + "add v15.4s, v15.4s, v3.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "tbz %x[flags], #4, 100f\n" + "ldr q0, [x17, #0x0]\n" + "ldr q4, [x8, #0x0]\n" + "ldr q1, [x17, #0x10]\n" + "ldr q5, [x8, #0x10]\n" + "ldr q2, [x17, #0x20]\n" + "ldr q6, [x8, #0x20]\n" + "ldr q3, [x17, #0x30]\n" + "ldr q7, [x8, #0x30]\n" + "add x17, x17, #0x40\n" + "add x8, x8, #0x40\n" + "b 101f\n" + "100:" // Height 4: per layer parameters + "add x19, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x19]\n" + "mov v1.16b, v0.16b\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x19]\n" + "mov v2.16b, v0.16b\n" + "mov v3.16b, v0.16b\n" + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v7.16b, v4.16b\n" + "101:" // Height 4: parameters loaded + "sqrdmulh v8.4s, v8.4s, v4.4s\n" + "sqrdmulh v9.4s, v9.4s, v5.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sqrdmulh v11.4s, v11.4s, v7.4s\n" + "sqrdmulh v12.4s, v12.4s, v4.4s\n" + "sqrdmulh v13.4s, v13.4s, v5.4s\n" + "sqrdmulh v14.4s, v14.4s, v6.4s\n" + "sqrdmulh v15.4s, v15.4s, v7.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v5.4s\n" + "sqrdmulh v18.4s, v18.4s, v6.4s\n" + "sqrdmulh v19.4s, v19.4s, v7.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v5.4s\n" + "sqrdmulh v22.4s, v22.4s, v6.4s\n" + "sqrdmulh v23.4s, v23.4s, v7.4s\n" + "tbz %x[flags], #5, 102f\n" + "and v4.16b, v8.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v9.16b, v1.16b\n" + "and v6.16b, v10.16b, v2.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v11.16b, v3.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v4.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v4.16b, v12.16b, v0.16b\n" + "sqadd v9.4s, v9.4s, v5.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v10.4s, v10.4s, v6.4s\n" + "and v5.16b, v13.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v11.4s, v11.4s, v7.4s\n" + "and v6.16b, v14.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v12.4s, v12.4s, v4.4s\n" + "and v7.16b, v15.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v13.4s, v13.4s, v5.4s\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v14.4s, v14.4s, v6.4s\n" + "and v5.16b, v17.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v15.4s, v15.4s, v7.4s\n" + "and v6.16b, v18.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v7.16b, v19.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "and v4.16b, v20.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "and v5.16b, v21.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "and v6.16b, v22.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v20.4s, v20.4s, v4.4s\n" + "and v7.16b, v23.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v21.4s, v21.4s, v5.4s\n" + "sqadd v22.4s, v22.4s, v6.4s\n" + "sqadd v23.4s, v23.4s, v7.4s\n" + "102:" // Height 4: no shift correction + "srshl v8.4s, v8.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v9.4s, v9.4s, v1.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v10.4s, 
v10.4s, v2.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x15, #0x10\n" + "srshl v12.4s, v12.4s, v0.4s\n" + "srshl v13.4s, v13.4s, v1.4s\n" + "srshl v14.4s, v14.4s, v2.4s\n" + "srshl v15.4s, v15.4s, v3.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "srshl v17.4s, v17.4s, v1.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "srshl v18.4s, v18.4s, v2.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "srshl v19.4s, v19.4s, v3.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "srshl v21.4s, v21.4s, v1.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "srshl v22.4s, v22.4s, v2.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "srshl v23.4s, v23.4s, v3.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "uzp1 v12.8h, v12.8h, v13.8h\n" + "uzp1 v13.8h, v14.8h, v15.8h\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "bge 111f\n" + "tbz x15, #3, 106f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "tbz x15, #2, 104f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], [x9], #0x4\n" + "st1 { v16.s }[2], [x27], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "tbz x15, #1, 103f\n" + "st1 { v8.h }[6], [x13], #0x2\n" + "st1 { v12.h }[6], [x9], #0x2\n" + "st1 { v16.h }[6], [x27], #0x2\n" + "st1 { v20.h }[6], [x25], #0x2\n" + "tbz x15, #0, 110f\n" + "st1 { v8.b }[14], [x13]\n" + "st1 { v12.b }[14], [x9]\n" + "st1 { v16.b }[14], [x27]\n" + "st1 { v20.b }[14], [x25]\n" + "b 110f\n" + "103:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x15, #0, 110f\n" + "st1 { v8.b }[12], [x13]\n" + "st1 { v12.b }[12], [x9]\n" + "st1 { v16.b }[12], [x27]\n" + "st1 { v20.b }[12], [x25]\n" + "b 110f\n" + "104:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x15, #1, 105f\n" + "st1 { v8.h }[4], [x13], #0x2\n" + "st1 { v12.h }[4], [x9], #0x2\n" + "st1 
{ v16.h }[4], [x27], #0x2\n" + "st1 { v20.h }[4], [x25], #0x2\n" + "tbz x15, #0, 110f\n" + "st1 { v8.b }[10], [x13]\n" + "st1 { v12.b }[10], [x9]\n" + "st1 { v16.b }[10], [x27]\n" + "st1 { v20.b }[10], [x25]\n" + "b 110f\n" + "105:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x15, #0, 110f\n" + "st1 { v8.b }[8], [x13]\n" + "st1 { v12.b }[8], [x9]\n" + "st1 { v16.b }[8], [x27]\n" + "st1 { v20.b }[8], [x25]\n" + "b 110f\n" + "106:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x15, #2, 108f\n" + "str s8, [x13], #0x4\n" + "str s12, [x9], #0x4\n" + "str s16, [x27], #0x4\n" + "str s20, [x25], #0x4\n" + "tbz x15, #1, 107f\n" + "st1 { v8.h }[2], [x13], #0x2\n" + "st1 { v12.h }[2], [x9], #0x2\n" + "st1 { v16.h }[2], [x27], #0x2\n" + "st1 { v20.h }[2], [x25], #0x2\n" + "tbz x15, #0, 110f\n" + "st1 { v8.b }[6], [x13]\n" + "st1 { v12.b }[6], [x9]\n" + "st1 { v16.b }[6], [x27]\n" + "st1 { v20.b }[6], [x25]\n" + "b 110f\n" + "107:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x15, #0, 110f\n" + "st1 { v8.b }[4], [x13]\n" + "st1 { v12.b }[4], [x9]\n" + "st1 { v16.b }[4], [x27]\n" + "st1 { v20.b }[4], [x25]\n" + "b 110f\n" + "108:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x15, #1, 109f\n" + "str h8, [x13], #0x2\n" + "str h12, [x9], #0x2\n" + "str h16, [x27], #0x2\n" + "str h20, [x25], #0x2\n" + "tbz x15, #0, 110f\n" + "st1 { v8.b }[2], [x13]\n" + "st1 { v12.b }[2], [x9]\n" + "st1 { v16.b }[2], [x27]\n" + "st1 { v20.b }[2], [x25]\n" + "b 110f\n" + "109:" // Height 4: Partial direct writeback: partial_1_0 + "str b8, [x13, #0x0]\n" + "str b12, [x9, #0x0]\n" + "str b16, [x27, #0x0]\n" + "str b20, [x25, #0x0]\n" + "110:" // Height 4: Partial direct writeback: Done + "b 112f\n" + "111:" // Height 4: Full writeback + "str q8, [x13, #0x0]\n" + "str q12, [x9, #0x0]\n" + "str q16, [x27, #0x0]\n" + "str q20, [x25, #0x0]\n" + "add x13, x13, #0x10\n" + "add x9, x9, #0x10\n" + "add x27, x27, #0x10\n" + "add x25, x25, #0x10\n" + "112:" // Height 4: Writeback done + "subs x15, x15, #0x10\n" + "bgt 87b\n" + "b 170f\n" + "113:" // Height 5 + "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x16, %x[col_bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 114f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19\n" + "add x25, x25, x19\n" + "add x23, x23, x19\n" + "b 115f\n" + "114:" // Height 5: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19\n" + "add x27, x9, x19\n" + "add x25, x27, x19\n" + "add x23, x25, x19\n" + "115:" // Height 5: Column loop + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "116:" // Height 5: setup done + "mov x12, #0x0\n" + "117:" // Height 5: String loop + "ldr x20, [%x[args_ptr], 
%[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 118f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x12, 119f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "b 119f\n" + "118:" // Height 5: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "119:" // Height 5: input setup done + "cmp x11, #0x10\n" + "blt 122f\n" + "cmp x11, #0x20\n" + "blt 121f\n" + "120:" // Height 5: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sub x11, x11, #0x10\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "cmp x11, #0x20\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, 
v4.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + "bge 120b\n" + "121:" // Height 5: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 
0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 
0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + "122:" // Height 5: Multiply loop: Main loop skip + "cbz x11, 127f\n" + "cmp x11, #0x4\n" + "blt 124f\n" + "123:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "sub x11, x11, #0x4\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "cmp x11, #0x4\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "bge 123b\n" + "cbz x11, 127f\n" + "124:" // Height 5: Multiply loop: Skip odd blocks + "tbz x11, #1, 125f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "ldr h4, [x22], #0x2\n" + "tbz x11, #0, 126f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x26]\n" + "ld1 { v3.b }[2], [x24]\n" + "ld1 { v4.b }[2], [x22]\n" + "b 126f\n" + "125:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x26, #0x0]\n" + "ldr b3, [x24, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "126:" // Height 5: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, 
v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "127:" // Height 5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 117b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "ldr q0, [x16, #0x0]\n" + "add v8.4s, v8.4s, v0.4s\n" + "prfm pstl1keep, [x27, #0x0]\n" + "ldr q1, [x16, #0x10]\n" + "add v12.4s, v12.4s, v0.4s\n" + "prfm pstl1keep, [x25, #0x0]\n" + "ldr q2, [x16, #0x20]\n" + "add v16.4s, v16.4s, v0.4s\n" + "prfm pstl1keep, [x23, #0x0]\n" + "ldr q3, [x16, #0x30]\n" + "add v20.4s, v20.4s, v0.4s\n" + "add x16, x16, #0x40\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v13.4s, v13.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v14.4s, v14.4s, v2.4s\n" + "add v15.4s, v15.4s, v3.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "tbz %x[flags], #4, 128f\n" + "ldr q0, [x17, #0x0]\n" + "ldr q4, [x8, #0x0]\n" + "ldr q1, [x17, #0x10]\n" + "ldr q5, [x8, #0x10]\n" + "ldr q2, [x17, #0x20]\n" + "ldr q6, [x8, #0x20]\n" + "ldr q3, [x17, #0x30]\n" + "ldr q7, [x8, #0x30]\n" + "add x17, x17, #0x40\n" + "add x8, x8, #0x40\n" + "b 129f\n" + "128:" // Height 5: per layer parameters + "add x19, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x19]\n" + "mov v1.16b, v0.16b\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x19]\n" + "mov v2.16b, v0.16b\n" + "mov v3.16b, v0.16b\n" + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v7.16b, v4.16b\n" + "129:" // Height 5: parameters loaded + "sqrdmulh v8.4s, v8.4s, v4.4s\n" + "sqrdmulh v9.4s, v9.4s, v5.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sqrdmulh v11.4s, v11.4s, v7.4s\n" + "sqrdmulh v12.4s, v12.4s, v4.4s\n" + "sqrdmulh v13.4s, v13.4s, v5.4s\n" + "sqrdmulh v14.4s, v14.4s, v6.4s\n" + "sqrdmulh v15.4s, v15.4s, v7.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v5.4s\n" + "sqrdmulh v18.4s, v18.4s, v6.4s\n" + "sqrdmulh v19.4s, v19.4s, v7.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v5.4s\n" + "sqrdmulh v22.4s, v22.4s, v6.4s\n" + "sqrdmulh v23.4s, v23.4s, v7.4s\n" + "sqrdmulh v24.4s, v24.4s, v4.4s\n" + "sqrdmulh v25.4s, v25.4s, 
v5.4s\n" + "sqrdmulh v26.4s, v26.4s, v6.4s\n" + "sqrdmulh v27.4s, v27.4s, v7.4s\n" + "tbz %x[flags], #5, 130f\n" + "and v4.16b, v8.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v9.16b, v1.16b\n" + "and v6.16b, v10.16b, v2.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v11.16b, v3.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v4.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v4.16b, v12.16b, v0.16b\n" + "sqadd v9.4s, v9.4s, v5.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v10.4s, v10.4s, v6.4s\n" + "and v5.16b, v13.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v11.4s, v11.4s, v7.4s\n" + "and v6.16b, v14.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v12.4s, v12.4s, v4.4s\n" + "and v7.16b, v15.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v13.4s, v13.4s, v5.4s\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v14.4s, v14.4s, v6.4s\n" + "and v5.16b, v17.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v15.4s, v15.4s, v7.4s\n" + "and v6.16b, v18.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v7.16b, v19.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "and v4.16b, v20.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "and v5.16b, v21.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "and v6.16b, v22.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v20.4s, v20.4s, v4.4s\n" + "and v7.16b, v23.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v21.4s, v21.4s, v5.4s\n" + "and v4.16b, v24.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v22.4s, v22.4s, v6.4s\n" + "and v5.16b, v25.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v7.4s\n" + "and v6.16b, v26.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v24.4s, v24.4s, v4.4s\n" + "and v7.16b, v27.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v25.4s, v25.4s, v5.4s\n" + "sqadd v26.4s, v26.4s, v6.4s\n" + "sqadd v27.4s, v27.4s, v7.4s\n" + "130:" // Height 5: no shift correction + "srshl v8.4s, v8.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v9.4s, v9.4s, v1.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v10.4s, v10.4s, v2.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x15, #0x10\n" + "srshl v12.4s, v12.4s, v0.4s\n" + "srshl v13.4s, v13.4s, v1.4s\n" + "srshl v14.4s, v14.4s, v2.4s\n" + "srshl v15.4s, v15.4s, v3.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "srshl v17.4s, v17.4s, v1.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "srshl v18.4s, v18.4s, v2.4s\n" + "smin 
v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "srshl v19.4s, v19.4s, v3.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "srshl v21.4s, v21.4s, v1.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "srshl v22.4s, v22.4s, v2.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "srshl v23.4s, v23.4s, v3.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "srshl v25.4s, v25.4s, v1.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "srshl v26.4s, v26.4s, v2.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "srshl v27.4s, v27.4s, v3.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "uzp1 v12.8h, v12.8h, v13.8h\n" + "uzp1 v13.8h, v14.8h, v15.8h\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v24.8h, v24.8h, v25.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 139f\n" + "tbz x15, #3, 134f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "tbz x15, #2, 132f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], [x9], #0x4\n" + "st1 { v16.s }[2], [x27], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "st1 { v24.s }[2], [x23], #0x4\n" + "tbz x15, #1, 131f\n" + "st1 { v8.h }[6], [x13], #0x2\n" + "st1 { v12.h }[6], [x9], #0x2\n" + "st1 { v16.h }[6], [x27], #0x2\n" + "st1 { v20.h }[6], [x25], #0x2\n" + "st1 { v24.h }[6], [x23], #0x2\n" + "tbz x15, #0, 138f\n" + "st1 { v8.b }[14], [x13]\n" + "st1 { v12.b }[14], [x9]\n" + "st1 { v16.b }[14], [x27]\n" + "st1 { v20.b }[14], [x25]\n" + "st1 { v24.b }[14], [x23]\n" + "b 138f\n" + "131:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x15, #0, 138f\n" + "st1 { v8.b }[12], [x13]\n" + "st1 { v12.b }[12], [x9]\n" + "st1 { v16.b }[12], [x27]\n" + "st1 { v20.b }[12], [x25]\n" + "st1 { v24.b }[12], [x23]\n" + "b 138f\n" + "132:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x15, #1, 133f\n" + "st1 { v8.h }[4], [x13], #0x2\n" + "st1 { v12.h }[4], [x9], #0x2\n" + "st1 { v16.h }[4], [x27], #0x2\n" + "st1 { v20.h }[4], [x25], #0x2\n" + "st1 { v24.h }[4], [x23], #0x2\n" + "tbz x15, #0, 138f\n" + "st1 { v8.b }[10], [x13]\n" + "st1 { v12.b }[10], [x9]\n" + "st1 { v16.b }[10], [x27]\n" + "st1 { v20.b }[10], [x25]\n" + "st1 { v24.b }[10], [x23]\n" + "b 138f\n" + "133:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x15, #0, 138f\n" + "st1 { v8.b }[8], [x13]\n" + "st1 { v12.b }[8], [x9]\n" + "st1 { v16.b 
}[8], [x27]\n" + "st1 { v20.b }[8], [x25]\n" + "st1 { v24.b }[8], [x23]\n" + "b 138f\n" + "134:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x15, #2, 136f\n" + "str s8, [x13], #0x4\n" + "str s12, [x9], #0x4\n" + "str s16, [x27], #0x4\n" + "str s20, [x25], #0x4\n" + "str s24, [x23], #0x4\n" + "tbz x15, #1, 135f\n" + "st1 { v8.h }[2], [x13], #0x2\n" + "st1 { v12.h }[2], [x9], #0x2\n" + "st1 { v16.h }[2], [x27], #0x2\n" + "st1 { v20.h }[2], [x25], #0x2\n" + "st1 { v24.h }[2], [x23], #0x2\n" + "tbz x15, #0, 138f\n" + "st1 { v8.b }[6], [x13]\n" + "st1 { v12.b }[6], [x9]\n" + "st1 { v16.b }[6], [x27]\n" + "st1 { v20.b }[6], [x25]\n" + "st1 { v24.b }[6], [x23]\n" + "b 138f\n" + "135:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x15, #0, 138f\n" + "st1 { v8.b }[4], [x13]\n" + "st1 { v12.b }[4], [x9]\n" + "st1 { v16.b }[4], [x27]\n" + "st1 { v20.b }[4], [x25]\n" + "st1 { v24.b }[4], [x23]\n" + "b 138f\n" + "136:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x15, #1, 137f\n" + "str h8, [x13], #0x2\n" + "str h12, [x9], #0x2\n" + "str h16, [x27], #0x2\n" + "str h20, [x25], #0x2\n" + "str h24, [x23], #0x2\n" + "tbz x15, #0, 138f\n" + "st1 { v8.b }[2], [x13]\n" + "st1 { v12.b }[2], [x9]\n" + "st1 { v16.b }[2], [x27]\n" + "st1 { v20.b }[2], [x25]\n" + "st1 { v24.b }[2], [x23]\n" + "b 138f\n" + "137:" // Height 5: Partial direct writeback: partial_1_0 + "str b8, [x13, #0x0]\n" + "str b12, [x9, #0x0]\n" + "str b16, [x27, #0x0]\n" + "str b20, [x25, #0x0]\n" + "str b24, [x23, #0x0]\n" + "138:" // Height 5: Partial direct writeback: Done + "b 140f\n" + "139:" // Height 5: Full writeback + "str q8, [x13, #0x0]\n" + "str q12, [x9, #0x0]\n" + "str q16, [x27, #0x0]\n" + "str q20, [x25, #0x0]\n" + "str q24, [x23, #0x0]\n" + "add x13, x13, #0x10\n" + "add x9, x9, #0x10\n" + "add x27, x27, #0x10\n" + "add x25, x25, #0x10\n" + "add x23, x23, #0x10\n" + "140:" // Height 5: Writeback done + "subs x15, x15, #0x10\n" + "bgt 115b\n" + "b 170f\n" + "141:" // Height 6 + "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x16, %x[col_bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 142f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19\n" + "ldr x21, [%x[output_ptr], #0x28]\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "add x25, x25, x19\n" + "add x23, x23, x19\n" + "add x21, x21, x19\n" + "b 143f\n" + "142:" // Height 6: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19\n" + "add x27, x9, x19\n" + "add x25, x27, x19\n" + "add x23, x25, x19\n" + "add x21, x23, x19\n" + "add %x[output_ptr], x21, x19\n" + "143:" // Height 6: Column loop + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi 
v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "144:" // Height 6: setup done + "mov x12, #0x0\n" + "145:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 146f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x12, 147f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 147f\n" + "146:" // Height 6: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "add x20, x22, x19\n" + "147:" // Height 6: input setup done + "cmp x11, #0x10\n" + "blt 150f\n" + "cmp x11, #0x20\n" + "blt 149f\n" + "148:" // Height 6: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x22, x22, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "sub x11, x11, #0x10\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "cmp x11, #0x20\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n" + 
"ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 
0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n" + "bge 148b\n" + "149:" // Height 6: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x22, x22, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, 
v6.16b, v4.4b[1]\n" + ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n" + "150:" // Height 6: Multiply loop: Main loop skip + "cbz x11, 155f\n" + "cmp x11, #0x4\n" + "blt 152f\n" + "151:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + 
"ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x20], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "sub x11, x11, #0x4\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "cmp x11, #0x4\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + "bge 151b\n" + "cbz x11, 155f\n" + "152:" // Height 6: Multiply loop: Skip odd blocks + "tbz x11, #1, 153f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr h5, [x20], #0x2\n" + "tbz x11, #0, 154f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x26]\n" + "ld1 { v3.b }[2], [x24]\n" + "ld1 { v4.b }[2], [x22]\n" + "ld1 { v5.b }[2], [x20]\n" + "b 154f\n" + "153:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x26, #0x0]\n" + "ldr b3, [x24, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "ldr b5, [x20, #0x0]\n" + "154:" // Height 6: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + ".inst 0x4f80e0eb // 
sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + "155:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 145b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "ldr q0, [x16, #0x0]\n" + "add v8.4s, v8.4s, v0.4s\n" + "prfm pstl1keep, [x27, #0x0]\n" + "ldr q1, [x16, #0x10]\n" + "add v12.4s, v12.4s, v0.4s\n" + "prfm pstl1keep, [x25, #0x0]\n" + "ldr q2, [x16, #0x20]\n" + "add v16.4s, v16.4s, v0.4s\n" + "prfm pstl1keep, [x23, #0x0]\n" + "ldr q3, [x16, #0x30]\n" + "add v20.4s, v20.4s, v0.4s\n" + "prfm pstl1keep, [x21, #0x0]\n" + "add x16, x16, #0x40\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v28.4s, v28.4s, v0.4s\n" + "add v9.4s, v9.4s, v1.4s\n" + "add v10.4s, v10.4s, v2.4s\n" + "add v11.4s, v11.4s, v3.4s\n" + "add v13.4s, v13.4s, v1.4s\n" + "add v14.4s, v14.4s, v2.4s\n" + "add v15.4s, v15.4s, v3.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "add v29.4s, v29.4s, v1.4s\n" + "add v30.4s, v30.4s, v2.4s\n" + "add v31.4s, v31.4s, v3.4s\n" + "tbz %x[flags], #4, 156f\n" + "ldr q0, [x17, #0x0]\n" + "ldr q4, [x8, #0x0]\n" + "ldr q1, [x17, #0x10]\n" + "ldr q5, [x8, #0x10]\n" + "ldr q2, [x17, #0x20]\n" + "ldr q6, [x8, #0x20]\n" + "ldr q3, [x17, #0x30]\n" + "ldr q7, [x8, #0x30]\n" + "add x17, x17, #0x40\n" + "add x8, x8, #0x40\n" + "b 157f\n" + "156:" // Height 6: per layer parameters + "add x19, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x19]\n" + "mov v1.16b, v0.16b\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x19]\n" + "mov v2.16b, v0.16b\n" + "mov v3.16b, v0.16b\n" + "mov v5.16b, v4.16b\n" + "mov v6.16b, v4.16b\n" + "mov v7.16b, v4.16b\n" + "157:" // Height 6: parameters loaded + "sqrdmulh v8.4s, v8.4s, v4.4s\n" + "sqrdmulh v9.4s, v9.4s, v5.4s\n" + "sqrdmulh v10.4s, v10.4s, v6.4s\n" + "sqrdmulh v11.4s, v11.4s, v7.4s\n" + "sqrdmulh v12.4s, v12.4s, v4.4s\n" + "sqrdmulh v13.4s, v13.4s, v5.4s\n" + "sqrdmulh v14.4s, v14.4s, v6.4s\n" + "sqrdmulh v15.4s, v15.4s, v7.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v5.4s\n" + "sqrdmulh v18.4s, v18.4s, v6.4s\n" + "sqrdmulh v19.4s, v19.4s, v7.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v5.4s\n" + "sqrdmulh v22.4s, v22.4s, v6.4s\n" + "sqrdmulh v23.4s, v23.4s, v7.4s\n" + "sqrdmulh v24.4s, v24.4s, v4.4s\n" + "sqrdmulh v25.4s, v25.4s, v5.4s\n" + "sqrdmulh v26.4s, v26.4s, v6.4s\n" + "sqrdmulh v27.4s, v27.4s, v7.4s\n" + "sqrdmulh v28.4s, v28.4s, v4.4s\n" + "sqrdmulh v29.4s, v29.4s, v5.4s\n" + "sqrdmulh v30.4s, v30.4s, v6.4s\n" + "sqrdmulh v31.4s, v31.4s, v7.4s\n" + "tbz %x[flags], #5, 158f\n" + "and v4.16b, v8.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v9.16b, v1.16b\n" + "and v6.16b, v10.16b, v2.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v11.16b, v3.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v8.4s, v8.4s, v4.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v4.16b, v12.16b, v0.16b\n" + "sqadd v9.4s, v9.4s, v5.4s\n" + "sshr v4.4s, v4.4s, 
#0x1f\n" + "sqadd v10.4s, v10.4s, v6.4s\n" + "and v5.16b, v13.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v11.4s, v11.4s, v7.4s\n" + "and v6.16b, v14.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v12.4s, v12.4s, v4.4s\n" + "and v7.16b, v15.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v13.4s, v13.4s, v5.4s\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v14.4s, v14.4s, v6.4s\n" + "and v5.16b, v17.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v15.4s, v15.4s, v7.4s\n" + "and v6.16b, v18.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v7.16b, v19.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "and v4.16b, v20.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "and v5.16b, v21.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "and v6.16b, v22.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v20.4s, v20.4s, v4.4s\n" + "and v7.16b, v23.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v21.4s, v21.4s, v5.4s\n" + "and v4.16b, v24.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v22.4s, v22.4s, v6.4s\n" + "and v5.16b, v25.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v7.4s\n" + "and v6.16b, v26.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v24.4s, v24.4s, v4.4s\n" + "and v7.16b, v27.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v25.4s, v25.4s, v5.4s\n" + "and v4.16b, v28.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v26.4s, v26.4s, v6.4s\n" + "and v5.16b, v29.16b, v1.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v27.4s, v27.4s, v7.4s\n" + "and v6.16b, v30.16b, v2.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v28.4s, v28.4s, v4.4s\n" + "and v7.16b, v31.16b, v3.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v29.4s, v29.4s, v5.4s\n" + "sqadd v30.4s, v30.4s, v6.4s\n" + "sqadd v31.4s, v31.4s, v7.4s\n" + "158:" // Height 6: no shift correction + "srshl v8.4s, v8.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v9.4s, v9.4s, v1.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v10.4s, v10.4s, v2.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + "srshl v11.4s, v11.4s, v3.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x15, #0x10\n" + "srshl v12.4s, v12.4s, v0.4s\n" + "srshl v13.4s, v13.4s, v1.4s\n" + "srshl v14.4s, v14.4s, v2.4s\n" + "srshl v15.4s, v15.4s, v3.4s\n" + "add v8.4s, v8.4s, v4.4s\n" + "add v9.4s, v9.4s, v4.4s\n" + "add v10.4s, v10.4s, v4.4s\n" + "smin v8.4s, v8.4s, v6.4s\n" + "smin v9.4s, v9.4s, v6.4s\n" + "smin v10.4s, v10.4s, v6.4s\n" + "smax v8.4s, v8.4s, v5.4s\n" + "smax v9.4s, v9.4s, v5.4s\n" + "smax v10.4s, v10.4s, v5.4s\n" + "add v11.4s, v11.4s, v4.4s\n" + "add v12.4s, v12.4s, v4.4s\n" + "add v13.4s, v13.4s, v4.4s\n" + "smin v11.4s, v11.4s, v6.4s\n" + "smin v12.4s, v12.4s, v6.4s\n" + "smin v13.4s, v13.4s, v6.4s\n" + "smax v11.4s, v11.4s, v5.4s\n" + "smax v12.4s, v12.4s, v5.4s\n" + "smax v13.4s, v13.4s, v5.4s\n" + "add v14.4s, v14.4s, v4.4s\n" + "add v15.4s, v15.4s, v4.4s\n" + "srshl v16.4s, v16.4s, v0.4s\n" + "smin v14.4s, v14.4s, v6.4s\n" + "smin v15.4s, v15.4s, v6.4s\n" + "srshl v17.4s, v17.4s, v1.4s\n" + "smax v14.4s, v14.4s, v5.4s\n" + "smax v15.4s, v15.4s, v5.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "srshl v18.4s, v18.4s, v2.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "srshl v19.4s, v19.4s, v3.4s\n" + "smax 
v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "srshl v21.4s, v21.4s, v1.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "srshl v22.4s, v22.4s, v2.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "srshl v23.4s, v23.4s, v3.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "srshl v25.4s, v25.4s, v1.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "srshl v26.4s, v26.4s, v2.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "srshl v27.4s, v27.4s, v3.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "srshl v28.4s, v28.4s, v0.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "srshl v29.4s, v29.4s, v1.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "srshl v30.4s, v30.4s, v2.4s\n" + "smin v28.4s, v28.4s, v6.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" + "srshl v31.4s, v31.4s, v3.4s\n" + "smax v28.4s, v28.4s, v5.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" + "add v30.4s, v30.4s, v4.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "uzp1 v8.8h, v8.8h, v9.8h\n" + "smin v30.4s, v30.4s, v6.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "uzp1 v9.8h, v10.8h, v11.8h\n" + "smax v30.4s, v30.4s, v5.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" + "uzp1 v12.8h, v12.8h, v13.8h\n" + "uzp1 v13.8h, v14.8h, v15.8h\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v24.8h, v24.8h, v25.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v28.8h, v28.8h, v29.8h\n" + "uzp1 v29.8h, v30.8h, v31.8h\n" + "uzp1 v8.16b, v8.16b, v9.16b\n" + "uzp1 v12.16b, v12.16b, v13.16b\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v28.16b, v28.16b, v29.16b\n" + "bge 167f\n" + "tbz x15, #3, 162f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "str d28, [x21], #0x8\n" + "tbz x15, #2, 160f\n" + "st1 { v8.s }[2], [x13], #0x4\n" + "st1 { v12.s }[2], [x9], #0x4\n" + "st1 { v16.s }[2], [x27], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "st1 { v24.s }[2], [x23], #0x4\n" + "st1 { v28.s }[2], [x21], #0x4\n" + "tbz x15, #1, 159f\n" + "st1 { v8.h }[6], [x13], #0x2\n" + "st1 { v12.h }[6], [x9], #0x2\n" + "st1 { v16.h }[6], [x27], #0x2\n" + "st1 { v20.h }[6], [x25], #0x2\n" + "st1 { v24.h }[6], [x23], #0x2\n" + "st1 { v28.h }[6], [x21], #0x2\n" + "tbz x15, #0, 166f\n" + "st1 { v8.b }[14], [x13]\n" + "st1 { v12.b }[14], [x9]\n" + "st1 { v16.b }[14], [x27]\n" + "st1 { v20.b }[14], [x25]\n" + "st1 { v24.b }[14], [x23]\n" + "st1 { v28.b }[14], [x21]\n" + "b 166f\n" + "159:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x15, #0, 166f\n" + "st1 { v8.b }[12], [x13]\n" + "st1 { v12.b }[12], [x9]\n" + "st1 { v16.b }[12], [x27]\n" + "st1 { v20.b }[12], 
[x25]\n" + "st1 { v24.b }[12], [x23]\n" + "st1 { v28.b }[12], [x21]\n" + "b 166f\n" + "160:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x15, #1, 161f\n" + "st1 { v8.h }[4], [x13], #0x2\n" + "st1 { v12.h }[4], [x9], #0x2\n" + "st1 { v16.h }[4], [x27], #0x2\n" + "st1 { v20.h }[4], [x25], #0x2\n" + "st1 { v24.h }[4], [x23], #0x2\n" + "st1 { v28.h }[4], [x21], #0x2\n" + "tbz x15, #0, 166f\n" + "st1 { v8.b }[10], [x13]\n" + "st1 { v12.b }[10], [x9]\n" + "st1 { v16.b }[10], [x27]\n" + "st1 { v20.b }[10], [x25]\n" + "st1 { v24.b }[10], [x23]\n" + "st1 { v28.b }[10], [x21]\n" + "b 166f\n" + "161:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x15, #0, 166f\n" + "st1 { v8.b }[8], [x13]\n" + "st1 { v12.b }[8], [x9]\n" + "st1 { v16.b }[8], [x27]\n" + "st1 { v20.b }[8], [x25]\n" + "st1 { v24.b }[8], [x23]\n" + "st1 { v28.b }[8], [x21]\n" + "b 166f\n" + "162:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x15, #2, 164f\n" + "str s8, [x13], #0x4\n" + "str s12, [x9], #0x4\n" + "str s16, [x27], #0x4\n" + "str s20, [x25], #0x4\n" + "str s24, [x23], #0x4\n" + "str s28, [x21], #0x4\n" + "tbz x15, #1, 163f\n" + "st1 { v8.h }[2], [x13], #0x2\n" + "st1 { v12.h }[2], [x9], #0x2\n" + "st1 { v16.h }[2], [x27], #0x2\n" + "st1 { v20.h }[2], [x25], #0x2\n" + "st1 { v24.h }[2], [x23], #0x2\n" + "st1 { v28.h }[2], [x21], #0x2\n" + "tbz x15, #0, 166f\n" + "st1 { v8.b }[6], [x13]\n" + "st1 { v12.b }[6], [x9]\n" + "st1 { v16.b }[6], [x27]\n" + "st1 { v20.b }[6], [x25]\n" + "st1 { v24.b }[6], [x23]\n" + "st1 { v28.b }[6], [x21]\n" + "b 166f\n" + "163:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x15, #0, 166f\n" + "st1 { v8.b }[4], [x13]\n" + "st1 { v12.b }[4], [x9]\n" + "st1 { v16.b }[4], [x27]\n" + "st1 { v20.b }[4], [x25]\n" + "st1 { v24.b }[4], [x23]\n" + "st1 { v28.b }[4], [x21]\n" + "b 166f\n" + "164:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x15, #1, 165f\n" + "str h8, [x13], #0x2\n" + "str h12, [x9], #0x2\n" + "str h16, [x27], #0x2\n" + "str h20, [x25], #0x2\n" + "str h24, [x23], #0x2\n" + "str h28, [x21], #0x2\n" + "tbz x15, #0, 166f\n" + "st1 { v8.b }[2], [x13]\n" + "st1 { v12.b }[2], [x9]\n" + "st1 { v16.b }[2], [x27]\n" + "st1 { v20.b }[2], [x25]\n" + "st1 { v24.b }[2], [x23]\n" + "st1 { v28.b }[2], [x21]\n" + "b 166f\n" + "165:" // Height 6: Partial direct writeback: partial_1_0 + "str b8, [x13, #0x0]\n" + "str b12, [x9, #0x0]\n" + "str b16, [x27, #0x0]\n" + "str b20, [x25, #0x0]\n" + "str b24, [x23, #0x0]\n" + "str b28, [x21, #0x0]\n" + "166:" // Height 6: Partial direct writeback: Done + "b 168f\n" + "167:" // Height 6: Full writeback + "str q8, [x13, #0x0]\n" + "str q12, [x9, #0x0]\n" + "str q16, [x27, #0x0]\n" + "str q20, [x25, #0x0]\n" + "str q24, [x23, #0x0]\n" + "str q28, [x21, #0x0]\n" + "add x13, x13, #0x10\n" + "add x9, x9, #0x10\n" + "add x27, x27, #0x10\n" + "add x25, x25, #0x10\n" + "add x23, x23, #0x10\n" + "add x21, x21, #0x10\n" + "168:" // Height 6: Writeback done + "subs x15, x15, #0x10\n" + "bgt 143b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 170f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 169f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "169:" // Update direct input + "mov x19, #0x6\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "170:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" 
(col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp deleted file mode 100644 index 4a7cdc59a7..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp +++ /dev/null @@ -1,2434 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include <cstdint>
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool accumulate) {
-    const int K_stride = ((K + 3) / 4) * 4;
-    const long loops_count = ((K + 16) / 32) - 1;
-    K -= loops_count * 32;
-    const long regs_count = (K / 16) - 1;
-    K -= (regs_count + 1) * 16;
-    const long blocks_count = K / 4;
-    const long odds_count = K - (blocks_count * 4);
-
-    int rows_to_compute;
-
-    for (int y=0; y<M; y+=rows_to_compute) {
-        const int8_t * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(int8_t);
-
-        int32_t *c_ptr0 = C + (y * ldc);
-        const unsigned long ldcb = ldc * sizeof(int32_t);
-
-        rows_to_compute = M-y;
-        if (rows_to_compute > 4) {
-            if (rows_to_compute % 4) {
-                rows_to_compute = 4 - 1;
-            } else {
-                rows_to_compute = 4;
-            }
-        }
-
-        for (int x0=0; x0<N; x0+=16ul) {
-            const long width = std::min((long)N-x0, 16l);
-            long loops = loops_count;
-            long regs = regs_count;
-            long blocks = blocks_count;
-            long odds = odds_count;
-            const int8_t *a_ptr0 = a_ptr0_base;
-            const int8_t *b_ptr0 = B + (K_stride * x0);
-
-            switch(rows_to_compute) {
-                case 1:
-                    __asm __volatile (
-                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                    : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "temploadreg0 .req X2\n"
-                        "temploadreg1 .req X3\n"
-                        "temploadreg2 .req X4\n"
-                        "temploadreg3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".inst 0x4fa1e194 // 
sdot v20.4s, v12.16b, v1.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - "ins v5.d[1], temploadreg1\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "ins v15.d[1], temploadreg3\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - "ins v14.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" - "ldr d0, [%[a_ptr0], #-0x10]\n" - "ins 
v12.d[1], temploadreg0\n" - ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" - ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" - "ldr d1, [a_ptr1, #-0x10]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" - "ldr temploadreg1, [a_ptr1, #-0x8]\n" - ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" - "ldr d8, [%[b_ptr0]]\n" - "ins v0.d[1], temploadreg0\n" - ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "ins v1.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "ins v14.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" - "ins v14.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "ins v11.d[1], temploadreg3\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "b.ne 3b\n" - "2:\n" - "ins v14.d[1], temploadreg2\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "ldr d15, [%[b_ptr0], 
#-0x10]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "ins v15.d[1], temploadreg3\n" - "cbz %[regs], 4f\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr d4, [%[a_ptr0]]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr d5, [a_ptr1]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "ins v4.d[1], temploadreg0\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - "ins v5.d[1], temploadreg1\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "add a_ptr1, a_ptr1, #0x10\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - "ins v14.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - 
"ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr d9, 
[%[b_ptr0], #0x10]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr s1, [a_ptr1]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "b.ne 7b\n" - "6:\n" - "cbz %[odds], 8f\n" - "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[0], [a_ptr1], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[1], [a_ptr1], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[2], [%[a_ptr0]]\n" - "ld1 {v1.b}[2], [a_ptr1]\n" - "9:\n" - "ldr 
q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "8:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                    : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-                    );
-                    break;
-                case 3:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "temploadreg0 .req X4\n"
-                        "temploadreg1 .req X5\n"
-                        "temploadreg2 .req X6\n"
-                        "temploadreg3 .req X7\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "movi v26.4s, #0\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "movi v27.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr 
temploadreg2, [%[b_ptr0], #0x68]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "ins v14.d[1], temploadreg2\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "ldr d4, [%[a_ptr0]]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr d5, [a_ptr1]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr d6, [a_ptr2]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "ldr temploadreg2, [a_ptr2, #0x8]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "ins v4.d[1], temploadreg0\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - "ins v5.d[1], temploadreg1\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - "ins v6.d[1], temploadreg2\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "ins v8.d[1], temploadreg0\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - ".inst 0x4fa0e990 // sdot 
v16.4s, v12.16b, v0.4b[3]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" - "ldr d0, [%[a_ptr0], #-0x10]\n" - ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" - ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" - "ldr d1, [a_ptr1, #-0x10]\n" - ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" - "ldr temploadreg1, [a_ptr1, #-0x8]\n" - ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" - "ins v0.d[1], temploadreg0\n" - ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" - "ins v1.d[1], temploadreg1\n" - ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "ins v15.d[1], temploadreg3\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "ldr d2, [a_ptr2, #-0x10]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" - "ldr temploadreg2, [a_ptr2, #-0x8]\n" - ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, 
v5.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "ins v8.d[1], temploadreg0\n" - "ins v2.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "ins v11.d[1], temploadreg3\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "ins v14.d[1], temploadreg2\n" - "b.ne 3b\n" - "2:\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "ins v15.d[1], temploadreg3\n" - "cbz %[regs], 4f\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr d4, [%[a_ptr0]]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "ldr d5, [a_ptr1]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - ".inst 
0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr d6, [a_ptr2]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - "ldr temploadreg2, [a_ptr2, #0x8]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "ins v4.d[1], temploadreg0\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "ins v5.d[1], temploadreg1\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "ins v6.d[1], temploadreg2\n" - ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x4fa0e9d2 // 
sdot v18.4s, v14.16b, v0.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4f86e95a // sdot v26.4s, v10.16b, 
v6.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" - "add a_ptr1, a_ptr1, #0x10\n" - ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" - ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n" - ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n" - ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" - ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4f80e931 // 
sdot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr s1, [a_ptr1]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr s2, [a_ptr2]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "b.ne 7b\n" - "6:\n" - "cbz %[odds], 8f\n" - "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[0], [a_ptr1], #1\n" - "ld1 {v2.b}[0], [a_ptr2], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[1], [a_ptr1], #1\n" - "ld1 {v2.b}[1], [a_ptr2], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[2], [%[a_ptr0]]\n" - "ld1 {v1.b}[2], [a_ptr1]\n" - "ld1 {v2.b}[2], [a_ptr2]\n" - "9:\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "8:\n" - "str q16, 
[%[c_ptr0]]\n" - "str q17, [%[c_ptr0], #0x10]\n" - "str q18, [%[c_ptr0], #0x20]\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - "str q24, [c_ptr2]\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq temploadreg0\n" - ".unreq temploadreg1\n" - ".unreq temploadreg2\n" - ".unreq temploadreg3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "temploadreg0 .req X6\n" - "temploadreg1 .req X7\n" - "temploadreg2 .req X8\n" - "temploadreg3 .req X9\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "movi v16.4s, #0\n" - "ldr q0, [%[a_ptr0]]\n" - "movi v17.4s, #0\n" - "ldr q1, [a_ptr1]\n" - "movi v18.4s, #0\n" - "ldr q2, [a_ptr2]\n" - "movi v19.4s, #0\n" - "ldr q3, [a_ptr3]\n" - "movi v20.4s, #0\n" - "ldr q8, [%[b_ptr0]]\n" - "movi v21.4s, #0\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "movi v22.4s, #0\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "movi v23.4s, #0\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "movi v24.4s, #0\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "movi v25.4s, #0\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "movi v26.4s, #0\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "movi v27.4s, #0\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "movi v28.4s, #0\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "movi v29.4s, #0\n" - "ins v14.d[1], temploadreg2\n" - "movi v30.4s, #0\n" - "add a_ptr1, a_ptr1, #0x10\n" - "movi v31.4s, #0\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add a_ptr3, a_ptr3, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q28, [c_ptr3]\n" - "ldr q29, [c_ptr3, #0x10]\n" - "ldr q30, [c_ptr3, #0x20]\n" - "ldr q31, [c_ptr3, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q3, [a_ptr3]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ldr 
temploadreg2, [%[b_ptr0], #0x68]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "ins v14.d[1], temploadreg2\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "ldr d4, [%[a_ptr0]]\n" - ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr d5, [a_ptr1]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - "ldr d6, [a_ptr2]\n" - ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" - "ldr temploadreg2, [a_ptr2, #0x8]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr d7, [a_ptr3]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - "ldr temploadreg3, [a_ptr3, #0x8]\n" - ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "ins v4.d[1], temploadreg0\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n" - "ins v5.d[1], temploadreg1\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" - "ins v6.d[1], temploadreg2\n" - ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - "ins v7.d[1], temploadreg3\n" - ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4f81e935 // 
sdot v21.4s, v9.16b, v1.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n" - "ldr d0, [%[a_ptr0], #-0x10]\n" - ".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" - ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n" - "ins v0.d[1], temploadreg0\n" - ".inst 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n" - "ldr d1, [a_ptr1, #-0x10]\n" - ".inst 0x4f87e15e // sdot v30.4s, 
v10.16b, v7.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #-0x8]\n" - ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n" - "ins v1.d[1], temploadreg1\n" - ".inst 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n" - "ldr d2, [a_ptr2, #-0x10]\n" - ".inst 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n" - "ldr temploadreg2, [a_ptr2, #-0x8]\n" - ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n" - "ins v2.d[1], temploadreg2\n" - ".inst 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n" - "ldr d3, [a_ptr3, #-0x10]\n" - ".inst 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n" - "ldr temploadreg3, [a_ptr3, #-0x8]\n" - ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n" - "ins v3.d[1], temploadreg3\n" - ".inst 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, 
v5.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" - "ins v14.d[1], temploadreg2\n" - "b.ne 3b\n" - "2:\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "prfm PSTL1KEEP, [c_ptr3]\n" - "ins v15.d[1], temploadreg3\n" - "cbz %[regs], 4f\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr d4, [%[a_ptr0]]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "ldr d5, [a_ptr1]\n" - ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr d6, [a_ptr2]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr temploadreg2, [a_ptr2, #0x8]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - "ldr d7, [a_ptr3]\n" - ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" - "ldr temploadreg3, [a_ptr3, #0x8]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "ins v4.d[1], temploadreg0\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "ins v5.d[1], temploadreg1\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n" - "ins v6.d[1], temploadreg2\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" - "ins v7.d[1], temploadreg3\n" - ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], 
#0x38]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - "add 
%[a_ptr0], %[a_ptr0], #0x10\n" - ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - "add a_ptr1, a_ptr1, #0x10\n" - ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" - "add a_ptr2, a_ptr2, #0x10\n" - ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" - "add a_ptr3, a_ptr3, #0x10\n" - ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" - ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" - ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" - ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" - ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" - ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" - ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n" - ".inst 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n" - ".inst 
0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" - ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n" - ".inst 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n" - ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" - ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n" - ".inst 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n" - ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" - ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" - ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n" - ".inst 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n" - ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" - ".inst 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n" - ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n" - ".inst 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n" - ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n" - ".inst 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n" - ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" - ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n" - ".inst 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" - "ins 
v12.d[1], temploadreg0\n" - ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" - ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" - ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" - ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" - ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" - ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" - ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" - ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" - ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr s1, [a_ptr1]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr s2, [a_ptr2]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr s3, [a_ptr3]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "add a_ptr3, a_ptr3, #0x4\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" - ".inst 0x4f83e13d // sdot v29.4s, 
v9.16b, v3.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n" - "b.ne 7b\n" - "6:\n" - "cbz %[odds], 8f\n" - "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[0], [a_ptr1], #1\n" - "ld1 {v2.b}[0], [a_ptr2], #1\n" - "ld1 {v3.b}[0], [a_ptr3], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[1], [a_ptr1], #1\n" - "ld1 {v2.b}[1], [a_ptr2], #1\n" - "ld1 {v3.b}[1], [a_ptr3], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[2], [%[a_ptr0]]\n" - "ld1 {v1.b}[2], [a_ptr1]\n" - "ld1 {v2.b}[2], [a_ptr2]\n" - "ld1 {v3.b}[2], [a_ptr3]\n" - "9:\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n" - "8:\n" - "str q16, [%[c_ptr0]]\n" - "str q17, [%[c_ptr0], #0x10]\n" - "str q18, [%[c_ptr0], #0x20]\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - "str q24, [c_ptr2]\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - "str q28, [c_ptr3]\n" - "str q29, [c_ptr3, #0x10]\n" - "str q30, [c_ptr3, #0x20]\n" - "str q31, [c_ptr3, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq temploadreg0\n" - ".unreq temploadreg1\n" - ".unreq temploadreg2\n" - ".unreq temploadreg3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory" - ); - break; - } - if (use_result_buffer) { - for(int cy=0; cy - -#include "arm_gemm.hpp" -#include <cstdint> -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_hybrid_s8s32_dot_16x4(const int8_t *A, int lda,
const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool accumulate) { - const int K_stride = ((K + 3) / 4) * 4; - const long loops_count = ((K + 16) / 32) - 1; - K -= loops_count * 32; - const long regs_count = (K / 16) - 1; - K -= (regs_count + 1) * 16; - const long blocks_count = K / 4; - const long odds_count = K - (blocks_count * 4); - - int rows_to_compute; - - for (int y=0; y<M; y+=rows_to_compute) { - rows_to_compute = M-y; - if (rows_to_compute > 4) { - if (rows_to_compute % 4) { - rows_to_compute = 4 - 1; - } else { - rows_to_compute = 4; - } - } - - for (int x0=0; x0 - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "movi v16.4s, #0\n" - "ldr q0, [%[a_ptr0]]\n" - "movi v17.4s, #0\n" - "ldr q1, [a_ptr1]\n" - "movi v18.4s, #0\n" - "ldr q8, [%[b_ptr0]]\n" - "movi v19.4s, #0\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "movi v20.4s, #0\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "movi v21.4s, #0\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "movi v22.4s, #0\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "movi v23.4s, #0\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q4, [%[a_ptr0]]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr q5, [a_ptr1]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 
0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" - ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" - ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" - ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" - ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" - ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" - ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" - "ldr q13, [%[b_ptr0], 
#-0x30]\n" - ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" - "b.ne 3b\n" - "2:\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "cbz %[regs], 4f\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q4, [%[a_ptr0]]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q5, [a_ptr1]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x10\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" - ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" - ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" - ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" - ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa4e190 // sdot v16.4s, 
v12.16b, v4.4b[1]\n" - ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" - ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" - ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" - ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" - ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" - ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" - ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x4fa0e9f3 // sdot 
v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr s1, [a_ptr1]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "b.ne 7b\n" - "6:\n" - "cbz %[odds], 8f\n" - "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[0], [a_ptr1], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[1], [a_ptr1], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[2], [%[a_ptr0]]\n" - "ld1 {v1.b}[2], [a_ptr1]\n" - "9:\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "8:\n" - "str q16, [%[c_ptr0]]\n" - "str q17, [%[c_ptr0], #0x10]\n" - "str q18, [%[c_ptr0], #0x20]\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "movi v16.4s, #0\n" - "ldr q0, [%[a_ptr0]]\n" - "movi v17.4s, #0\n" - "ldr q1, [a_ptr1]\n" - "movi v18.4s, #0\n" - "ldr q2, [a_ptr2]\n" - "movi v19.4s, #0\n" - "ldr q8, [%[b_ptr0]]\n" - "movi v20.4s, #0\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "movi v21.4s, #0\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "movi v22.4s, #0\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "movi v23.4s, #0\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "movi v24.4s, #0\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "movi v25.4s, #0\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "movi v26.4s, #0\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" 
- "movi v27.4s, #0\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q4, [%[a_ptr0]]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "ldr q5, [a_ptr1]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr q6, [a_ptr2]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - 
".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" - ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" - ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" - ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" - ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" - ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" - ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" - ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" - ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" - ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" - ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" - ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" - ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" - ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" - ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" - ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" - ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" - 
".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" - ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" - ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n" - "b.ne 3b\n" - "2:\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "cbz %[regs], 4f\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q4, [%[a_ptr0]]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q5, [a_ptr1]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "ldr q6, [a_ptr2]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x10\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "add a_ptr2, a_ptr2, #0x10\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" - "ldr q10, [%[b_ptr0], 
#-0x60]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" - ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" - ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" - ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" - ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" - ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" - ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" - ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" - ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" - ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" - ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" - ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" - ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" - ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" - ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" - ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n" - ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" - ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n" - ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" - ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n" - ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" - ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" - ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n" - ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" - 
".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n" - ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n" - ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" - ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, 
v2.4b[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr s1, [a_ptr1]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr s2, [a_ptr2]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "b.ne 7b\n" - "6:\n" - "cbz %[odds], 8f\n" - "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[0], [a_ptr1], #1\n" - "ld1 {v2.b}[0], [a_ptr2], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[1], [a_ptr1], #1\n" - "ld1 {v2.b}[1], [a_ptr2], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[2], [%[a_ptr0]]\n" - "ld1 {v1.b}[2], [a_ptr1]\n" - "ld1 {v2.b}[2], [a_ptr2]\n" - "9:\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "8:\n" - "str q16, [%[c_ptr0]]\n" - "str q17, [%[c_ptr0], #0x10]\n" - "str q18, [%[c_ptr0], #0x20]\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - "str q24, [c_ptr2]\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req 
X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "movi v16.4s, #0\n" - "ldr q0, [%[a_ptr0]]\n" - "movi v17.4s, #0\n" - "ldr q1, [a_ptr1]\n" - "movi v18.4s, #0\n" - "ldr q2, [a_ptr2]\n" - "movi v19.4s, #0\n" - "ldr q3, [a_ptr3]\n" - "movi v20.4s, #0\n" - "ldr q8, [%[b_ptr0]]\n" - "movi v21.4s, #0\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "movi v22.4s, #0\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "movi v23.4s, #0\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "movi v24.4s, #0\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "movi v25.4s, #0\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "movi v26.4s, #0\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "movi v27.4s, #0\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "movi v28.4s, #0\n" - "add a_ptr1, a_ptr1, #0x10\n" - "movi v29.4s, #0\n" - "add a_ptr2, a_ptr2, #0x10\n" - "movi v30.4s, #0\n" - "add a_ptr3, a_ptr3, #0x10\n" - "movi v31.4s, #0\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q28, [c_ptr3]\n" - "ldr q29, [c_ptr3, #0x10]\n" - "ldr q30, [c_ptr3, #0x20]\n" - "ldr q31, [c_ptr3, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q3, [a_ptr3]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q4, [%[a_ptr0]]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "ldr q5, [a_ptr1]\n" - ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" - "ldr q6, [a_ptr2]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr q7, [a_ptr3]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - 
".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" - ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" - ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" - ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" - ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" - ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" - ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" - ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" - ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" - ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" - ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" - ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" - ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" - "ldr q3, [a_ptr3, #-0x10]\n" - ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" - ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n" - 
".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" - ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" - ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n" - ".inst 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" - ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" - ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n" - ".inst 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" - ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" - ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n" - ".inst 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" - ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" - ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n" - ".inst 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" - ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" - ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n" - ".inst 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" - ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n" - ".inst 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" - ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n" - ".inst 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" - ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n" - ".inst 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" - ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n" - ".inst 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" - ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n" - ".inst 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" - ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" - ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n" - ".inst 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" - ".inst 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n" - ".inst 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa4e9d2 // sdot v18.4s, 
v14.16b, v4.4b[3]\n" - ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n" - ".inst 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" - ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n" - ".inst 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n" - "b.ne 3b\n" - "2:\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "prfm PSTL1KEEP, [c_ptr3]\n" - "cbz %[regs], 4f\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q4, [%[a_ptr0]]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q5, [a_ptr1]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "ldr q6, [a_ptr2]\n" - ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" - "ldr q7, [a_ptr3]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - "add a_ptr1, a_ptr1, #0x10\n" - ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "add a_ptr2, a_ptr2, #0x10\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - "add a_ptr3, a_ptr3, #0x10\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" - ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" - ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" - ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" - ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" - ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" - ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x4f80e952 // sdot 
v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" - ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" - ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" - ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" - ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" - ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" - ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" - ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n" - ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n" - ".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n" - ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n" - ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n" - ".inst 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n" - ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n" - ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n" - ".inst 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n" - ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n" - ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n" - ".inst 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n" - ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n" - ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n" - ".inst 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n" - ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n" - ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n" - ".inst 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n" - ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n" - ".inst 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n" - ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n" - ".inst 
0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n" - ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n" - ".inst 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n" - ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n" - ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n" - ".inst 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n" - ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n" - ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n" - ".inst 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n" - ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n" - ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n" - ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n" - ".inst 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n" - ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" - ".inst 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n" - ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n" - ".inst 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n" - ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n" - ".inst 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n" - ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n" - ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n" - ".inst 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n" - ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n" - ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n" - ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n" - ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n" - ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x4fa0e1d2 // sdot v18.4s, 
v14.16b, v0.4b[1]\n" - ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n" - ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n" - ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n" - ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n" - ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n" - ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n" - ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n" - ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n" - ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n" - ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n" - ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n" - ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n" - ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n" - ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n" - ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n" - ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n" - ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n" - ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n" - ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" - "ldr s1, [a_ptr1]\n" - ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n" - "ldr s2, [a_ptr2]\n" - ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n" - "ldr s3, [a_ptr3]\n" - ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n" - "add a_ptr3, a_ptr3, #0x4\n" - ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n" - ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n" - ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n" - ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 
0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
-                "b.ne 7b\n"
-                "6:\n"
-                "cbz %[odds], 8f\n"
-                "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
-                "ld1 {v1.b}[0], [a_ptr1], #1\n"
-                "ld1 {v2.b}[0], [a_ptr2], #1\n"
-                "ld1 {v3.b}[0], [a_ptr3], #1\n"
-                "subs %[odds], %[odds], #0x1\n"
-                "b.eq 9f\n"
-                "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
-                "ld1 {v1.b}[1], [a_ptr1], #1\n"
-                "ld1 {v2.b}[1], [a_ptr2], #1\n"
-                "ld1 {v3.b}[1], [a_ptr3], #1\n"
-                "subs %[odds], %[odds], #0x1\n"
-                "b.eq 9f\n"
-                "ld1 {v0.b}[2], [%[a_ptr0]]\n"
-                "ld1 {v1.b}[2], [a_ptr1]\n"
-                "ld1 {v2.b}[2], [a_ptr2]\n"
-                "ld1 {v3.b}[2], [a_ptr3]\n"
-                "9:\n"
-                "ldr q8, [%[b_ptr0]]\n"
-                "ldr q9, [%[b_ptr0], #0x10]\n"
-                "ldr q10, [%[b_ptr0], #0x20]\n"
-                "ldr q11, [%[b_ptr0], #0x30]\n"
-                ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
-                ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
-                ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
-                ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
-                "8:\n"
-                "str q16, [%[c_ptr0]]\n"
-                "str q17, [%[c_ptr0], #0x10]\n"
-                "str q18, [%[c_ptr0], #0x20]\n"
-                "str q19, [%[c_ptr0], #0x30]\n"
-                "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                "str q20, [c_ptr1]\n"
-                "str q21, [c_ptr1, #0x10]\n"
-                "str q22, [c_ptr1, #0x20]\n"
-                "str q23, [c_ptr1, #0x30]\n"
-                "str q24, [c_ptr2]\n"
-                "str q25, [c_ptr2, #0x10]\n"
-                "str q26, [c_ptr2, #0x20]\n"
-                "str q27, [c_ptr2, #0x30]\n"
-                "str q28, [c_ptr3]\n"
-                "str q29, [c_ptr3, #0x10]\n"
-                "str q30, [c_ptr3, #0x20]\n"
-                "str q31, [c_ptr3, #0x30]\n"
-                ".unreq a_ptr1\n"
-                ".unreq a_ptr2\n"
-                ".unreq a_ptr3\n"
-                ".unreq c_ptr1\n"
-                ".unreq c_ptr2\n"
-                ".unreq c_ptr3\n"
-                : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-            );
-            break;
-    }
-    if (use_result_buffer) {
-        for(int cy=0; cy<
[… truncated in source: remainder of the deleted file and the diff/license header of the new file src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a64_hybrid_s8s32_dot_6x16.hpp …]
+#define ARGLIST  \
+    unsigned int, const unsigned int *, \
+    IndirectInputArg<int8_t>, \
+    size_t, size_t, \
+    const int8_t *, \
+    IndirectOutputArg<int32_t>, \
+    const int32_t *, Activation, bool
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void a64_hybrid_s8s32_dot_6x16( ARGLIST );
+
+class cls_a64_hybrid_s8s32_dot_6x16
+{
+public:
+    typedef int8_t operand_type;
+    typedef int32_t result_type;
+
+    typedef void (*kern_type)( ARGLIST );
+
+    /* Kernel blocking parameters */
+    static constexpr unsigned int out_height()
+    {
+        return 6;
+    }
+
+    static unsigned int out_width()
+    {
+        return 16;
+    }
+
+    static constexpr unsigned int k_unroll()
+    {
+        return 4;
+    }
+
+    static constexpr bool supports_accumulate()
+    {
+        return true;
+    }
+
+    StdTransformsFixed<operand_type, result_type, 6, 16, 4> transforms = {};
+
+    // Default to the generic kernel
+    kern_type kernel=a64_hybrid_s8s32_dot_6x16;
+
+    cls_a64_hybrid_s8s32_dot_6x16(const CPUInfo *)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
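The header above fixes the kernel's contract: each call produces, or with accumulation adds into, an out_height() x out_width() = 6x16 tile of int32 results, and k_unroll() is 4 because each SDOT instruction consumes four int8 values per 32-bit accumulator lane. A minimal scalar sketch of that contract, for orientation only and not part of the patch (the function name and the row-major strides are assumptions):

    #include <cstddef>
    #include <cstdint>

    // Scalar reference for the s8 -> s32 GEMM tile this kernel computes:
    // C[m][n] (+)= sum_k A[m][k] * B[k][n], for an M x N tile (M <= 6, N <= 16).
    void s8s32_tile_reference(const int8_t *A, size_t lda,
                              const int8_t *B, size_t ldb,
                              int32_t *C, size_t ldc,
                              size_t M, size_t N, size_t K, bool accumulate)
    {
        for (size_t m = 0; m < M; m++) {
            for (size_t n = 0; n < N; n++) {
                int32_t acc = accumulate ? C[m * ldc + n] : 0; // the "accumulate" path below
                for (size_t k = 0; k < K; k++) {
                    acc += int32_t(A[m * lda + k]) * int32_t(B[k * ldb + n]);
                }
                C[m * ldc + n] = acc;
            }
        }
    }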
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
new file mode 100644
index 0000000000..3257986410
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
@@ -0,0 +1,3335 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_s8s32_dot_6x16 (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+    size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int32_t> output_arg,
+    const int32_t *, Activation, bool accumulate
+)
+{
+    struct KernelArgs {
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const int8_t *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    if (accumulate) {
+        flags |= 0x1;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
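Two notes on the setup code above before the assembly body begins. The flags word it assembles is what the kernel tests with `tbz %x[flags], #n, ...`: bit 0 (0x1) requests accumulation into the existing C values, bit 2 (0x4) marks output_ptr as an array of row pointers rather than a strided block, and bit 3 (0x8) marks input_ptr the same way. A restatement with illustrative constant names (the names are not used by the patch):

    // Illustrative names for the flag bits the assembly tests with tbz.
    enum : unsigned long {
        FLAG_ACCUMULATE      = 0x1, // bit 0: seed accumulators from existing C
        FLAG_INDIRECT_OUTPUT = 0x4, // bit 2: output_ptr is an array of row pointers
        FLAG_INDIRECT_INPUT  = 0x8, // bit 3: input_ptr is an array of row pointers
    };

The dot products themselves are emitted as raw `.inst` words (e.g. `.inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]`) with the mnemonic kept in a trailing comment, so the file assembles even when the assembler is not targeting the dot-product extension. Per 32-bit lane, `sdot vd.4s, vn.16b, vm.4b[idx]` behaves like this reference model (hypothetical helper; vectors treated as plain byte arrays):

    #include <cstdint>

    // Reference semantics of SDOT (by element): lane i of vd accumulates the
    // 4-way signed 8-bit dot product of bytes 4i..4i+3 of vn with the idx-th
    // 4-byte group of vm.
    void sdot_lane_reference(int32_t vd[4], const int8_t vn[16],
                             const int8_t vm[16], int idx)
    {
        for (int lane = 0; lane < 4; lane++) {
            for (int b = 0; b < 4; b++) {
                vd[lane] += int32_t(vn[4 * lane + b]) * int32_t(vm[4 * idx + b]);
            }
        }
    }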
+    __asm__ __volatile__(
+
+      "1:" // Row loop
+      "cmp %x[M], #0x6\n"
+      "bge 176f\n"
+      "cmp %x[M], #0x4\n"
+      "bgt 141f\n"
+      "beq 106f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 71f\n"
+      "beq 36f\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 2f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "b 3f\n"
+      "2:" // Height 1: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "3:" // Height 1: Column loop
+      "tbz %x[flags], #0, 13f\n"
+      "cmp x15, #0x10\n"
+      "bge 12f\n"
+      "tbz x15, #3, 7f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "ld1 { v9.4s }, [x13], #0x10\n"
+      "tbz x15, #2, 5f\n"
+      "ld1 { v10.4s }, [x13], #0x10\n"
+      "tbz x15, #1, 4f\n"
+      "mov x19, #0x38\n"
+      "ldr d11, [x13], #0x8\n"
+      "tbz x15, #0, 11f\n"
+      "ld1 { v11.s }[2], [x13]\n"
+      "b 11f\n"
+      "4:" // Height 1: Partial accumulate: partial_1_12
+      "mov x19, #0x30\n"
+      "tbz x15, #0, 11f\n"
+      "ldr s11, [x13, #0x0]\n"
+      "b 11f\n"
+      "5:" // Height 1: Partial accumulate: partial_2_8
+      "tbz x15, #1, 6f\n"
+      "ldr d10, [x13], #0x8\n"
+      "mov x19, #0x28\n"
+      "tbz x15, #0, 11f\n"
+      "ld1 { v10.s }[2], [x13]\n"
+      "b 11f\n"
+      "6:" // Height 1: Partial accumulate: partial_1_8
+      "mov x19, #0x20\n"
+      "tbz x15, #0, 11f\n"
+      "ldr s10, [x13, #0x0]\n"
+      "b 11f\n"
+      "7:" // Height 1: Partial accumulate: partial_4_0
+      "tbz x15, #2, 9f\n"
+      "ld1 { v8.4s }, [x13], #0x10\n"
+      "tbz x15, #1, 8f\n"
+      "mov x19, #0x18\n"
+      "ldr d9, [x13], #0x8\n"
+      "tbz x15, #0, 11f\n"
+      "ld1 { v9.s }[2], [x13]\n"
+      "b 11f\n"
+      "8:" // Height 1: Partial accumulate: partial_1_4
+      "mov x19, #0x10\n"
+      "tbz x15, #0, 11f\n"
+      "ldr s9, [x13, #0x0]\n"
+      "b 11f\n"
+      "9:" // Height 1: Partial accumulate: partial_2_0
+      "tbz x15, #1, 10f\n"
+      "ldr d8, [x13], #0x8\n"
+      "mov x19, #0x8\n"
+      "tbz x15, #0, 11f\n"
+      "ld1 { v8.s }[2], [x13]\n"
+      "b 11f\n"
+      "10:" // Height 1: Partial accumulate: partial_1_0
+      "mov x19, #0x0\n"
+      "ldr s8, [x13, #0x0]\n"
+      "11:" // Height 1: Partial accumulate: Done
+      "sub x13, x13, x19\n"
+      "b 14f\n"
+      "12:" // Height 1: full accumulate
+      "ldr q8, [x13, #0x0]\n"
+      "ldr q9, [x13, #0x10]\n"
"ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "b 14f\n" + "13:" // Height 1: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "14:" // Height 1: setup done + "mov x12, #0x0\n" + "15:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 16f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "cbnz x12, 17f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "b 17f\n" + "16:" // Height 1: setup direct input + "mov x10, %x[input_ptr]\n" + "17:" // Height 1: input setup done + "cmp x11, #0x10\n" + "blt 20f\n" + "cmp x11, #0x20\n" + "blt 19f\n" + "18:" // Height 1: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + "sub x11, x11, #0x10\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + "cmp x11, #0x20\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "bge 18b\n" + "19:" // Height 1: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb 
// sdot v11.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + "20:" // Height 1: Multiply loop: Main loop skip + "cbz x11, 25f\n" + "cmp x11, #0x4\n" + "blt 22f\n" + "21:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "sub x11, x11, #0x4\n" + "add x14, x14, #0x40\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "cmp x11, #0x4\n" + "bge 21b\n" + "cbz x11, 25f\n" + "22:" // Height 1: Multiply loop: Skip odd blocks + "tbz x11, #1, 23f\n" + "ldr h0, [x10], #0x2\n" + "tbz x11, #0, 24f\n" + "ld1 { v0.b }[2], [x10]\n" + "b 24f\n" + "23:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "24:" // Height 1: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + "25:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 15b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "cmp x15, #0x10\n" + "bge 34f\n" + "tbz x15, #3, 29f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "tbz x15, #2, 27f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "tbz x15, #1, 26f\n" + "str d11, [x13], #0x8\n" + "tbz x15, #0, 33f\n" + "st1 { v11.s }[2], [x13]\n" + "b 33f\n" + "26:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x15, #0, 33f\n" + "str s11, [x13, #0x0]\n" + "b 33f\n" + "27:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x15, #1, 28f\n" + "str d10, [x13], #0x8\n" + "tbz x15, #0, 33f\n" + "st1 { v10.s }[2], [x13]\n" + "b 33f\n" + "28:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x15, #0, 33f\n" + "str s10, [x13, #0x0]\n" + "b 33f\n" + "29:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x15, #2, 31f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "tbz x15, #1, 30f\n" + "str d9, [x13], #0x8\n" + "tbz x15, #0, 33f\n" + "st1 { v9.s }[2], [x13]\n" + "b 33f\n" + "30:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x15, #0, 33f\n" + "str s9, [x13, #0x0]\n" + "b 33f\n" + "31:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x15, #1, 32f\n" + "str d8, [x13], #0x8\n" + "tbz x15, #0, 33f\n" + "st1 { v8.s }[2], [x13]\n" + "b 33f\n" + "32:" // Height 1: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "33:" // Height 1: Partial direct writeback: Done + "b 35f\n" + "34:" // Height 1: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + "35:" // Height 1: Writeback done + "subs x15, x15, #0x10\n" + "bgt 3b\n" + "b 212f\n" + "36:" // Height 2 + "ldr x15, [%x[args_ptr], 
%[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 37f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19, LSL #2\n" + "b 38f\n" + "37:" // Height 2: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "38:" // Height 2: Column loop + "tbz %x[flags], #0, 48f\n" + "cmp x15, #0x10\n" + "bge 47f\n" + "tbz x15, #3, 42f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "tbz x15, #2, 40f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "tbz x15, #1, 39f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "tbz x15, #0, 46f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "b 46f\n" + "39:" // Height 2: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x15, #0, 46f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "b 46f\n" + "40:" // Height 2: Partial accumulate: partial_2_8 + "tbz x15, #1, 41f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "mov x19, #0x28\n" + "tbz x15, #0, 46f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "b 46f\n" + "41:" // Height 2: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x15, #0, 46f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "b 46f\n" + "42:" // Height 2: Partial accumulate: partial_4_0 + "tbz x15, #2, 44f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "tbz x15, #1, 43f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "tbz x15, #0, 46f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "b 46f\n" + "43:" // Height 2: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x15, #0, 46f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "b 46f\n" + "44:" // Height 2: Partial accumulate: partial_2_0 + "tbz x15, #1, 45f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "mov x19, #0x8\n" + "tbz x15, #0, 46f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "b 46f\n" + "45:" // Height 2: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "46:" // Height 2: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "b 49f\n" + "47:" // Height 2: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "b 49f\n" + "48:" // Height 2: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "49:" // Height 2: setup done + "mov x12, #0x0\n" + "50:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 51f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x12, 52f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "b 52f\n" + "51:" // Height 2: setup direct input + "mov x10, 
%x[input_ptr]\n" + "add x28, x10, x19\n" + "52:" // Height 2: input setup done + "cmp x11, #0x10\n" + "blt 55f\n" + "cmp x11, #0x20\n" + "blt 54f\n" + "53:" // Height 2: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "sub x11, x11, #0x10\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + "cmp x11, #0x20\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "bge 53b\n" + "54:" // Height 2: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + "add x10, x10, #0x10\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 
0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + "55:" // Height 2: Multiply loop: Main loop skip + "cbz x11, 60f\n" + "cmp x11, #0x4\n" + "blt 57f\n" + "56:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + "sub x11, x11, #0x4\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "cmp x11, #0x4\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "bge 56b\n" + "cbz x11, 60f\n" + "57:" // Height 2: Multiply loop: Skip odd blocks + "tbz x11, #1, 58f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "tbz x11, #0, 59f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "b 59f\n" + "58:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "59:" // Height 2: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + "60:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], 
%[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 50b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "cmp x15, #0x10\n" + "bge 69f\n" + "tbz x15, #3, 64f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "tbz x15, #2, 62f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "tbz x15, #1, 61f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "tbz x15, #0, 68f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "b 68f\n" + "61:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x15, #0, 68f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "b 68f\n" + "62:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x15, #1, 63f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "tbz x15, #0, 68f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "b 68f\n" + "63:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x15, #0, 68f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "b 68f\n" + "64:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x15, #2, 66f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "tbz x15, #1, 65f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "tbz x15, #0, 68f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "b 68f\n" + "65:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x15, #0, 68f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "b 68f\n" + "66:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x15, #1, 67f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "tbz x15, #0, 68f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "b 68f\n" + "67:" // Height 2: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "68:" // Height 2: Partial direct writeback: Done + "b 70f\n" + "69:" // Height 2: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "70:" // Height 2: Writeback done + "subs x15, x15, #0x10\n" + "bgt 38b\n" + "b 212f\n" + "71:" // Height 3 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 72f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "b 73f\n" + "72:" // Height 3: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "73:" // Height 3: Column loop + "tbz %x[flags], #0, 83f\n" + "cmp x15, #0x10\n" + "bge 82f\n" + "tbz x15, #3, 77f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "tbz x15, #2, 75f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "tbz x15, #1, 74f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "tbz x15, #0, 81f\n" + "ld1 { 
v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "b 81f\n" + "74:" // Height 3: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x15, #0, 81f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "b 81f\n" + "75:" // Height 3: Partial accumulate: partial_2_8 + "tbz x15, #1, 76f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "mov x19, #0x28\n" + "tbz x15, #0, 81f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "b 81f\n" + "76:" // Height 3: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x15, #0, 81f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "b 81f\n" + "77:" // Height 3: Partial accumulate: partial_4_0 + "tbz x15, #2, 79f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "tbz x15, #1, 78f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "tbz x15, #0, 81f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "b 81f\n" + "78:" // Height 3: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x15, #0, 81f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "b 81f\n" + "79:" // Height 3: Partial accumulate: partial_2_0 + "tbz x15, #1, 80f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "mov x19, #0x8\n" + "tbz x15, #0, 81f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "b 81f\n" + "80:" // Height 3: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "81:" // Height 3: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "b 84f\n" + "82:" // Height 3: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "b 84f\n" + "83:" // Height 3: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "84:" // Height 3: setup done + "mov x12, #0x0\n" + "85:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 86f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x12, 87f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "b 87f\n" + "86:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "87:" // Height 3: input setup done + "cmp x11, #0x10\n" + "blt 90f\n" + "cmp x11, #0x20\n" + "blt 89f\n" + "88:" // Height 3: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + 
"ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x28, x28, #0x10\n" + "prfm pldl1keep, [x28, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "sub x11, x11, #0x10\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + "cmp x11, #0x20\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + "bge 88b\n" + "89:" // Height 3: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, 
[x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x28, x28, #0x10\n" + "prfm pldl1keep, [x28, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + "90:" // Height 3: Multiply loop: Main loop skip + "cbz x11, 95f\n" + "cmp x11, #0x4\n" + "blt 92f\n" + "91:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, 
[x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "sub x11, x11, #0x4\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + "cmp x11, #0x4\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "bge 91b\n" + "cbz x11, 95f\n" + "92:" // Height 3: Multiply loop: Skip odd blocks + "tbz x11, #1, 93f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "tbz x11, #0, 94f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x26]\n" + "b 94f\n" + "93:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x26, #0x0]\n" + "94:" // Height 3: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + "95:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 85b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "cmp x15, #0x10\n" + "prfm pstl1keep, [x27, #0x0]\n" + "bge 104f\n" + "tbz x15, #3, 99f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "tbz x15, #2, 97f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "tbz x15, #1, 96f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "tbz x15, #0, 103f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "b 103f\n" + "96:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x15, #0, 103f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "b 103f\n" + "97:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x15, #1, 98f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "tbz x15, #0, 103f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "b 
103f\n" + "98:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x15, #0, 103f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "b 103f\n" + "99:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x15, #2, 101f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "tbz x15, #1, 100f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "tbz x15, #0, 103f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "b 103f\n" + "100:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x15, #0, 103f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "b 103f\n" + "101:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x15, #1, 102f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "tbz x15, #0, 103f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "b 103f\n" + "102:" // Height 3: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "103:" // Height 3: Partial direct writeback: Done + "b 105f\n" + "104:" // Height 3: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "105:" // Height 3: Writeback done + "subs x15, x15, #0x10\n" + "bgt 73b\n" + "b 212f\n" + "106:" // Height 4 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 107f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "b 108f\n" + "107:" // Height 4: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "108:" // Height 4: Column loop + "tbz %x[flags], #0, 118f\n" + "cmp x15, #0x10\n" + "bge 117f\n" + "tbz x15, #3, 112f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "ld1 { v21.4s }, [x25], #0x10\n" + "tbz x15, #2, 110f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "ld1 { v22.4s }, [x25], #0x10\n" + "tbz x15, #1, 109f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "tbz x15, #0, 116f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "ld1 { v23.s }[2], [x25]\n" + "b 116f\n" + "109:" // Height 4: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x15, #0, 116f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "ldr s23, [x25, #0x0]\n" + "b 116f\n" + 
"110:" // Height 4: Partial accumulate: partial_2_8 + "tbz x15, #1, 111f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "mov x19, #0x28\n" + "tbz x15, #0, 116f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x25]\n" + "b 116f\n" + "111:" // Height 4: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x15, #0, 116f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "b 116f\n" + "112:" // Height 4: Partial accumulate: partial_4_0 + "tbz x15, #2, 114f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "tbz x15, #1, 113f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "tbz x15, #0, 116f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "ld1 { v21.s }[2], [x25]\n" + "b 116f\n" + "113:" // Height 4: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x15, #0, 116f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "b 116f\n" + "114:" // Height 4: Partial accumulate: partial_2_0 + "tbz x15, #1, 115f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "mov x19, #0x8\n" + "tbz x15, #0, 116f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "ld1 { v20.s }[2], [x25]\n" + "b 116f\n" + "115:" // Height 4: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "ldr s20, [x25, #0x0]\n" + "116:" // Height 4: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "b 119f\n" + "117:" // Height 4: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "b 119f\n" + "118:" // Height 4: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "119:" // Height 4: setup done + "mov x12, #0x0\n" + "120:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 121f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x12, 122f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + 
"b 122f\n" + "121:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "122:" // Height 4: input setup done + "cmp x11, #0x10\n" + "blt 125f\n" + "cmp x11, #0x20\n" + "blt 124f\n" + "123:" // Height 4: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sub x11, x11, #0x10\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "cmp x11, #0x20\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, 
v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + "bge 123b\n" + "124:" // Height 4: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, 
v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + "125:" // Height 4: Multiply loop: Main loop skip + "cbz x11, 130f\n" + "cmp x11, #0x4\n" + "blt 127f\n" + "126:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "sub x11, x11, #0x4\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "cmp x11, #0x4\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "bge 126b\n" + "cbz x11, 130f\n" + "127:" // Height 4: Multiply loop: Skip odd blocks + 
"tbz x11, #1, 128f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "tbz x11, #0, 129f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x26]\n" + "ld1 { v3.b }[2], [x24]\n" + "b 129f\n" + "128:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x26, #0x0]\n" + "ldr b3, [x24, #0x0]\n" + "129:" // Height 4: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + "130:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 120b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "cmp x15, #0x10\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "bge 139f\n" + "tbz x15, #3, 134f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "tbz x15, #2, 132f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "st1 { v22.4s }, [x25], #0x10\n" + "tbz x15, #1, 131f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "tbz x15, #0, 138f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "st1 { v23.s }[2], [x25]\n" + "b 138f\n" + "131:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x15, #0, 138f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "str s23, [x25, #0x0]\n" + "b 138f\n" + "132:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x15, #1, 133f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "tbz x15, #0, 138f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "st1 { v22.s }[2], [x25]\n" + "b 138f\n" + "133:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x15, #0, 138f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "str s22, [x25, #0x0]\n" + "b 138f\n" + "134:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x15, #2, 136f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { 
v16.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "tbz x15, #1, 135f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "tbz x15, #0, 138f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "st1 { v21.s }[2], [x25]\n" + "b 138f\n" + "135:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x15, #0, 138f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "str s21, [x25, #0x0]\n" + "b 138f\n" + "136:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x15, #1, 137f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "tbz x15, #0, 138f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "st1 { v20.s }[2], [x25]\n" + "b 138f\n" + "137:" // Height 4: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "str s20, [x25, #0x0]\n" + "138:" // Height 4: Partial direct writeback: Done + "b 140f\n" + "139:" // Height 4: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "140:" // Height 4: Writeback done + "subs x15, x15, #0x10\n" + "bgt 108b\n" + "b 212f\n" + "141:" // Height 5 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 142f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 143f\n" + "142:" // Height 5: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "143:" // Height 5: Column loop + "tbz %x[flags], #0, 153f\n" + "cmp x15, #0x10\n" + "bge 152f\n" + "tbz x15, #3, 147f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "ld1 { v21.4s }, [x25], #0x10\n" + "ld1 { v25.4s }, [x23], #0x10\n" + "tbz x15, #2, 145f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "ld1 { v22.4s }, [x25], #0x10\n" + "ld1 { v26.4s }, [x23], #0x10\n" + "tbz x15, #1, 144f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "ldr d27, [x23], #0x8\n" + "tbz x15, #0, 151f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "ld1 { v23.s }[2], [x25]\n" + "ld1 
{ v27.s }[2], [x23]\n" + "b 151f\n" + "144:" // Height 5: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x15, #0, 151f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "ldr s23, [x25, #0x0]\n" + "ldr s27, [x23, #0x0]\n" + "b 151f\n" + "145:" // Height 5: Partial accumulate: partial_2_8 + "tbz x15, #1, 146f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "ldr d26, [x23], #0x8\n" + "mov x19, #0x28\n" + "tbz x15, #0, 151f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x25]\n" + "ld1 { v26.s }[2], [x23]\n" + "b 151f\n" + "146:" // Height 5: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x15, #0, 151f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "ldr s26, [x23, #0x0]\n" + "b 151f\n" + "147:" // Height 5: Partial accumulate: partial_4_0 + "tbz x15, #2, 149f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "tbz x15, #1, 148f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "ldr d25, [x23], #0x8\n" + "tbz x15, #0, 151f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "ld1 { v21.s }[2], [x25]\n" + "ld1 { v25.s }[2], [x23]\n" + "b 151f\n" + "148:" // Height 5: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x15, #0, 151f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "ldr s25, [x23, #0x0]\n" + "b 151f\n" + "149:" // Height 5: Partial accumulate: partial_2_0 + "tbz x15, #1, 150f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "ldr d24, [x23], #0x8\n" + "mov x19, #0x8\n" + "tbz x15, #0, 151f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "ld1 { v20.s }[2], [x25]\n" + "ld1 { v24.s }[2], [x23]\n" + "b 151f\n" + "150:" // Height 5: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "ldr s20, [x25, #0x0]\n" + "ldr s24, [x23, #0x0]\n" + "151:" // Height 5: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "sub x23, x23, x19\n" + "b 154f\n" + "152:" // Height 5: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "ldr q24, [x23, #0x0]\n" + "ldr q25, [x23, #0x10]\n" + "ldr q26, [x23, #0x20]\n" + "ldr q27, [x23, #0x30]\n" + "b 154f\n" + "153:" // Height 5: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" 
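
For reference while reading this generated kernel: each "Height N" section keeps N rows of sixteen int32 accumulators live in registers (v8 upward), walks A and B sixteen int8 values at a time, and does all the multiply-accumulate work with the Arm dot-product instruction, emitted as raw ".inst" words with the intended "sdot" spelled out in the trailing comment, presumably so that assemblers without the dotprod extension still accept the file. Below is a minimal scalar sketch of what one such accumulate, e.g. "sdot v8.4s, v6.16b, v0.4b[0]", computes; the names (sdot_lane_ref, acc, a, b, lane) are illustrative only and are not part of the patch:

    #include <cstdint>

    // Reference semantics of: sdot vd.4s, vn.16b, vm.4b[lane]
    //   vd: four int32 accumulators          (acc)
    //   vn: 16 int8 values of packed B       (b), four output columns x 4 deep
    //   vm: 16 int8 values of an A row       (a); 'lane' picks one 4-byte group
    static void sdot_lane_ref(int32_t acc[4], const int8_t b[16],
                              const int8_t a[16], int lane)
    {
        for (int col = 0; col < 4; ++col) {   // the four .4s lanes of vd
            int32_t sum = 0;
            for (int k = 0; k < 4; ++k) {     // 4-element int8 dot product
                sum += int32_t(b[4 * col + k]) * int32_t(a[4 * lane + k]);
            }
            acc[col] += sum;                  // sdot accumulates in place
        }
    }

The "tbz x15, #N, ..." ladders surrounding each accumulate/writeback phase are the ragged-edge handling: x15 holds the remaining output width, and testing its bits 3, 2, 1 and 0 selects 8-, 4-, 2- and 1-element partial loads and stores so the kernel never touches memory past the end of a row.
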
+ "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "154:" // Height 5: setup done + "mov x12, #0x0\n" + "155:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 156f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x12, 157f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "b 157f\n" + "156:" // Height 5: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "157:" // Height 5: input setup done + "cmp x11, #0x10\n" + "blt 160f\n" + "cmp x11, #0x20\n" + "blt 159f\n" + "158:" // Height 5: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sub x11, x11, #0x10\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "cmp x11, #0x20\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + "ldr q7, 
[x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + "bge 158b\n" + "159:" // Height 5: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, 
v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, 
v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + "160:" // Height 5: Multiply loop: Main loop skip + "cbz x11, 165f\n" + "cmp x11, #0x4\n" + "blt 162f\n" + "161:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "sub x11, x11, #0x4\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "cmp x11, #0x4\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "bge 161b\n" + "cbz x11, 165f\n" + "162:" // Height 5: Multiply loop: Skip odd blocks + "tbz x11, #1, 163f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "ldr h4, [x22], #0x2\n" + "tbz x11, #0, 164f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x26]\n" + "ld1 { v3.b }[2], [x24]\n" + "ld1 { v4.b }[2], [x22]\n" + "b 164f\n" + "163:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr 
b2, [x26, #0x0]\n" + "ldr b3, [x24, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "164:" // Height 5: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + "165:" // Height 5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 155b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "cmp x15, #0x10\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "bge 174f\n" + "tbz x15, #3, 169f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v25.4s }, [x23], #0x10\n" + "tbz x15, #2, 167f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "st1 { v22.4s }, [x25], #0x10\n" + "st1 { v26.4s }, [x23], #0x10\n" + "tbz x15, #1, 166f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "str d27, [x23], #0x8\n" + "tbz x15, #0, 173f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "st1 { v23.s }[2], [x25]\n" + "st1 { v27.s }[2], [x23]\n" + "b 173f\n" + "166:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x15, #0, 173f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "str s23, [x25, #0x0]\n" + "str s27, [x23, #0x0]\n" + "b 173f\n" + "167:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x15, #1, 168f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "str d26, [x23], #0x8\n" + "tbz x15, #0, 173f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "st1 { v22.s }[2], [x25]\n" + "st1 { v26.s }[2], [x23]\n" + "b 173f\n" + "168:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x15, #0, 173f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "str s22, [x25, #0x0]\n" + "str s26, [x23, #0x0]\n" + "b 173f\n" + "169:" // 
Height 5: Partial direct writeback: partial_4_0 + "tbz x15, #2, 171f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "tbz x15, #1, 170f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "str d25, [x23], #0x8\n" + "tbz x15, #0, 173f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "st1 { v21.s }[2], [x25]\n" + "st1 { v25.s }[2], [x23]\n" + "b 173f\n" + "170:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x15, #0, 173f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "str s21, [x25, #0x0]\n" + "str s25, [x23, #0x0]\n" + "b 173f\n" + "171:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x15, #1, 172f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "tbz x15, #0, 173f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "st1 { v20.s }[2], [x25]\n" + "st1 { v24.s }[2], [x23]\n" + "b 173f\n" + "172:" // Height 5: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "str s20, [x25, #0x0]\n" + "str s24, [x23, #0x0]\n" + "173:" // Height 5: Partial direct writeback: Done + "b 175f\n" + "174:" // Height 5: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "str q24, [x23, #0x0]\n" + "str q25, [x23, #0x10]\n" + "str q26, [x23, #0x20]\n" + "str q27, [x23, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "add x23, x23, #0x40\n" + "175:" // Height 5: Writeback done + "subs x15, x15, #0x10\n" + "bgt 143b\n" + "b 212f\n" + "176:" // Height 6 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 177f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "ldr x21, [%x[output_ptr], #0x28]\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 178f\n" + "177:" // Height 6: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "add x21, x23, x19, LSL #2\n" + "add %x[output_ptr], x21, x19, LSL #2\n" + "178:" // Height 6: Column loop + "tbz %x[flags], #0, 188f\n" + "cmp x15, #0x10\n" + "bge 187f\n" + "tbz x15, #3, 182f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "ld1 { v28.4s 
}, [x21], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "ld1 { v21.4s }, [x25], #0x10\n" + "ld1 { v25.4s }, [x23], #0x10\n" + "ld1 { v29.4s }, [x21], #0x10\n" + "tbz x15, #2, 180f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "ld1 { v22.4s }, [x25], #0x10\n" + "ld1 { v26.4s }, [x23], #0x10\n" + "ld1 { v30.4s }, [x21], #0x10\n" + "tbz x15, #1, 179f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "ldr d27, [x23], #0x8\n" + "ldr d31, [x21], #0x8\n" + "tbz x15, #0, 186f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "ld1 { v23.s }[2], [x25]\n" + "ld1 { v27.s }[2], [x23]\n" + "ld1 { v31.s }[2], [x21]\n" + "b 186f\n" + "179:" // Height 6: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x15, #0, 186f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "ldr s23, [x25, #0x0]\n" + "ldr s27, [x23, #0x0]\n" + "ldr s31, [x21, #0x0]\n" + "b 186f\n" + "180:" // Height 6: Partial accumulate: partial_2_8 + "tbz x15, #1, 181f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "ldr d26, [x23], #0x8\n" + "ldr d30, [x21], #0x8\n" + "mov x19, #0x28\n" + "tbz x15, #0, 186f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x25]\n" + "ld1 { v26.s }[2], [x23]\n" + "ld1 { v30.s }[2], [x21]\n" + "b 186f\n" + "181:" // Height 6: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x15, #0, 186f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "ldr s26, [x23, #0x0]\n" + "ldr s30, [x21, #0x0]\n" + "b 186f\n" + "182:" // Height 6: Partial accumulate: partial_4_0 + "tbz x15, #2, 184f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "ld1 { v28.4s }, [x21], #0x10\n" + "tbz x15, #1, 183f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "ldr d25, [x23], #0x8\n" + "ldr d29, [x21], #0x8\n" + "tbz x15, #0, 186f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "ld1 { v21.s }[2], [x25]\n" + "ld1 { v25.s }[2], [x23]\n" + "ld1 { v29.s }[2], [x21]\n" + "b 186f\n" + "183:" // Height 6: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x15, #0, 186f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "ldr s25, [x23, #0x0]\n" + "ldr s29, [x21, #0x0]\n" + "b 186f\n" + "184:" // Height 6: Partial accumulate: partial_2_0 + "tbz x15, #1, 185f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d28, [x21], #0x8\n" + "mov x19, #0x8\n" + "tbz x15, #0, 186f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "ld1 { v20.s }[2], [x25]\n" + "ld1 { v24.s }[2], [x23]\n" + "ld1 { v28.s }[2], [x21]\n" + "b 186f\n" + "185:" // Height 6: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "ldr s20, [x25, #0x0]\n" + "ldr s24, [x23, 
#0x0]\n" + "ldr s28, [x21, #0x0]\n" + "186:" // Height 6: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "sub x23, x23, x19\n" + "sub x21, x21, x19\n" + "b 189f\n" + "187:" // Height 6: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "ldr q24, [x23, #0x0]\n" + "ldr q25, [x23, #0x10]\n" + "ldr q26, [x23, #0x20]\n" + "ldr q27, [x23, #0x30]\n" + "ldr q28, [x21, #0x0]\n" + "ldr q29, [x21, #0x10]\n" + "ldr q30, [x21, #0x20]\n" + "ldr q31, [x21, #0x30]\n" + "b 189f\n" + "188:" // Height 6: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "189:" // Height 6: setup done + "mov x12, #0x0\n" + "190:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 191f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x12, 192f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 192f\n" + "191:" // Height 6: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "add x20, x22, x19\n" + "192:" // Height 6: input setup done + "cmp x11, #0x10\n" + "blt 195f\n" + "cmp x11, #0x20\n" + "blt 194f\n" + "193:" // Height 6: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x22, x22, 
#0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "sub x11, x11, #0x10\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + "cmp x11, #0x20\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot 
v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n" + "bge 193b\n" + "194:" // Height 6: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + "add x22, x22, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot 
v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n" + ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n" + ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n" + ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n" + ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 
0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n" + "add x14, x14, #0x100\n" + ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n" + ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n" + "195:" // Height 6: Multiply loop: Main loop skip + "cbz x11, 200f\n" + "cmp x11, #0x4\n" + "blt 197f\n" + "196:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x20], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + "sub x11, x11, #0x4\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + "cmp x11, #0x4\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + "bge 196b\n" + "cbz x11, 200f\n" + "197:" // Height 6: Multiply loop: Skip odd blocks + "tbz x11, #1, 198f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr h5, [x20], #0x2\n" + "tbz x11, #0, 199f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x26]\n" + "ld1 { v3.b }[2], [x24]\n" + "ld1 { 
v4.b }[2], [x22]\n" + "ld1 { v5.b }[2], [x20]\n" + "b 199f\n" + "198:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x26, #0x0]\n" + "ldr b3, [x24, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "ldr b5, [x20, #0x0]\n" + "199:" // Height 6: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n" + ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n" + "200:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 190b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "cmp x15, #0x10\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "bge 209f\n" + "tbz x15, #3, 204f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v25.4s }, [x23], #0x10\n" + "st1 { v28.4s }, [x21], #0x10\n" + "st1 { v29.4s }, [x21], #0x10\n" + "tbz x15, #2, 202f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "st1 { v22.4s }, [x25], #0x10\n" + "st1 { v26.4s }, [x23], #0x10\n" + "st1 { v30.4s }, [x21], #0x10\n" + "tbz x15, #1, 201f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "str d27, [x23], #0x8\n" + "str d31, [x21], #0x8\n" + "tbz x15, #0, 208f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "st1 { v23.s }[2], [x25]\n" + "st1 { v27.s }[2], [x23]\n" + "st1 { v31.s }[2], [x21]\n" + "b 208f\n" + "201:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x15, #0, 208f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "str s23, [x25, #0x0]\n" + "str s27, [x23, #0x0]\n" + "str s31, [x21, #0x0]\n" 
+ "b 208f\n" + "202:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x15, #1, 203f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "str d26, [x23], #0x8\n" + "str d30, [x21], #0x8\n" + "tbz x15, #0, 208f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "st1 { v22.s }[2], [x25]\n" + "st1 { v26.s }[2], [x23]\n" + "st1 { v30.s }[2], [x21]\n" + "b 208f\n" + "203:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x15, #0, 208f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "str s22, [x25, #0x0]\n" + "str s26, [x23, #0x0]\n" + "str s30, [x21, #0x0]\n" + "b 208f\n" + "204:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x15, #2, 206f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v28.4s }, [x21], #0x10\n" + "tbz x15, #1, 205f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "str d25, [x23], #0x8\n" + "str d29, [x21], #0x8\n" + "tbz x15, #0, 208f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "st1 { v21.s }[2], [x25]\n" + "st1 { v25.s }[2], [x23]\n" + "st1 { v29.s }[2], [x21]\n" + "b 208f\n" + "205:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x15, #0, 208f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "str s21, [x25, #0x0]\n" + "str s25, [x23, #0x0]\n" + "str s29, [x21, #0x0]\n" + "b 208f\n" + "206:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x15, #1, 207f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "str d28, [x21], #0x8\n" + "tbz x15, #0, 208f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "st1 { v20.s }[2], [x25]\n" + "st1 { v24.s }[2], [x23]\n" + "st1 { v28.s }[2], [x21]\n" + "b 208f\n" + "207:" // Height 6: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "str s20, [x25, #0x0]\n" + "str s24, [x23, #0x0]\n" + "str s28, [x21, #0x0]\n" + "208:" // Height 6: Partial direct writeback: Done + "b 210f\n" + "209:" // Height 6: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "str q24, [x23, #0x0]\n" + "str q25, [x23, #0x10]\n" + "str q26, [x23, #0x20]\n" + "str q27, [x23, #0x30]\n" + "str q28, [x21, #0x0]\n" + "str q29, [x21, #0x10]\n" + "str q30, [x21, #0x20]\n" + "str q31, [x21, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "add x23, x23, #0x40\n" + "add x21, x21, #0x40\n" + "210:" // Height 6: Writeback done + "subs x15, x15, #0x10\n" + "bgt 178b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 212f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 211f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], 
%[offsetof_input_offset]]\n"
+      "b 1b\n"
+      "211:" // Update direct input
+      "mov x19, #0x6\n"
+      "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+      "b 1b\n"
+      "212:" // Exit
+
+    : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+    : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+    );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
similarity index 60%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
index e5a88b4519..5b4a7f3e86 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,38 +10,43 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
  */
 #pragma once
-
 #ifdef __aarch64__
-#include <cstdint>
 #include "../std_transforms_fixed.hpp"
+#define ARGLIST  \
+    unsigned int, const unsigned int *, \
+    IndirectInputArg<uint8_t>, \
+    size_t, size_t, \
+    const uint8_t *, \
+    IndirectOutputArg<uint8_t>, \
+    const Requantize32 *, const int32_t *, unsigned int
+
 namespace arm_gemm
 {
 // Actual kernel implementations
-void a64_hybrid_u8u32_dot_16x4(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
-void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+void a64_hybrid_u8qa_dot_4x16( ARGLIST );
 
-class hybrid_u8u32_dot_16x4
+class cls_a64_hybrid_u8qa_dot_4x16
 {
 public:
     typedef uint8_t operand_type;
-    typedef uint32_t result_type;
+    typedef uint8_t result_type;
 
-    typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+    typedef void (*kern_type)( ARGLIST );
 
     /* Kernel blocking parameters */
     static constexpr unsigned int out_height()
@@ -60,16 +65,6 @@ class hybrid_u8u32_dot_16x4
     }
 
     static constexpr bool supports_accumulate()
-    {
-        return true;
-    }
-
-    static constexpr bool supports_bias()
-    {
-        return false;
-    }
-
-    static constexpr bool supports_activation()
     {
         return false;
     }
@@ -77,16 +72,14 @@ class hybrid_u8u32_dot_16x4
     StdTransformsFixed<operand_type, result_type, 4, 16, 4> transforms = {};
 
     // Default to the generic kernel
-    kern_type kernel=a64_hybrid_u8u32_dot_16x4;
+    kern_type kernel=a64_hybrid_u8qa_dot_4x16;
 
-    hybrid_u8u32_dot_16x4(const CPUInfo *ci)
+    cls_a64_hybrid_u8qa_dot_4x16(const CPUInfo *)
     {
-        if (ci->get_cpu_model() == CPUModel::A55r1) {
-            kernel = a64_hybrid_u8u32_dot_16x4_a55;
-        }
     }
 };
 
 } // namespace arm_gemm
+#undef ARGLIST
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
new file mode 100644
index 0000000000..ff12472063
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
@@ -0,0 +1,2072 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_hybrid_u8qa_dot_4x16 (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+    size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint8_t> output_arg,
+    const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+    struct KernelArgs {
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const uint8_t *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    if (qp->c_offset > qp->minval) {
+        flags |= 0x20;
+    }
+    __asm__ __volatile__(
+
+ "1:" // Row loop + "cmp %x[M], #0x4\n" + "bge 94f\n" + "cmp %x[M], #0x2\n" + "bgt 63f\n" + "beq 32f\n" + "movi v11.4s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "movi v12.4s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x10, %x[col_bias]\n" + "movi v13.4s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "movi v14.4s, #0x0\n" + "movi v15.16b, #0x1\n" + "tbz %x[flags], #2, 2f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "add x9, x9, x19\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x9, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "4:" // Height 1: setup done + "mov x28, #0x0\n" + "5:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 6f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "cbnz x28, 7f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "b 7f\n" + "6:" // Height 1: setup direct input + "mov x26, %x[input_ptr]\n" + "7:" // Height 1: input setup done + "cmp x27, #0x10\n" + "blt 12f\n" + "cmp x27, #0x20\n" + "blt 10f\n" + "8:" // Height 1: Multiply loop: Main loop head + "ldr q0, [x26, #0x0]\n" + "ldr q4, [x11, #0x0]\n" + ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x10]\n" + "ldr q6, [x11, #0x20]\n" + ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x30]\n" + ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" + "ldr q8, [x11, #0x40]\n" + "ldr q9, [x11, #0x50]\n" + ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" + "ldr q10, [x11, #0x60]\n" + "ldr q4, [x11, #0x70]\n" + ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" + "ldr q5, [x11, #0x80]\n" + "ldr q6, [x11, #0x90]\n" + ".inst 0x6fa0e152 // udot
v18.4s, v10.16b, v0.4b[1]\n" + "ldr q7, [x11, #0xa0]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + "ldr q8, [x11, #0xb0]\n" + "ldr q9, [x11, #0xc0]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + "ldr q10, [x11, #0xd0]\n" + "ldr q4, [x11, #0xe0]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + "ldr q5, [x11, #0xf0]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + "add x11, x11, #0x100\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + "tbnz %x[flags], #31, 9f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + "9:" // Height 1: Multiply loop: unique 1: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 8b\n" + "10:" // Height 1: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q0, [x26, #0x0]\n" + "ldr q6, [x11, #0x0]\n" + ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x10]\n" + "ldr q8, [x11, #0x20]\n" + ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" + "ldr q9, [x11, #0x30]\n" + ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" + "ldr q10, [x11, #0x40]\n" + "ldr q4, [x11, #0x50]\n" + ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x60]\n" + "ldr q6, [x11, #0x70]\n" + ".inst 0x6fa0e150 // udot v16.4s, v10.16b, v0.4b[1]\n" + ".inst 0x6fa0e091 // udot v17.4s, v4.16b, v0.4b[1]\n" + "ldr q7, [x11, #0x80]\n" + "ldr q8, [x11, #0x90]\n" + ".inst 0x6fa0e0b2 // udot v18.4s, v5.16b, v0.4b[1]\n" + "ldr q9, [x11, #0xa0]\n" + ".inst 0x6fa0e0d3 // udot v19.4s, v6.16b, v0.4b[1]\n" + "ldr q10, [x11, #0xb0]\n" + "ldr q4, [x11, #0xc0]\n" + ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f80e911 // udot v17.4s, v8.16b, v0.4b[2]\n" + "ldr q5, [x11, #0xd0]\n" + "ldr q6, [x11, #0xe0]\n" + ".inst 0x6f80e932 // udot v18.4s, v9.16b, v0.4b[2]\n" + "ldr q7, [x11, #0xf0]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f80e953 // udot v19.4s, v10.16b, v0.4b[2]\n" + ".inst 0x6fa0e890 // udot v16.4s, v4.16b, v0.4b[3]\n" + "add x11, x11, #0x100\n" + ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n" + "tbnz %x[flags], #31, 11f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + "11:" // Height 1: Multiply loop: unique 2: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "12:" // Height 1: Multiply loop: Main loop skip + "cbz x27, 19f\n" + "cmp x27, #0x4\n" + "blt 15f\n" + "13:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x26], #0x4\n" + "tbnz %x[flags], #31, 14f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + "14:" // Height 1: Multiply loop: unique 3: skip row sum + "ldr q8, [x11, #0x0]\n" + ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q9, [x11, #0x10]\n" + "ldr q10, [x11, #0x20]\n" + ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "ldr q4, [x11, #0x30]\n" + ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + "sub x27, x27, #0x4\n" + "add x11, x11, #0x40\n" + ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n" + "cmp x27, #0x4\n" + "bge 13b\n" + "cbz x27, 19f\n" + "15:" // Height 1: Multiply loop: Skip odd blocks + "tbz x27, #1, 16f\n" + "ldr h0, [x26], #0x2\n" + "tbz x27, #0, 17f\n" + 
"ld1 { v0.b }[2], [x26]\n" + "b 17f\n" + "16:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x26, #0x0]\n" + "17:" // Height 1: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 18f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + "18:" // Height 1: Multiply loop: unique 4: skip row sum + "ldr q5, [x11, #0x0]\n" + ".inst 0x6f80e0b0 // udot v16.4s, v5.16b, v0.4b[0]\n" + "ldr q6, [x11, #0x10]\n" + "ldr q7, [x11, #0x20]\n" + ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n" + "ldr q8, [x11, #0x30]\n" + ".inst 0x6f80e0f2 // udot v18.4s, v7.16b, v0.4b[0]\n" + "add x11, x11, #0x40\n" + ".inst 0x6f80e113 // udot v19.4s, v8.16b, v0.4b[0]\n" + "19:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x19\n" + "bne 5b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "tbnz %x[flags], #31, 20f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x19, %x[qp], %[b_offset]\n" + "addp v11.4s, v11.4s, v11.4s\n" + "ld1r { v1.4s }, [x19]\n" + "neg v1.4s, v1.4s\n" + "mul v11.4s, v11.4s, v1.4s\n" + "20:" // Height 1: skip row sum fixup + "add v16.4s, v16.4s, v11.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v17.4s, v17.4s, v11.4s\n" + "ldr q0, [x10, #0x0]\n" + "add v18.4s, v18.4s, v11.4s\n" + "ldr q1, [x10, #0x10]\n" + "add v19.4s, v19.4s, v11.4s\n" + "ldr q2, [x10, #0x20]\n" + "ldr q3, [x10, #0x30]\n" + "add v16.4s, v16.4s, v0.4s\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "ld1r { v0.4s }, [x20]\n" + "add v17.4s, v17.4s, v1.4s\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "add v18.4s, v18.4s, v2.4s\n" + "ld1r { v4.4s }, [x19]\n" + "add x10, x10, #0x40\n" + "add v19.4s, v19.4s, v3.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "tbz %x[flags], #5, 21f\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v17.16b, v0.16b\n" + "and v6.16b, v18.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v19.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "21:" // Height 1: no shift correction + "srshl v16.4s, v16.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x12, #0x10\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "smax v19.4s, v19.4s, v5.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "bge 30f\n" + "tbz x12, #3, 25f\n" + "str d16, [x9], #0x8\n" + "tbz x12, #2, 23f\n" + "st1 { v16.s }[2], [x9], #0x4\n" + "tbz x12, #1, 22f\n" + "st1 { v16.h }[6], [x9], #0x2\n" + "tbz x12, #0, 29f\n" + "st1 { v16.b }[14], [x9]\n" + "b 29f\n" + "22:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x12, #0, 29f\n" + "st1 { v16.b }[12], [x9]\n" + "b 
29f\n" + "23:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x12, #1, 24f\n" + "st1 { v16.h }[4], [x9], #0x2\n" + "tbz x12, #0, 29f\n" + "st1 { v16.b }[10], [x9]\n" + "b 29f\n" + "24:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x12, #0, 29f\n" + "st1 { v16.b }[8], [x9]\n" + "b 29f\n" + "25:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x12, #2, 27f\n" + "str s16, [x9], #0x4\n" + "tbz x12, #1, 26f\n" + "st1 { v16.h }[2], [x9], #0x2\n" + "tbz x12, #0, 29f\n" + "st1 { v16.b }[6], [x9]\n" + "b 29f\n" + "26:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x12, #0, 29f\n" + "st1 { v16.b }[4], [x9]\n" + "b 29f\n" + "27:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x12, #1, 28f\n" + "str h16, [x9], #0x2\n" + "tbz x12, #0, 29f\n" + "st1 { v16.b }[2], [x9]\n" + "b 29f\n" + "28:" // Height 1: Partial direct writeback: partial_1_0 + "str b16, [x9, #0x0]\n" + "29:" // Height 1: Partial direct writeback: Done + "b 31f\n" + "30:" // Height 1: Full writeback + "str q16, [x9, #0x0]\n" + "add x9, x9, #0x10\n" + "31:" // Height 1: Writeback done + "subs x12, x12, #0x10\n" + "bgt 3b\n" + "b 126f\n" + "32:" // Height 2 + "movi v11.4s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov x10, %x[col_bias]\n" + "movi v12.4s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "movi v13.4s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "movi v14.4s, #0x0\n" + "movi v15.16b, #0x1\n" + "tbz %x[flags], #2, 33f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "ldr x25, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "add x25, x25, x19\n" + "b 34f\n" + "33:" // Height 2: setup direct output + "mov x9, %x[output_ptr]\n" + "add x25, x9, x19\n" + "34:" // Height 2: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "35:" // Height 2: setup done + "mov x28, #0x0\n" + "36:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 37f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x28, 38f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "b 38f\n" + "37:" // Height 2: setup direct input + "mov x26, %x[input_ptr]\n" + "add x24, x26, x19\n" + "38:" // Height 2: input setup done + "cmp x27, #0x10\n" + "blt 43f\n" + "cmp x27, #0x20\n" + "blt 41f\n" + "39:" // Height 2: Multiply loop: Main loop head + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "ldr q4, [x11, #0x0]\n" + ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x10]\n" + ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" + "ldr q6, [x11, #0x20]\n" + "ldr q7, [x11, #0x30]\n" + ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q8, [x11, #0x40]\n" + ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" + "ldr q9, [x11, #0x50]\n" + ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" + "ldr q10, [x11, #0x60]\n" + ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" + "ldr q4, [x11, #0x70]\n" + ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x80]\n" + ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" + "ldr q6, [x11, 
#0x90]\n" + ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" + "ldr q7, [x11, #0xa0]\n" + ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" + "ldr q8, [x11, #0xb0]\n" + ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" + "add x26, x26, #0x10\n" + ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" + "ldr q9, [x11, #0xc0]\n" + ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" + "add x24, x24, #0x10\n" + ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" + "ldr q10, [x11, #0xd0]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + "ldr q4, [x11, #0xe0]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" + "ldr q5, [x11, #0xf0]\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + "add x11, x11, #0x100\n" + ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + "tbnz %x[flags], #31, 40f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + "40:" // Height 2: Multiply loop: unique 5: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x27, #0x20\n" + "bge 39b\n" + "41:" // Height 2: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "ldr q6, [x11, #0x0]\n" + ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x10]\n" + ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n" + "ldr q8, [x11, #0x20]\n" + "ldr q9, [x11, #0x30]\n" + ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" + "ldr q10, [x11, #0x40]\n" + ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n" + "ldr q4, [x11, #0x50]\n" + ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x60]\n" + ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" + "ldr q6, [x11, #0x70]\n" + ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x80]\n" + ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n" + "ldr q8, [x11, #0x90]\n" + ".inst 0x6fa0e150 // udot v16.4s, v10.16b, v0.4b[1]\n" + "ldr q9, [x11, #0xa0]\n" + ".inst 0x6fa1e154 // udot v20.4s, v10.16b, v1.4b[1]\n" + "ldr q10, [x11, #0xb0]\n" + ".inst 0x6fa0e091 // udot v17.4s, v4.16b, v0.4b[1]\n" + "add x26, x26, #0x10\n" + ".inst 0x6fa1e095 // udot v21.4s, v4.16b, v1.4b[1]\n" + "ldr q4, [x11, #0xc0]\n" + ".inst 0x6fa0e0b2 // udot v18.4s, v5.16b, v0.4b[1]\n" + "add x24, x24, #0x10\n" + ".inst 0x6fa1e0b6 // udot v22.4s, v5.16b, v1.4b[1]\n" + "ldr q5, [x11, #0xd0]\n" + ".inst 0x6fa0e0d3 // udot v19.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0d7 // udot v23.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x11, #0xe0]\n" + ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f4 // udot v20.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x11, 
#0xf0]\n" + ".inst 0x6f80e911 // udot v17.4s, v8.16b, v0.4b[2]\n" + "add x11, x11, #0x100\n" + ".inst 0x6f81e915 // udot v21.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6f80e932 // udot v18.4s, v9.16b, v0.4b[2]\n" + ".inst 0x6f81e936 // udot v22.4s, v9.16b, v1.4b[2]\n" + ".inst 0x6f80e953 // udot v19.4s, v10.16b, v0.4b[2]\n" + ".inst 0x6f81e957 // udot v23.4s, v10.16b, v1.4b[2]\n" + ".inst 0x6fa0e890 // udot v16.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e894 // udot v20.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b5 // udot v21.4s, v5.16b, v1.4b[3]\n" + ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8d6 // udot v22.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8f7 // udot v23.4s, v7.16b, v1.4b[3]\n" + "tbnz %x[flags], #31, 42f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + "42:" // Height 2: Multiply loop: unique 6: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "43:" // Height 2: Multiply loop: Main loop skip + "cbz x27, 50f\n" + "cmp x27, #0x4\n" + "blt 46f\n" + "44:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x26], #0x4\n" + "ldr s1, [x24], #0x4\n" + "tbnz %x[flags], #31, 45f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + "45:" // Height 2: Multiply loop: unique 7: skip row sum + "ldr q8, [x11, #0x0]\n" + ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q9, [x11, #0x10]\n" + ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q10, [x11, #0x20]\n" + "ldr q4, [x11, #0x30]\n" + ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "sub x27, x27, #0x4\n" + ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" + "cmp x27, #0x4\n" + ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + "add x11, x11, #0x40\n" + ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" + ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n" + "bge 44b\n" + "cbz x27, 50f\n" + "46:" // Height 2: Multiply loop: Skip odd blocks + "tbz x27, #1, 47f\n" + "ldr h0, [x26], #0x2\n" + "ldr h1, [x24], #0x2\n" + "tbz x27, #0, 48f\n" + "ld1 { v0.b }[2], [x26]\n" + "ld1 { v1.b }[2], [x24]\n" + "b 48f\n" + "47:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x26, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "48:" // Height 2: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 49f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + "49:" // Height 2: Multiply loop: unique 8: skip row sum + "ldr q5, [x11, #0x0]\n" + ".inst 0x6f80e0b0 // udot v16.4s, v5.16b, v0.4b[0]\n" + "ldr q6, [x11, #0x10]\n" + ".inst 0x6f81e0b4 // udot v20.4s, v5.16b, v1.4b[0]\n" + "ldr q7, [x11, #0x20]\n" + "ldr q8, [x11, #0x30]\n" + ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n" + "add x11, x11, #0x40\n" + ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f80e0f2 // udot v18.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f6 // udot v22.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f80e113 // udot v19.4s, v8.16b, v0.4b[0]\n" + ".inst 0x6f81e117 // udot v23.4s, v8.16b, v1.4b[0]\n" + "50:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x19\n" + "bne 36b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm 
pstl1keep, [x25, #0x0]\n" + "tbnz %x[flags], #31, 51f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1r { v2.4s }, [x19]\n" + "addp v12.4s, v12.4s, v12.4s\n" + "addp v11.4s, v11.4s, v11.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" + "neg v2.4s, v2.4s\n" + "mul v11.4s, v11.4s, v2.4s\n" + "mul v12.4s, v12.4s, v2.4s\n" + "51:" // Height 2: skip row sum fixup + "add v16.4s, v16.4s, v11.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v17.4s, v17.4s, v11.4s\n" + "ldr q0, [x10, #0x0]\n" + "add v18.4s, v18.4s, v11.4s\n" + "ldr q1, [x10, #0x10]\n" + "add v19.4s, v19.4s, v11.4s\n" + "ldr q2, [x10, #0x20]\n" + "add v20.4s, v20.4s, v12.4s\n" + "ldr q3, [x10, #0x30]\n" + "add v21.4s, v21.4s, v12.4s\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add v22.4s, v22.4s, v12.4s\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x19]\n" + "add v23.4s, v23.4s, v12.4s\n" + "add x10, x10, #0x40\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "ld1r { v0.4s }, [x20]\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "sqrdmulh v22.4s, v22.4s, v4.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "tbz %x[flags], #5, 52f\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v17.16b, v0.16b\n" + "and v6.16b, v18.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v19.16b, v0.16b\n" + "and v8.16b, v20.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "and v9.16b, v21.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v10.16b, v22.16b, v0.16b\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "and v4.16b, v23.16b, v0.16b\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "sqadd v20.4s, v20.4s, v8.4s\n" + "sqadd v21.4s, v21.4s, v9.4s\n" + "sqadd v22.4s, v22.4s, v10.4s\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "52:" // Height 2: no shift correction + "srshl v16.4s, v16.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x12, #0x10\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "srshl v21.4s, v21.4s, v0.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + 
"smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "bge 61f\n" + "tbz x12, #3, 56f\n" + "str d16, [x9], #0x8\n" + "str d20, [x25], #0x8\n" + "tbz x12, #2, 54f\n" + "st1 { v16.s }[2], [x9], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "tbz x12, #1, 53f\n" + "st1 { v16.h }[6], [x9], #0x2\n" + "st1 { v20.h }[6], [x25], #0x2\n" + "tbz x12, #0, 60f\n" + "st1 { v16.b }[14], [x9]\n" + "st1 { v20.b }[14], [x25]\n" + "b 60f\n" + "53:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x12, #0, 60f\n" + "st1 { v16.b }[12], [x9]\n" + "st1 { v20.b }[12], [x25]\n" + "b 60f\n" + "54:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x12, #1, 55f\n" + "st1 { v16.h }[4], [x9], #0x2\n" + "st1 { v20.h }[4], [x25], #0x2\n" + "tbz x12, #0, 60f\n" + "st1 { v16.b }[10], [x9]\n" + "st1 { v20.b }[10], [x25]\n" + "b 60f\n" + "55:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x12, #0, 60f\n" + "st1 { v16.b }[8], [x9]\n" + "st1 { v20.b }[8], [x25]\n" + "b 60f\n" + "56:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x12, #2, 58f\n" + "str s16, [x9], #0x4\n" + "str s20, [x25], #0x4\n" + "tbz x12, #1, 57f\n" + "st1 { v16.h }[2], [x9], #0x2\n" + "st1 { v20.h }[2], [x25], #0x2\n" + "tbz x12, #0, 60f\n" + "st1 { v16.b }[6], [x9]\n" + "st1 { v20.b }[6], [x25]\n" + "b 60f\n" + "57:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x12, #0, 60f\n" + "st1 { v16.b }[4], [x9]\n" + "st1 { v20.b }[4], [x25]\n" + "b 60f\n" + "58:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x12, #1, 59f\n" + "str h16, [x9], #0x2\n" + "str h20, [x25], #0x2\n" + "tbz x12, #0, 60f\n" + "st1 { v16.b }[2], [x9]\n" + "st1 { v20.b }[2], [x25]\n" + "b 60f\n" + "59:" // Height 2: Partial direct writeback: partial_1_0 + "str b16, [x9, #0x0]\n" + "str b20, [x25, #0x0]\n" + "60:" // Height 2: Partial direct writeback: Done + "b 62f\n" + "61:" // Height 2: Full writeback + "str q16, [x9, #0x0]\n" + "str q20, [x25, #0x0]\n" + "add x9, x9, #0x10\n" + "add x25, x25, #0x10\n" + "62:" // Height 2: Writeback done + "subs x12, x12, #0x10\n" + "bgt 34b\n" + "b 126f\n" + "63:" // Height 3 + "movi v11.4s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov x10, %x[col_bias]\n" + "movi v12.4s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "movi v13.4s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "movi v14.4s, #0x0\n" + "movi v15.16b, #0x1\n" + "tbz %x[flags], #2, 64f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "ldr x25, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "ldr x23, [%x[output_ptr], #0x10]\n" + "add x25, x25, x19\n" + "add x23, x23, x19\n" + "b 65f\n" + "64:" // Height 3: setup direct output + "mov x9, %x[output_ptr]\n" + "add x25, x9, x19\n" + "add x23, x25, x19\n" + "65:" // Height 3: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "66:" // Height 3: setup done + "mov x28, #0x0\n" + "67:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], 
%[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 68f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "cbnz x28, 69f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "b 69f\n" + "68:" // Height 3: setup direct input + "mov x26, %x[input_ptr]\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "69:" // Height 3: input setup done + "cmp x27, #0x10\n" + "blt 74f\n" + "cmp x27, #0x20\n" + "blt 72f\n" + "70:" // Height 3: Multiply loop: Main loop head + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "ldr q4, [x11, #0x0]\n" + ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x10]\n" + ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" + "ldr q6, [x11, #0x20]\n" + ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" + "ldr q7, [x11, #0x30]\n" + "ldr q8, [x11, #0x40]\n" + ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q9, [x11, #0x50]\n" + ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" + "ldr q10, [x11, #0x60]\n" + ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" + "ldr q4, [x11, #0x70]\n" + ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x80]\n" + ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x11, #0x90]\n" + ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x11, #0xa0]\n" + ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" + ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" + "ldr q8, [x11, #0xb0]\n" + ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" + ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" + ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" + "ldr q9, [x11, #0xc0]\n" + ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" + ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" + "ldr q10, [x11, #0xd0]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" + "ldr q4, [x11, #0xe0]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" + "ldr q5, [x11, #0xf0]\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + "add x11, x11, #0x100\n" + ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" 
+ ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n" + "tbnz %x[flags], #31, 71f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + "71:" // Height 3: Multiply loop: unique 9: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x27, #0x20\n" + "prfm pldl1keep, [x22, #0x80]\n" + "bge 70b\n" + "72:" // Height 3: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "ldr q6, [x11, #0x0]\n" + ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x10]\n" + ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n" + "ldr q8, [x11, #0x20]\n" + ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n" + "ldr q9, [x11, #0x30]\n" + "ldr q10, [x11, #0x40]\n" + ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" + "ldr q4, [x11, #0x50]\n" + ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n" + "ldr q5, [x11, #0x60]\n" + ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n" + "ldr q6, [x11, #0x70]\n" + ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x80]\n" + ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n" + "ldr q8, [x11, #0x90]\n" + ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n" + "ldr q9, [x11, #0xa0]\n" + ".inst 0x6fa0e150 // udot v16.4s, v10.16b, v0.4b[1]\n" + ".inst 0x6fa1e154 // udot v20.4s, v10.16b, v1.4b[1]\n" + ".inst 0x6fa2e158 // udot v24.4s, v10.16b, v2.4b[1]\n" + "ldr q10, [x11, #0xb0]\n" + ".inst 0x6fa0e091 // udot v17.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6fa1e095 // udot v21.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6fa2e099 // udot v25.4s, v4.16b, v2.4b[1]\n" + "ldr q4, [x11, #0xc0]\n" + ".inst 0x6fa0e0b2 // udot v18.4s, v5.16b, v0.4b[1]\n" + ".inst 0x6fa1e0b6 // udot v22.4s, v5.16b, v1.4b[1]\n" + ".inst 0x6fa2e0ba // udot v26.4s, v5.16b, v2.4b[1]\n" + "ldr q5, [x11, #0xd0]\n" + ".inst 0x6fa0e0d3 // udot v19.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0d7 // udot v23.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0db // udot v27.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x11, #0xe0]\n" + ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f4 // udot v20.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f8 // udot v24.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x11, #0xf0]\n" + ".inst 0x6f80e911 // udot v17.4s, v8.16b, v0.4b[2]\n" + "add x11, x11, #0x100\n" + ".inst 0x6f81e915 // udot v21.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6f82e919 // udot v25.4s, v8.16b, v2.4b[2]\n" + ".inst 0x6f80e932 // udot v18.4s, v9.16b, v0.4b[2]\n" + ".inst 0x6f81e936 // udot v22.4s, v9.16b, v1.4b[2]\n" + ".inst 0x6f82e93a // udot v26.4s, v9.16b, v2.4b[2]\n" + ".inst 0x6f80e953 // udot v19.4s, v10.16b, v0.4b[2]\n" + ".inst 0x6f81e957 // udot v23.4s, v10.16b, v1.4b[2]\n" + ".inst 
0x6f82e95b // udot v27.4s, v10.16b, v2.4b[2]\n" + ".inst 0x6fa0e890 // udot v16.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e894 // udot v20.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa2e898 // udot v24.4s, v4.16b, v2.4b[3]\n" + ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b5 // udot v21.4s, v5.16b, v1.4b[3]\n" + ".inst 0x6fa2e8b9 // udot v25.4s, v5.16b, v2.4b[3]\n" + ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8d6 // udot v22.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8da // udot v26.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8f7 // udot v23.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n" + "tbnz %x[flags], #31, 73f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + "73:" // Height 3: Multiply loop: unique 10: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "74:" // Height 3: Multiply loop: Main loop skip + "cbz x27, 81f\n" + "cmp x27, #0x4\n" + "blt 77f\n" + "75:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x26], #0x4\n" + "ldr s1, [x24], #0x4\n" + "ldr s2, [x22], #0x4\n" + "tbnz %x[flags], #31, 76f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + "76:" // Height 3: Multiply loop: unique 11: skip row sum + "ldr q8, [x11, #0x0]\n" + ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q9, [x11, #0x10]\n" + ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q10, [x11, #0x20]\n" + ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" + "ldr q4, [x11, #0x30]\n" + "sub x27, x27, #0x4\n" + ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "cmp x27, #0x4\n" + ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" + "add x11, x11, #0x40\n" + ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" + ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" + ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" + ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n" + "bge 75b\n" + "cbz x27, 81f\n" + "77:" // Height 3: Multiply loop: Skip odd blocks + "tbz x27, #1, 78f\n" + "ldr h0, [x26], #0x2\n" + "ldr h1, [x24], #0x2\n" + "ldr h2, [x22], #0x2\n" + "tbz x27, #0, 79f\n" + "ld1 { v0.b }[2], [x26]\n" + "ld1 { v1.b }[2], [x24]\n" + "ld1 { v2.b }[2], [x22]\n" + "b 79f\n" + "78:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x26, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "ldr b2, [x22, #0x0]\n" + "79:" // Height 3: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 80f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + "80:" // Height 3: Multiply loop: unique 12: skip row sum + "ldr q5, [x11, #0x0]\n" + ".inst 0x6f80e0b0 // udot v16.4s, v5.16b, v0.4b[0]\n" + "ldr q6, [x11, #0x10]\n" + ".inst 0x6f81e0b4 // udot v20.4s, v5.16b, v1.4b[0]\n" + "ldr q7, [x11, #0x20]\n" + ".inst 0x6f82e0b8 // udot v24.4s, v5.16b, v2.4b[0]\n" + "ldr q8, [x11, #0x30]\n" + "add x11, x11, #0x40\n" + ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n" 
+ ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f80e0f2 // udot v18.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f6 // udot v22.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0fa // udot v26.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f80e113 // udot v19.4s, v8.16b, v0.4b[0]\n" + ".inst 0x6f81e117 // udot v23.4s, v8.16b, v1.4b[0]\n" + ".inst 0x6f82e11b // udot v27.4s, v8.16b, v2.4b[0]\n" + "81:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x19\n" + "bne 67b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbnz %x[flags], #31, 82f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1r { v3.4s }, [x19]\n" + "addp v12.4s, v12.4s, v12.4s\n" + "addp v13.4s, v13.4s, v13.4s\n" + "addp v11.4s, v11.4s, v11.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" + "addp v13.4s, v13.4s, v13.4s\n" + "neg v3.4s, v3.4s\n" + "mul v11.4s, v11.4s, v3.4s\n" + "mul v12.4s, v12.4s, v3.4s\n" + "mul v13.4s, v13.4s, v3.4s\n" + "82:" // Height 3: skip row sum fixup + "add v16.4s, v16.4s, v11.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v17.4s, v17.4s, v11.4s\n" + "ldr q0, [x10, #0x0]\n" + "add v18.4s, v18.4s, v11.4s\n" + "ldr q1, [x10, #0x10]\n" + "add v19.4s, v19.4s, v11.4s\n" + "ldr q2, [x10, #0x20]\n" + "add v20.4s, v20.4s, v12.4s\n" + "ldr q3, [x10, #0x30]\n" + "add v21.4s, v21.4s, v12.4s\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add v22.4s, v22.4s, v12.4s\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x19]\n" + "add v23.4s, v23.4s, v12.4s\n" + "add x10, x10, #0x40\n" + "add v24.4s, v24.4s, v13.4s\n" + "add v25.4s, v25.4s, v13.4s\n" + "add v26.4s, v26.4s, v13.4s\n" + "add v27.4s, v27.4s, v13.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "ld1r { v0.4s }, [x20]\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "sqrdmulh v22.4s, v22.4s, v4.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "sqrdmulh v24.4s, v24.4s, v4.4s\n" + "sqrdmulh v25.4s, v25.4s, v4.4s\n" + "sqrdmulh v26.4s, v26.4s, v4.4s\n" + "sqrdmulh v27.4s, v27.4s, v4.4s\n" + "tbz %x[flags], #5, 83f\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v17.16b, v0.16b\n" + "and v6.16b, v18.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v19.16b, v0.16b\n" + "and v8.16b, v20.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "and v9.16b, v21.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v10.16b, v22.16b, v0.16b\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "and v4.16b, v23.16b, v0.16b\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v24.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "sqadd v20.4s, v20.4s, v8.4s\n" + "sqadd v21.4s, v21.4s, v9.4s\n" + "sqadd 
v22.4s, v22.4s, v10.4s\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "and v6.16b, v25.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v24.4s, v24.4s, v5.4s\n" + "and v7.16b, v26.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v8.16b, v27.16b, v0.16b\n" + "sqadd v25.4s, v25.4s, v6.4s\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sqadd v26.4s, v26.4s, v7.4s\n" + "sqadd v27.4s, v27.4s, v8.4s\n" + "83:" // Height 3: no shift correction + "srshl v16.4s, v16.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x12, #0x10\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "srshl v21.4s, v21.4s, v0.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "srshl v25.4s, v25.4s, v0.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "srshl v26.4s, v26.4s, v0.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "srshl v27.4s, v27.4s, v0.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v24.8h, v24.8h, v25.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "bge 92f\n" + "tbz x12, #3, 87f\n" + "str d16, [x9], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "tbz x12, #2, 85f\n" + "st1 { v16.s }[2], [x9], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "st1 { v24.s }[2], [x23], #0x4\n" + "tbz x12, #1, 84f\n" + "st1 { v16.h }[6], [x9], #0x2\n" + "st1 { v20.h }[6], [x25], #0x2\n" + "st1 { v24.h }[6], [x23], #0x2\n" + "tbz x12, #0, 91f\n" + "st1 { v16.b }[14], [x9]\n" + "st1 { v20.b }[14], [x25]\n" + "st1 { v24.b }[14], [x23]\n" + "b 91f\n" + "84:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x12, #0, 91f\n" + "st1 { v16.b }[12], [x9]\n" + "st1 { v20.b }[12], [x25]\n" + "st1 { v24.b }[12], [x23]\n" + "b 91f\n" + "85:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x12, #1, 86f\n" + "st1 { v16.h }[4], [x9], #0x2\n" + "st1 { v20.h }[4], [x25], #0x2\n" + "st1 { v24.h }[4], [x23], #0x2\n" + "tbz x12, #0, 91f\n" + "st1 { v16.b }[10], [x9]\n" + "st1 { v20.b }[10], [x25]\n" + "st1 { v24.b }[10], [x23]\n" + "b 91f\n" + "86:" // 
Height 3: Partial direct writeback: partial_1_8 + "tbz x12, #0, 91f\n" + "st1 { v16.b }[8], [x9]\n" + "st1 { v20.b }[8], [x25]\n" + "st1 { v24.b }[8], [x23]\n" + "b 91f\n" + "87:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x12, #2, 89f\n" + "str s16, [x9], #0x4\n" + "str s20, [x25], #0x4\n" + "str s24, [x23], #0x4\n" + "tbz x12, #1, 88f\n" + "st1 { v16.h }[2], [x9], #0x2\n" + "st1 { v20.h }[2], [x25], #0x2\n" + "st1 { v24.h }[2], [x23], #0x2\n" + "tbz x12, #0, 91f\n" + "st1 { v16.b }[6], [x9]\n" + "st1 { v20.b }[6], [x25]\n" + "st1 { v24.b }[6], [x23]\n" + "b 91f\n" + "88:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x12, #0, 91f\n" + "st1 { v16.b }[4], [x9]\n" + "st1 { v20.b }[4], [x25]\n" + "st1 { v24.b }[4], [x23]\n" + "b 91f\n" + "89:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x12, #1, 90f\n" + "str h16, [x9], #0x2\n" + "str h20, [x25], #0x2\n" + "str h24, [x23], #0x2\n" + "tbz x12, #0, 91f\n" + "st1 { v16.b }[2], [x9]\n" + "st1 { v20.b }[2], [x25]\n" + "st1 { v24.b }[2], [x23]\n" + "b 91f\n" + "90:" // Height 3: Partial direct writeback: partial_1_0 + "str b16, [x9, #0x0]\n" + "str b20, [x25, #0x0]\n" + "str b24, [x23, #0x0]\n" + "91:" // Height 3: Partial direct writeback: Done + "b 93f\n" + "92:" // Height 3: Full writeback + "str q16, [x9, #0x0]\n" + "str q20, [x25, #0x0]\n" + "str q24, [x23, #0x0]\n" + "add x9, x9, #0x10\n" + "add x25, x25, #0x10\n" + "add x23, x23, #0x10\n" + "93:" // Height 3: Writeback done + "subs x12, x12, #0x10\n" + "bgt 65b\n" + "b 126f\n" + "94:" // Height 4 + "movi v11.4s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov x10, %x[col_bias]\n" + "movi v12.4s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "movi v13.4s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "movi v14.4s, #0x0\n" + "movi v15.16b, #0x1\n" + "tbz %x[flags], #2, 95f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "ldr x25, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "ldr x23, [%x[output_ptr], #0x10]\n" + "ldr x21, [%x[output_ptr], #0x18]\n" + "add x25, x25, x19\n" + "add %x[output_ptr], %x[output_ptr], #0x20\n" + "add x23, x23, x19\n" + "add x21, x21, x19\n" + "b 96f\n" + "95:" // Height 4: setup direct output + "mov x9, %x[output_ptr]\n" + "add x25, x9, x19\n" + "add x23, x25, x19\n" + "add x21, x23, x19\n" + "add %x[output_ptr], x21, x19\n" + "96:" // Height 4: Column loop + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "97:" // Height 4: setup done + "mov x28, #0x0\n" + "98:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 99f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "ldr x20, [x20, #0x18]\n" + "cbnz x28, 100f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 100f\n" + "99:" // Height 4: setup direct input + "mov x26, %x[input_ptr]\n" + "add 
x24, x26, x19\n" + "add x22, x24, x19\n" + "add x20, x22, x19\n" + "100:" // Height 4: input setup done + "cmp x27, #0x10\n" + "blt 105f\n" + "cmp x27, #0x20\n" + "blt 103f\n" + "101:" // Height 4: Multiply loop: Main loop head + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "ldr q3, [x20, #0x0]\n" + "ldr q4, [x11, #0x0]\n" + ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x10]\n" + ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n" + "ldr q6, [x11, #0x20]\n" + ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n" + "ldr q7, [x11, #0x30]\n" + ".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n" + "ldr q8, [x11, #0x40]\n" + "ldr q9, [x11, #0x50]\n" + ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n" + "ldr q10, [x11, #0x60]\n" + ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n" + "ldr q4, [x11, #0x70]\n" + ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n" + "ldr q5, [x11, #0x80]\n" + ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x6f83e0de // udot v30.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x11, #0x90]\n" + ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0ff // udot v31.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x11, #0xa0]\n" + ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n" + ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n" + ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n" + ".inst 0x6fa3e11c // udot v28.4s, v8.16b, v3.4b[1]\n" + "ldr q8, [x11, #0xb0]\n" + ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n" + ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n" + ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n" + ".inst 0x6fa3e13d // udot v29.4s, v9.16b, v3.4b[1]\n" + "ldr q9, [x11, #0xc0]\n" + ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n" + ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n" + ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n" + ".inst 0x6fa3e15e // udot v30.4s, v10.16b, v3.4b[1]\n" + "ldr q10, [x11, #0xd0]\n" + ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n" + ".inst 0x6fa3e09f // udot v31.4s, v4.16b, v3.4b[1]\n" + "ldr q4, [x11, #0xe0]\n" + ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n" + ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n" + ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n" + ".inst 0x6f83e8bc // udot v28.4s, v5.16b, v3.4b[2]\n" + "ldr q5, [x11, #0xf0]\n" + ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n" + "add x11, x11, #0x100\n" + ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8dd // udot v29.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8fe // udot v30.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n" + ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n" + ".inst 0x6f83e91f // 
udot v31.4s, v8.16b, v3.4b[2]\n" + ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n" + ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n" + ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n" + ".inst 0x6fa3e93c // udot v28.4s, v9.16b, v3.4b[3]\n" + ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n" + ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n" + ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n" + ".inst 0x6fa3e95d // udot v29.4s, v10.16b, v3.4b[3]\n" + ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n" + ".inst 0x6fa3e89e // udot v30.4s, v4.16b, v3.4b[3]\n" + ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n" + ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n" + ".inst 0x6fa3e8bf // udot v31.4s, v5.16b, v3.4b[3]\n" + "tbnz %x[flags], #31, 102f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" + "102:" // Height 4: Multiply loop: unique 13: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x27, #0x20\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "bge 101b\n" + "103:" // Height 4: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x24, #0x0]\n" + "ldr q2, [x22, #0x0]\n" + "ldr q3, [x20, #0x0]\n" + "ldr q6, [x11, #0x0]\n" + ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x11, #0x10]\n" + ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n" + "ldr q8, [x11, #0x20]\n" + ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n" + "ldr q9, [x11, #0x30]\n" + ".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n" + "ldr q10, [x11, #0x40]\n" + "ldr q4, [x11, #0x50]\n" + ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n" + "ldr q5, [x11, #0x60]\n" + ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n" + "ldr q6, [x11, #0x70]\n" + ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f83e0fd // udot v29.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x11, #0x80]\n" + ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n" + "ldr q8, [x11, #0x90]\n" + ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n" + ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n" + ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n" + ".inst 0x6f83e13f // udot v31.4s, v9.16b, v3.4b[0]\n" + "ldr q9, [x11, #0xa0]\n" + ".inst 0x6fa0e150 // udot v16.4s, v10.16b, v0.4b[1]\n" + ".inst 0x6fa1e154 // udot v20.4s, v10.16b, v1.4b[1]\n" + ".inst 0x6fa2e158 // udot v24.4s, v10.16b, v2.4b[1]\n" + ".inst 0x6fa3e15c // udot v28.4s, v10.16b, v3.4b[1]\n" + "ldr q10, [x11, #0xb0]\n" + ".inst 0x6fa0e091 // udot v17.4s, v4.16b, v0.4b[1]\n" + ".inst 0x6fa1e095 // udot v21.4s, v4.16b, v1.4b[1]\n" + ".inst 0x6fa2e099 // udot v25.4s, v4.16b, v2.4b[1]\n" + ".inst 0x6fa3e09d // udot v29.4s, v4.16b, v3.4b[1]\n" + "ldr q4, [x11, #0xc0]\n" + ".inst 0x6fa0e0b2 // udot v18.4s, v5.16b, v0.4b[1]\n" + ".inst 0x6fa1e0b6 // udot v22.4s, v5.16b, v1.4b[1]\n" + 
".inst 0x6fa2e0ba // udot v26.4s, v5.16b, v2.4b[1]\n" + ".inst 0x6fa3e0be // udot v30.4s, v5.16b, v3.4b[1]\n" + "ldr q5, [x11, #0xd0]\n" + ".inst 0x6fa0e0d3 // udot v19.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0d7 // udot v23.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0db // udot v27.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0df // udot v31.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x11, #0xe0]\n" + ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8f4 // udot v20.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f8 // udot v24.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8fc // udot v28.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x11, #0xf0]\n" + ".inst 0x6f80e911 // udot v17.4s, v8.16b, v0.4b[2]\n" + "add x11, x11, #0x100\n" + ".inst 0x6f81e915 // udot v21.4s, v8.16b, v1.4b[2]\n" + ".inst 0x6f82e919 // udot v25.4s, v8.16b, v2.4b[2]\n" + ".inst 0x6f83e91d // udot v29.4s, v8.16b, v3.4b[2]\n" + ".inst 0x6f80e932 // udot v18.4s, v9.16b, v0.4b[2]\n" + ".inst 0x6f81e936 // udot v22.4s, v9.16b, v1.4b[2]\n" + ".inst 0x6f82e93a // udot v26.4s, v9.16b, v2.4b[2]\n" + ".inst 0x6f83e93e // udot v30.4s, v9.16b, v3.4b[2]\n" + ".inst 0x6f80e953 // udot v19.4s, v10.16b, v0.4b[2]\n" + ".inst 0x6f81e957 // udot v23.4s, v10.16b, v1.4b[2]\n" + ".inst 0x6f82e95b // udot v27.4s, v10.16b, v2.4b[2]\n" + ".inst 0x6f83e95f // udot v31.4s, v10.16b, v3.4b[2]\n" + ".inst 0x6fa0e890 // udot v16.4s, v4.16b, v0.4b[3]\n" + ".inst 0x6fa1e894 // udot v20.4s, v4.16b, v1.4b[3]\n" + ".inst 0x6fa2e898 // udot v24.4s, v4.16b, v2.4b[3]\n" + ".inst 0x6fa3e89c // udot v28.4s, v4.16b, v3.4b[3]\n" + ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n" + ".inst 0x6fa1e8b5 // udot v21.4s, v5.16b, v1.4b[3]\n" + ".inst 0x6fa2e8b9 // udot v25.4s, v5.16b, v2.4b[3]\n" + ".inst 0x6fa3e8bd // udot v29.4s, v5.16b, v3.4b[3]\n" + ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8d6 // udot v22.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8da // udot v26.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8de // udot v30.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8f7 // udot v23.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8ff // udot v31.4s, v7.16b, v3.4b[3]\n" + "tbnz %x[flags], #31, 104f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" + "104:" // Height 4: Multiply loop: unique 14: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "105:" // Height 4: Multiply loop: Main loop skip + "cbz x27, 112f\n" + "cmp x27, #0x4\n" + "blt 108f\n" + "106:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x26], #0x4\n" + "ldr s1, [x24], #0x4\n" + "ldr s2, [x22], #0x4\n" + "ldr s3, [x20], #0x4\n" + "tbnz %x[flags], #31, 107f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" + "107:" // Height 4: Multiply loop: unique 15: skip row sum + "ldr q8, [x11, #0x0]\n" + ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" + "ldr q9, [x11, #0x10]\n" + ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" + "ldr q10, [x11, #0x20]\n" + ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" + "ldr q4, [x11, #0x30]\n" + ".inst 
0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" + "sub x27, x27, #0x4\n" + "add x11, x11, #0x40\n" + ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" + "cmp x27, #0x4\n" + ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" + ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" + ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" + ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" + ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" + ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" + ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" + ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n" + ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n" + ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n" + ".inst 0x6f83e09f // udot v31.4s, v4.16b, v3.4b[0]\n" + "bge 106b\n" + "cbz x27, 112f\n" + "108:" // Height 4: Multiply loop: Skip odd blocks + "tbz x27, #1, 109f\n" + "ldr h0, [x26], #0x2\n" + "ldr h1, [x24], #0x2\n" + "ldr h2, [x22], #0x2\n" + "ldr h3, [x20], #0x2\n" + "tbz x27, #0, 110f\n" + "ld1 { v0.b }[2], [x26]\n" + "ld1 { v1.b }[2], [x24]\n" + "ld1 { v2.b }[2], [x22]\n" + "ld1 { v3.b }[2], [x20]\n" + "b 110f\n" + "109:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x26, #0x0]\n" + "ldr b1, [x24, #0x0]\n" + "ldr b2, [x22, #0x0]\n" + "ldr b3, [x20, #0x0]\n" + "110:" // Height 4: Multiply loop: Ragged operand read: Done + "tbnz %x[flags], #31, 111f\n" + ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n" + ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n" + ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n" + ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n" + "111:" // Height 4: Multiply loop: unique 16: skip row sum + "ldr q5, [x11, #0x0]\n" + ".inst 0x6f80e0b0 // udot v16.4s, v5.16b, v0.4b[0]\n" + "ldr q6, [x11, #0x10]\n" + ".inst 0x6f81e0b4 // udot v20.4s, v5.16b, v1.4b[0]\n" + "ldr q7, [x11, #0x20]\n" + ".inst 0x6f82e0b8 // udot v24.4s, v5.16b, v2.4b[0]\n" + "ldr q8, [x11, #0x30]\n" + ".inst 0x6f83e0bc // udot v28.4s, v5.16b, v3.4b[0]\n" + "add x11, x11, #0x40\n" + ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0dd // udot v29.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f80e0f2 // udot v18.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0f6 // udot v22.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0fa // udot v26.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0fe // udot v30.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f80e113 // udot v19.4s, v8.16b, v0.4b[0]\n" + ".inst 0x6f81e117 // udot v23.4s, v8.16b, v1.4b[0]\n" + ".inst 0x6f82e11b // udot v27.4s, v8.16b, v2.4b[0]\n" + ".inst 0x6f83e11f // udot v31.4s, v8.16b, v3.4b[0]\n" + "112:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x19\n" + "bne 98b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbnz %x[flags], #31, 113f\n" + "addp v11.4s, v11.4s, v11.4s\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "addp v12.4s, v12.4s, v12.4s\n" + "addp v13.4s, v13.4s, v13.4s\n" + "addp v14.4s, v14.4s, v14.4s\n" + "addp v11.4s, v11.4s, v11.4s\n" + "addp v12.4s, v12.4s, v12.4s\n" + "addp v13.4s, v13.4s, v13.4s\n" + "addp v14.4s, v14.4s, v14.4s\n" + "neg v4.4s, v4.4s\n" + "mul v11.4s, v11.4s, v4.4s\n" + "mul v12.4s, v12.4s, v4.4s\n" + "mul v13.4s, v13.4s, v4.4s\n" + "mul v14.4s, v14.4s, 
v4.4s\n" + "113:" // Height 4: skip row sum fixup + "add v16.4s, v16.4s, v11.4s\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add v17.4s, v17.4s, v11.4s\n" + "ldr q0, [x10, #0x0]\n" + "add v18.4s, v18.4s, v11.4s\n" + "ldr q1, [x10, #0x10]\n" + "add v19.4s, v19.4s, v11.4s\n" + "ldr q2, [x10, #0x20]\n" + "add v20.4s, v20.4s, v12.4s\n" + "ldr q3, [x10, #0x30]\n" + "add v21.4s, v21.4s, v12.4s\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add v22.4s, v22.4s, v12.4s\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1r { v4.4s }, [x19]\n" + "add v23.4s, v23.4s, v12.4s\n" + "add x10, x10, #0x40\n" + "add v24.4s, v24.4s, v13.4s\n" + "add v25.4s, v25.4s, v13.4s\n" + "add v26.4s, v26.4s, v13.4s\n" + "add v27.4s, v27.4s, v13.4s\n" + "add v28.4s, v28.4s, v14.4s\n" + "add v29.4s, v29.4s, v14.4s\n" + "add v30.4s, v30.4s, v14.4s\n" + "add v31.4s, v31.4s, v14.4s\n" + "add v16.4s, v16.4s, v0.4s\n" + "add v17.4s, v17.4s, v1.4s\n" + "add v18.4s, v18.4s, v2.4s\n" + "add v19.4s, v19.4s, v3.4s\n" + "add v20.4s, v20.4s, v0.4s\n" + "add v21.4s, v21.4s, v1.4s\n" + "add v22.4s, v22.4s, v2.4s\n" + "add v23.4s, v23.4s, v3.4s\n" + "add v24.4s, v24.4s, v0.4s\n" + "add v25.4s, v25.4s, v1.4s\n" + "add v26.4s, v26.4s, v2.4s\n" + "add v27.4s, v27.4s, v3.4s\n" + "add v28.4s, v28.4s, v0.4s\n" + "ld1r { v0.4s }, [x20]\n" + "add v29.4s, v29.4s, v1.4s\n" + "add v30.4s, v30.4s, v2.4s\n" + "add v31.4s, v31.4s, v3.4s\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "sqrdmulh v17.4s, v17.4s, v4.4s\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sqrdmulh v19.4s, v19.4s, v4.4s\n" + "sqrdmulh v20.4s, v20.4s, v4.4s\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "sqrdmulh v22.4s, v22.4s, v4.4s\n" + "sqrdmulh v23.4s, v23.4s, v4.4s\n" + "sqrdmulh v24.4s, v24.4s, v4.4s\n" + "sqrdmulh v25.4s, v25.4s, v4.4s\n" + "sqrdmulh v26.4s, v26.4s, v4.4s\n" + "sqrdmulh v27.4s, v27.4s, v4.4s\n" + "sqrdmulh v28.4s, v28.4s, v4.4s\n" + "sqrdmulh v29.4s, v29.4s, v4.4s\n" + "sqrdmulh v30.4s, v30.4s, v4.4s\n" + "sqrdmulh v31.4s, v31.4s, v4.4s\n" + "tbz %x[flags], #5, 114f\n" + "and v4.16b, v16.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v17.16b, v0.16b\n" + "and v6.16b, v18.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "and v7.16b, v19.16b, v0.16b\n" + "and v8.16b, v20.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "and v9.16b, v21.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "and v10.16b, v22.16b, v0.16b\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "and v4.16b, v23.16b, v0.16b\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v5.4s\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v6.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v5.16b, v24.16b, v0.16b\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v7.4s\n" + "sqadd v20.4s, v20.4s, v8.4s\n" + "sqadd v21.4s, v21.4s, v9.4s\n" + "sqadd v22.4s, v22.4s, v10.4s\n" + "sqadd v23.4s, v23.4s, v4.4s\n" + "and v6.16b, v25.16b, v0.16b\n" + "sshr v6.4s, v6.4s, #0x1f\n" + "sqadd v24.4s, v24.4s, v5.4s\n" + "and v7.16b, v26.16b, v0.16b\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v8.16b, v27.16b, v0.16b\n" + "and v9.16b, v28.16b, v0.16b\n" + "sshr v8.4s, v8.4s, #0x1f\n" + "sqadd v25.4s, v25.4s, v6.4s\n" + "and v10.16b, v29.16b, v0.16b\n" + "sshr v9.4s, v9.4s, #0x1f\n" + "and v4.16b, v30.16b, v0.16b\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v26.4s, v26.4s, v7.4s\n" + "and v5.16b, v31.16b, v0.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v27.4s, v27.4s, v8.4s\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v28.4s, v28.4s, v9.4s\n" + "sqadd v29.4s, v29.4s, 
v10.4s\n" + "sqadd v30.4s, v30.4s, v4.4s\n" + "sqadd v31.4s, v31.4s, v5.4s\n" + "114:" // Height 4: no shift correction + "srshl v16.4s, v16.4s, v0.4s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1r { v4.4s }, [x19]\n" + "srshl v17.4s, v17.4s, v0.4s\n" + "add x19, %x[qp], %[minval]\n" + "srshl v18.4s, v18.4s, v0.4s\n" + "ld1r { v5.4s }, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + "srshl v19.4s, v19.4s, v0.4s\n" + "ld1r { v6.4s }, [x19]\n" + "cmp x12, #0x10\n" + "srshl v20.4s, v20.4s, v0.4s\n" + "srshl v21.4s, v21.4s, v0.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "add v16.4s, v16.4s, v4.4s\n" + "add v17.4s, v17.4s, v4.4s\n" + "add v18.4s, v18.4s, v4.4s\n" + "smin v16.4s, v16.4s, v6.4s\n" + "smin v17.4s, v17.4s, v6.4s\n" + "smin v18.4s, v18.4s, v6.4s\n" + "smax v16.4s, v16.4s, v5.4s\n" + "smax v17.4s, v17.4s, v5.4s\n" + "smax v18.4s, v18.4s, v5.4s\n" + "add v19.4s, v19.4s, v4.4s\n" + "add v20.4s, v20.4s, v4.4s\n" + "add v21.4s, v21.4s, v4.4s\n" + "smin v19.4s, v19.4s, v6.4s\n" + "smin v20.4s, v20.4s, v6.4s\n" + "smin v21.4s, v21.4s, v6.4s\n" + "smax v19.4s, v19.4s, v5.4s\n" + "smax v20.4s, v20.4s, v5.4s\n" + "smax v21.4s, v21.4s, v5.4s\n" + "add v22.4s, v22.4s, v4.4s\n" + "add v23.4s, v23.4s, v4.4s\n" + "srshl v24.4s, v24.4s, v0.4s\n" + "smin v22.4s, v22.4s, v6.4s\n" + "smin v23.4s, v23.4s, v6.4s\n" + "srshl v25.4s, v25.4s, v0.4s\n" + "smax v22.4s, v22.4s, v5.4s\n" + "smax v23.4s, v23.4s, v5.4s\n" + "add v24.4s, v24.4s, v4.4s\n" + "add v25.4s, v25.4s, v4.4s\n" + "srshl v26.4s, v26.4s, v0.4s\n" + "smin v24.4s, v24.4s, v6.4s\n" + "smin v25.4s, v25.4s, v6.4s\n" + "srshl v27.4s, v27.4s, v0.4s\n" + "smax v24.4s, v24.4s, v5.4s\n" + "smax v25.4s, v25.4s, v5.4s\n" + "add v26.4s, v26.4s, v4.4s\n" + "add v27.4s, v27.4s, v4.4s\n" + "srshl v28.4s, v28.4s, v0.4s\n" + "smin v26.4s, v26.4s, v6.4s\n" + "smin v27.4s, v27.4s, v6.4s\n" + "srshl v29.4s, v29.4s, v0.4s\n" + "smax v26.4s, v26.4s, v5.4s\n" + "smax v27.4s, v27.4s, v5.4s\n" + "add v28.4s, v28.4s, v4.4s\n" + "add v29.4s, v29.4s, v4.4s\n" + "srshl v30.4s, v30.4s, v0.4s\n" + "smin v28.4s, v28.4s, v6.4s\n" + "smin v29.4s, v29.4s, v6.4s\n" + "srshl v31.4s, v31.4s, v0.4s\n" + "smax v28.4s, v28.4s, v5.4s\n" + "smax v29.4s, v29.4s, v5.4s\n" + "add v30.4s, v30.4s, v4.4s\n" + "add v31.4s, v31.4s, v4.4s\n" + "uzp1 v16.8h, v16.8h, v17.8h\n" + "smin v30.4s, v30.4s, v6.4s\n" + "smin v31.4s, v31.4s, v6.4s\n" + "uzp1 v17.8h, v18.8h, v19.8h\n" + "smax v30.4s, v30.4s, v5.4s\n" + "smax v31.4s, v31.4s, v5.4s\n" + "uzp1 v20.8h, v20.8h, v21.8h\n" + "uzp1 v21.8h, v22.8h, v23.8h\n" + "uzp1 v24.8h, v24.8h, v25.8h\n" + "uzp1 v25.8h, v26.8h, v27.8h\n" + "uzp1 v28.8h, v28.8h, v29.8h\n" + "uzp1 v29.8h, v30.8h, v31.8h\n" + "uzp1 v16.16b, v16.16b, v17.16b\n" + "uzp1 v20.16b, v20.16b, v21.16b\n" + "uzp1 v24.16b, v24.16b, v25.16b\n" + "uzp1 v28.16b, v28.16b, v29.16b\n" + "bge 123f\n" + "tbz x12, #3, 118f\n" + "str d16, [x9], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "str d28, [x21], #0x8\n" + "tbz x12, #2, 116f\n" + "st1 { v16.s }[2], [x9], #0x4\n" + "st1 { v20.s }[2], [x25], #0x4\n" + "st1 { v24.s }[2], [x23], #0x4\n" + "st1 { v28.s }[2], [x21], #0x4\n" + "tbz x12, #1, 115f\n" + "st1 { v16.h }[6], [x9], #0x2\n" + "st1 { v20.h }[6], [x25], #0x2\n" + "st1 { v24.h }[6], [x23], #0x2\n" + "st1 { v28.h }[6], [x21], #0x2\n" + "tbz x12, #0, 122f\n" + "st1 { v16.b }[14], [x9]\n" + "st1 { v20.b }[14], [x25]\n" + "st1 { v24.b }[14], [x23]\n" + "st1 { v28.b }[14], [x21]\n" + "b 122f\n" + "115:" // Height 4: Partial direct writeback: 
partial_1_12 + "tbz x12, #0, 122f\n" + "st1 { v16.b }[12], [x9]\n" + "st1 { v20.b }[12], [x25]\n" + "st1 { v24.b }[12], [x23]\n" + "st1 { v28.b }[12], [x21]\n" + "b 122f\n" + "116:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x12, #1, 117f\n" + "st1 { v16.h }[4], [x9], #0x2\n" + "st1 { v20.h }[4], [x25], #0x2\n" + "st1 { v24.h }[4], [x23], #0x2\n" + "st1 { v28.h }[4], [x21], #0x2\n" + "tbz x12, #0, 122f\n" + "st1 { v16.b }[10], [x9]\n" + "st1 { v20.b }[10], [x25]\n" + "st1 { v24.b }[10], [x23]\n" + "st1 { v28.b }[10], [x21]\n" + "b 122f\n" + "117:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x12, #0, 122f\n" + "st1 { v16.b }[8], [x9]\n" + "st1 { v20.b }[8], [x25]\n" + "st1 { v24.b }[8], [x23]\n" + "st1 { v28.b }[8], [x21]\n" + "b 122f\n" + "118:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x12, #2, 120f\n" + "str s16, [x9], #0x4\n" + "str s20, [x25], #0x4\n" + "str s24, [x23], #0x4\n" + "str s28, [x21], #0x4\n" + "tbz x12, #1, 119f\n" + "st1 { v16.h }[2], [x9], #0x2\n" + "st1 { v20.h }[2], [x25], #0x2\n" + "st1 { v24.h }[2], [x23], #0x2\n" + "st1 { v28.h }[2], [x21], #0x2\n" + "tbz x12, #0, 122f\n" + "st1 { v16.b }[6], [x9]\n" + "st1 { v20.b }[6], [x25]\n" + "st1 { v24.b }[6], [x23]\n" + "st1 { v28.b }[6], [x21]\n" + "b 122f\n" + "119:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x12, #0, 122f\n" + "st1 { v16.b }[4], [x9]\n" + "st1 { v20.b }[4], [x25]\n" + "st1 { v24.b }[4], [x23]\n" + "st1 { v28.b }[4], [x21]\n" + "b 122f\n" + "120:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x12, #1, 121f\n" + "str h16, [x9], #0x2\n" + "str h20, [x25], #0x2\n" + "str h24, [x23], #0x2\n" + "str h28, [x21], #0x2\n" + "tbz x12, #0, 122f\n" + "st1 { v16.b }[2], [x9]\n" + "st1 { v20.b }[2], [x25]\n" + "st1 { v24.b }[2], [x23]\n" + "st1 { v28.b }[2], [x21]\n" + "b 122f\n" + "121:" // Height 4: Partial direct writeback: partial_1_0 + "str b16, [x9, #0x0]\n" + "str b20, [x25, #0x0]\n" + "str b24, [x23, #0x0]\n" + "str b28, [x21, #0x0]\n" + "122:" // Height 4: Partial direct writeback: Done + "b 124f\n" + "123:" // Height 4: Full writeback + "str q16, [x9, #0x0]\n" + "str q20, [x25, #0x0]\n" + "str q24, [x23, #0x0]\n" + "str q28, [x21, #0x0]\n" + "add x9, x9, #0x10\n" + "add x25, x25, #0x10\n" + "add x23, x23, #0x10\n" + "add x21, x21, #0x10\n" + "124:" // Height 4: Writeback done + "subs x12, x12, #0x10\n" + "bgt 96b\n" + "subs %x[M], %x[M], #0x4\n" + "beq 126f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 125f\n" + "add x20, x20, #0x4\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "125:" // Update direct input + "mov x19, #0x4\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "126:" // Exit + + : [M] "+r" (M), [flags] "+r" (flags), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), 
[per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp deleted file mode 100644 index 735e5fd45a..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp +++ /dev/null @@ -1,2434 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __aarch64__ - -#include <algorithm> - -#include "arm_gemm.hpp" -#include <cstdint> -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool accumulate) { - const int K_stride = ((K + 3) / 4) * 4; - const long loops_count = ((K + 16) / 32) - 1; - K -= loops_count * 32; - const long regs_count = (K / 16) - 1; - K -= (regs_count + 1) * 16; - const long blocks_count = K / 4; - const long odds_count = K - (blocks_count * 4); - - int rows_to_compute; - - for (int y=0; y<M; y+=rows_to_compute) { - const uint8_t * const a_ptr0_base = A + (y * lda); - const unsigned long ldab = lda * sizeof(uint8_t); - - uint32_t *c_ptr0 = C + (y * ldc); - const unsigned long ldcb = ldc * sizeof(uint32_t); - - rows_to_compute = M-y; - if (rows_to_compute > 4) { - if (rows_to_compute % 4) { - rows_to_compute = 4 - 1; - } else { - rows_to_compute = 4; - } - } - - for (int x0=0; x0(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "temploadreg0 .req X2\n" - "temploadreg1 .req X3\n" - "temploadreg2 .req X4\n" - "temploadreg3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "movi v16.4s, #0\n" - "ldr q0, [%[a_ptr0]]\n" - "movi v17.4s, #0\n" - "ldr q1, [a_ptr1]\n" - "movi v18.4s, #0\n" - "ldr q8, [%[b_ptr0]]\n" - "movi v19.4s, #0\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "movi v20.4s, #0\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "movi v21.4s, #0\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "movi v22.4s, #0\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "movi v23.4s, #0\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "ldr d4, [%[a_ptr0]]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "ldr d5, [a_ptr1]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - "ins v4.d[1], temploadreg0\n" - ".inst 
0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - "ins v5.d[1], temploadreg1\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - "ins v15.d[1], temploadreg3\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - "ins v14.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" - "ldr d0, [%[a_ptr0], 
#-0x10]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" - ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" - "ldr d1, [a_ptr1, #-0x10]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" - "ldr temploadreg1, [a_ptr1, #-0x8]\n" - ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" - "ldr d8, [%[b_ptr0]]\n" - "ins v0.d[1], temploadreg0\n" - ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "ins v1.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "ins v14.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" - "ins v14.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "ins v11.d[1], temploadreg3\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "b.ne 3b\n" - "2:\n" - "ins v14.d[1], temploadreg2\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "ldr d15, 
[%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "ins v15.d[1], temploadreg3\n" - "cbz %[regs], 4f\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr d4, [%[a_ptr0]]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr d5, [a_ptr1]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "ins v4.d[1], temploadreg0\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - "ins v5.d[1], temploadreg1\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "add a_ptr1, a_ptr1, #0x10\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - "ins v14.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, 
v1.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" 
- "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr s1, [a_ptr1]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "b.ne 7b\n" - "6:\n" - "cbz %[odds], 8f\n" - "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[0], [a_ptr1], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[1], [a_ptr1], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[2], [%[a_ptr0]]\n" - "ld1 {v1.b}[2], [a_ptr1]\n" - 
"9:\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "8:\n" - "str q16, [%[c_ptr0]]\n" - "str q17, [%[c_ptr0], #0x10]\n" - "str q18, [%[c_ptr0], #0x20]\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - ".unreq temploadreg0\n" - ".unreq temploadreg1\n" - ".unreq temploadreg2\n" - ".unreq temploadreg3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "temploadreg0 .req X4\n" - "temploadreg1 .req X5\n" - "temploadreg2 .req X6\n" - "temploadreg3 .req X7\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "movi v16.4s, #0\n" - "ldr q0, [%[a_ptr0]]\n" - "movi v17.4s, #0\n" - "ldr q1, [a_ptr1]\n" - "movi v18.4s, #0\n" - "ldr q2, [a_ptr2]\n" - "movi v19.4s, #0\n" - "ldr q8, [%[b_ptr0]]\n" - "movi v20.4s, #0\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "movi v21.4s, #0\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "movi v22.4s, #0\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "movi v23.4s, #0\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "movi v24.4s, #0\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "movi v25.4s, #0\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "movi v26.4s, #0\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "movi v27.4s, #0\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ins v14.d[1], temploadreg2\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - 
"ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "ins v14.d[1], temploadreg2\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "ldr d4, [%[a_ptr0]]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "ldr d5, [a_ptr1]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr d6, [a_ptr2]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "ldr temploadreg2, [a_ptr2, #0x8]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - "ins v4.d[1], temploadreg0\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - "ins v5.d[1], temploadreg1\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - "ins v6.d[1], temploadreg2\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "ins v8.d[1], temploadreg0\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - ".inst 0x6fa0e990 // 
udot v16.4s, v12.16b, v0.4b[3]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" - "ldr d0, [%[a_ptr0], #-0x10]\n" - ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" - ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" - "ldr d1, [a_ptr1, #-0x10]\n" - ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" - "ldr temploadreg1, [a_ptr1, #-0x8]\n" - ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" - "ins v0.d[1], temploadreg0\n" - ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" - "ins v1.d[1], temploadreg1\n" - ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "ins v15.d[1], temploadreg3\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - "ldr d2, [a_ptr2, #-0x10]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" - "ldr temploadreg2, [a_ptr2, #-0x8]\n" - ".inst 0x6fa5e1f7 // udot v23.4s, 
v15.16b, v5.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - "ins v8.d[1], temploadreg0\n" - "ins v2.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - "ins v9.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - "ins v10.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - "ins v11.d[1], temploadreg3\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "ins v14.d[1], temploadreg2\n" - "b.ne 3b\n" - "2:\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "ins v15.d[1], temploadreg3\n" - "cbz %[regs], 4f\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr d4, [%[a_ptr0]]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "ldr d5, [a_ptr1]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - 
".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "ldr d6, [a_ptr2]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - "ldr temploadreg2, [a_ptr2, #0x8]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "ins v4.d[1], temploadreg0\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "ins v5.d[1], temploadreg1\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - "ins v6.d[1], temploadreg2\n" - ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x6fa0e9d2 
// udot v18.4s, v14.16b, v0.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6f86e95a // udot v26.4s, 
v10.16b, v6.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" - "add a_ptr1, a_ptr1, #0x10\n" - ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" - ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" - ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" - ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" - ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 
0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr s1, [a_ptr1]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr s2, [a_ptr2]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "b.ne 7b\n" - "6:\n" - "cbz %[odds], 8f\n" - "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[0], [a_ptr1], #1\n" - "ld1 {v2.b}[0], [a_ptr2], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[1], [a_ptr1], #1\n" - "ld1 {v2.b}[1], [a_ptr2], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[2], [%[a_ptr0]]\n" - "ld1 {v1.b}[2], [a_ptr1]\n" - "ld1 {v2.b}[2], [a_ptr2]\n" - "9:\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "8:\n" - "str 
q16, [%[c_ptr0]]\n" - "str q17, [%[c_ptr0], #0x10]\n" - "str q18, [%[c_ptr0], #0x20]\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - "str q24, [c_ptr2]\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq temploadreg0\n" - ".unreq temploadreg1\n" - ".unreq temploadreg2\n" - ".unreq temploadreg3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "temploadreg0 .req X6\n" - "temploadreg1 .req X7\n" - "temploadreg2 .req X8\n" - "temploadreg3 .req X9\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "movi v16.4s, #0\n" - "ldr q0, [%[a_ptr0]]\n" - "movi v17.4s, #0\n" - "ldr q1, [a_ptr1]\n" - "movi v18.4s, #0\n" - "ldr q2, [a_ptr2]\n" - "movi v19.4s, #0\n" - "ldr q3, [a_ptr3]\n" - "movi v20.4s, #0\n" - "ldr q8, [%[b_ptr0]]\n" - "movi v21.4s, #0\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "movi v22.4s, #0\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "movi v23.4s, #0\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "movi v24.4s, #0\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "movi v25.4s, #0\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "movi v26.4s, #0\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - "movi v27.4s, #0\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "movi v28.4s, #0\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "movi v29.4s, #0\n" - "ins v14.d[1], temploadreg2\n" - "movi v30.4s, #0\n" - "add a_ptr1, a_ptr1, #0x10\n" - "movi v31.4s, #0\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add a_ptr3, a_ptr3, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q28, [c_ptr3]\n" - "ldr q29, [c_ptr3, #0x10]\n" - "ldr q30, [c_ptr3, #0x20]\n" - "ldr q31, [c_ptr3, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q3, [a_ptr3]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - 
"ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "ins v14.d[1], temploadreg2\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "ldr d4, [%[a_ptr0]]\n" - ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr d5, [a_ptr1]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - "ldr d6, [a_ptr2]\n" - ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" - "ldr temploadreg2, [a_ptr2, #0x8]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr d7, [a_ptr3]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - "ldr temploadreg3, [a_ptr3, #0x8]\n" - ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - "ins v4.d[1], temploadreg0\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" - "ins v5.d[1], temploadreg1\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" - "ins v6.d[1], temploadreg2\n" - ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - "ins v7.d[1], temploadreg3\n" - ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6f81e935 
// udot v21.4s, v9.16b, v1.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" - "ldr d0, [%[a_ptr0], #-0x10]\n" - ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" - ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" - "ins v0.d[1], temploadreg0\n" - ".inst 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" - "ldr d1, [a_ptr1, #-0x10]\n" - ".inst 0x6f87e15e // udot v30.4s, 
v10.16b, v7.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #-0x8]\n" - ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" - "ins v1.d[1], temploadreg1\n" - ".inst 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" - "ldr d2, [a_ptr2, #-0x10]\n" - ".inst 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n" - "ldr temploadreg2, [a_ptr2, #-0x8]\n" - ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" - "ins v2.d[1], temploadreg2\n" - ".inst 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" - "ldr d3, [a_ptr3, #-0x10]\n" - ".inst 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n" - "ldr temploadreg3, [a_ptr3, #-0x8]\n" - ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" - "ins v3.d[1], temploadreg3\n" - ".inst 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6fa5e994 // udot v20.4s, v12.16b, 
v5.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n" - "ins v12.d[1], temploadreg0\n" - "ins v13.d[1], temploadreg1\n" - "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" - "ins v14.d[1], temploadreg2\n" - "b.ne 3b\n" - "2:\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "prfm PSTL1KEEP, [c_ptr3]\n" - "ins v15.d[1], temploadreg3\n" - "cbz %[regs], 4f\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr d4, [%[a_ptr0]]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg0, [%[a_ptr0], #0x8]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "ldr d5, [a_ptr1]\n" - ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" - "ldr temploadreg1, [a_ptr1, #0x8]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr d6, [a_ptr2]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "ldr temploadreg2, [a_ptr2, #0x8]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - "ldr d7, [a_ptr3]\n" - ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" - "ldr temploadreg3, [a_ptr3, #0x8]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "ins v4.d[1], temploadreg0\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - "ins v5.d[1], temploadreg1\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" - "ins v6.d[1], temploadreg2\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" - "ins v7.d[1], temploadreg3\n" - ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], 
#0x38]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - "ldr d8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" - ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n" - "ldr d9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" - ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" - ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n" - "ldr d10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" - ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n" - "ldr d11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" - ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n" - "ldr d12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" - ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n" - "ldr d13, [%[b_ptr0], #-0x30]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - "add 
%[a_ptr0], %[a_ptr0], #0x10\n" - ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n" - "ldr d14, [%[b_ptr0], #-0x20]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - "add a_ptr1, a_ptr1, #0x10\n" - ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n" - "ldr d15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" - "add a_ptr2, a_ptr2, #0x10\n" - ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" - "add a_ptr3, a_ptr3, #0x10\n" - ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" - ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" - ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" - ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" - ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" - "ins v12.d[1], temploadreg0\n" - ".inst 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" - ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" - ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" - ".inst 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n" - ".inst 
0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" - ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" - ".inst 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n" - ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" - ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" - ".inst 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n" - ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" - ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" - ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" - ".inst 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n" - ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" - ".inst 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n" - ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" - ".inst 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n" - ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" - ".inst 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n" - ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" - ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" - ".inst 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" - "ldr d8, [%[b_ptr0]]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - "ins v8.d[1], temploadreg0\n" - ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" - "ldr d9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - "ins v9.d[1], temploadreg1\n" - ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" - "ldr d10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "ins v10.d[1], temploadreg2\n" - ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" - "ldr d11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" - "ins v11.d[1], temploadreg3\n" - ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n" - "ldr d12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" - "ins 
v12.d[1], temploadreg0\n" - ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n" - "ldr d13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" - "ins v13.d[1], temploadreg1\n" - ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n" - "ldr d14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" - "ins v14.d[1], temploadreg2\n" - ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n" - "ldr d15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - "ins v15.d[1], temploadreg3\n" - ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" - ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" - ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" - ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" - ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" - ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" - ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" - ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" - ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr s1, [a_ptr1]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr s2, [a_ptr2]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "ldr s3, [a_ptr3]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "add a_ptr3, a_ptr3, #0x4\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" - ".inst 0x6f83e13d // udot v29.4s, 
v9.16b, v3.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" - "b.ne 7b\n" - "6:\n" - "cbz %[odds], 8f\n" - "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[0], [a_ptr1], #1\n" - "ld1 {v2.b}[0], [a_ptr2], #1\n" - "ld1 {v3.b}[0], [a_ptr3], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[1], [a_ptr1], #1\n" - "ld1 {v2.b}[1], [a_ptr2], #1\n" - "ld1 {v3.b}[1], [a_ptr3], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[2], [%[a_ptr0]]\n" - "ld1 {v1.b}[2], [a_ptr1]\n" - "ld1 {v2.b}[2], [a_ptr2]\n" - "ld1 {v3.b}[2], [a_ptr3]\n" - "9:\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" - "8:\n" - "str q16, [%[c_ptr0]]\n" - "str q17, [%[c_ptr0], #0x10]\n" - "str q18, [%[c_ptr0], #0x20]\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - "str q24, [c_ptr2]\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - "str q28, [c_ptr3]\n" - "str q29, [c_ptr3, #0x10]\n" - "str q30, [c_ptr3, #0x20]\n" - "str q31, [c_ptr3, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq temploadreg0\n" - ".unreq temploadreg1\n" - ".unreq temploadreg2\n" - ".unreq temploadreg3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory" - ); - break; - } - if (use_result_buffer) { - for(int cy=0; cy - -#include "arm_gemm.hpp" -#include -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void a64_hybrid_u8u32_dot_16x4(const uint8_t *A, int lda, 
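Both deleted kernels are built around the AArch64 dot-product extension: each ".inst 0x6f8?e??? // udot vN.4s, vM.16b, vK.4b[i]" line is a UDOT by-element instruction, emitted as a raw ".inst" word (apparently so assemblers without dot-product support can still build the file). A minimal scalar sketch of what one such instruction computes; the helper name is ours, not from the source:

    #include <cstdint>

    // Scalar model of "udot v16.4s, v8.16b, v0.4b[0]":
    // acc = v16 (4 x uint32), b = v8 (16 x uint8), a = v0, lane = 0.
    static void udot_by_element(uint32_t acc[4], const uint8_t b[16],
                                const uint8_t a[16], int lane)
    {
        for (int l = 0; l < 4; l++) {      // one 32-bit accumulator lane
            uint32_t sum = 0;
            for (int k = 0; k < 4; k++) {  // dot product of four u8 pairs
                sum += uint32_t(b[4 * l + k]) * uint32_t(a[4 * lane + k]);
            }
            acc[l] += sum;                 // widening multiply-accumulate
        }
    }

One instruction therefore folds 16 byte multiplies into four 32-bit lanes, which is why the 16-column tile needs only four udots per A element.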
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include <cstdint>
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_u8u32_dot_16x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool accumulate) {
-    const int K_stride = ((K + 3) / 4) * 4;
-    const long loops_count = ((K + 16) / 32) - 1;
-    K -= loops_count * 32;
-    const long regs_count = (K / 16) - 1;
-    K -= (regs_count + 1) * 16;
-    const long blocks_count = K / 4;
-    const long odds_count = K - (blocks_count * 4);
-
-    int rows_to_compute;
-
-    for (int y=0; y<M; y+=rows_to_compute) {
    [... the per-block A/C pointer setup was swallowed by the stripped "<" ...]
-        rows_to_compute = M-y;
-        if (rows_to_compute > 4) {
-            if (rows_to_compute % 4) {
-                rows_to_compute = 4 - 1;
-            } else {
-                rows_to_compute = 4;
-            }
-        }
-
-        for (int x0=0; x0<N; x0+=16) {
    [... the width/result-buffer setup, the "switch (rows_to_compute)" dispatch and the entire single-row "case 1:" assembly block were likewise swallowed; the hunk resumes at the tail of case 1's operand lists ...]
-                    : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
-                );
-                break;
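The loops/regs/blocks/odds operands passed into each assembly block come from the prologue above: every main-loop iteration (label "3:") consumes 32 bytes of K per row, the tail after "2:" consumes (regs_count + 1) further 16-byte blocks, blocks_count counts the remaining whole 4-byte columns, and odds_count the final 1-3 bytes. A small self-contained check of that arithmetic; the helper is ours, mirroring the deleted prologue:

    #include <cassert>

    struct KSplit { long loops, regs, blocks, odds; };

    static KSplit split_k(int K) {
        const long loops  = ((K + 16) / 32) - 1; // 32-byte main-loop iterations
        K -= loops * 32;
        const long regs   = (K / 16) - 1;        // tail runs regs+1 16-byte blocks
        K -= (regs + 1) * 16;
        const long blocks = K / 4;               // whole 4-byte columns left
        return { loops, regs, blocks, K - blocks * 4 };
    }

    int main() {
        KSplit s = split_k(75);
        // 1*32 + (1+1)*16 + 2*4 + 3 == 75
        assert(s.loops == 1 && s.regs == 1 && s.blocks == 2 && s.odds == 3);
        return 0;
    }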
-            case 2:
-                __asm __volatile (
-                    "a_ptr1 .req X0\n"
-                    "c_ptr1 .req X1\n"
-                    "add a_ptr1, %[a_ptr0], %[lda]\n"
-                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                    "cbnz %[accumulate], 1f\n"
-                    "movi v16.4s, #0\n"
-                    "ldr q0, [%[a_ptr0]]\n"
    [... zero v17-v23 with "movi" while preloading the first A block per row and B blocks q8-q14, advance the pointers, then "cbz %[loops], 2f" / "b 3f"; label "1:" instead reloads the existing C tile into v16-v23 with "ldr q16, [%[c_ptr0]]" etc. before the same preloads ...]
-                    "cbz %[loops], 2f\n"
-                    "3:\n"
-                    ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                    "ldr q15, [%[b_ptr0], #-0x10]\n"
-                    ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                    "ldr q4, [%[a_ptr0]]\n"
    [... the two-row main loop: "udot" accumulates into v16-v23 against rolling "ldr q..." B reloads, "prfm PLDL1KEEP" prefetches, pointer advances, "subs %[loops], %[loops], #0x1" and "b.ne 3b" ...]
-                    "2:\n"
-                    "ldr q15, [%[b_ptr0], #-0x10]\n"
-                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                    "prfm PSTL1KEEP, [c_ptr1]\n"
-                    "cbz %[regs], 4f\n"
    [... the %[regs] tail runs one further 16-byte block per row then "b 5f"; label "4:" finishes the already-loaded block; labels "5:"/"7:" loop over %[blocks] 4-byte columns; labels "6:"/"9:" handle the 1-3 odd bytes via "ld1 {v....b}[i]" loads and a final set of udots ...]
-                    "8:\n"
-                    "str q16, [%[c_ptr0]]\n"
-                    "str q17, [%[c_ptr0], #0x10]\n"
-                    "str q18, [%[c_ptr0], #0x20]\n"
-                    "str q19, [%[c_ptr0], #0x30]\n"
-                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                    "str q20, [c_ptr1]\n"
-                    "str q21, [c_ptr1, #0x10]\n"
-                    "str q22, [c_ptr1, #0x20]\n"
-                    "str q23, [c_ptr1, #0x30]\n"
-                    ".unreq a_ptr1\n"
-                    ".unreq c_ptr1\n"
-                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                    : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
-                );
-                break;
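Each size case opens with the same "cbnz %[accumulate], 1f" dispatch seen at the top of case 2: the fall-through path zeroes the accumulator tile with "movi", while label "1:" reloads the partial results already in C so the kernel adds onto them. A scalar sketch of that choice, with names of ours:

    #include <cstdint>
    #include <cstring>

    // One row of the tile is 16 uint32 accumulators (v16-v19 for row 0, etc.).
    static void init_tile(uint32_t acc[][16], const uint32_t *C, int ldc,
                          int rows, bool accumulate)
    {
        for (int r = 0; r < rows; r++) {
            if (accumulate)   // "1:" path: "ldr q16, [%[c_ptr0]]" ...
                std::memcpy(acc[r], C + r * ldc, 16 * sizeof(uint32_t));
            else              // fall-through path: "movi v16.4s, #0" ...
                std::memset(acc[r], 0, 16 * sizeof(uint32_t));
        }
    }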
- "movi v27.4s, #0\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q4, [%[a_ptr0]]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "ldr q5, [a_ptr1]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr q6, [a_ptr2]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - 
".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" - ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" - ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" - ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" - ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" - ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" - ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" - ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" - ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" - ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" - ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" - ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" - ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" - ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" - ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" - ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" - ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" - 
".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" - ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" - ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" - "b.ne 3b\n" - "2:\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "cbz %[regs], 4f\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q4, [%[a_ptr0]]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q5, [a_ptr1]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "ldr q6, [a_ptr2]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x10\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "add a_ptr2, a_ptr2, #0x10\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" - "ldr q10, [%[b_ptr0], 
#-0x60]\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" - ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" - ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" - ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" - ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" - ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" - ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" - ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" - ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" - ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" - ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" - ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" - ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" - ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" - ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" - ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" - ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" - ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" - ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" - ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" - ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" - ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" - ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" - ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" - 
".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" - ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" - ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" - ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, 
v2.4b[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr s1, [a_ptr1]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr s2, [a_ptr2]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "b.ne 7b\n" - "6:\n" - "cbz %[odds], 8f\n" - "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[0], [a_ptr1], #1\n" - "ld1 {v2.b}[0], [a_ptr2], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[1], [a_ptr1], #1\n" - "ld1 {v2.b}[1], [a_ptr2], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[2], [%[a_ptr0]]\n" - "ld1 {v1.b}[2], [a_ptr1]\n" - "ld1 {v2.b}[2], [a_ptr2]\n" - "9:\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "8:\n" - "str q16, [%[c_ptr0]]\n" - "str q17, [%[c_ptr0], #0x10]\n" - "str q18, [%[c_ptr0], #0x20]\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - "str q24, [c_ptr2]\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req 
X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "cbnz %[accumulate], 1f\n" - "movi v16.4s, #0\n" - "ldr q0, [%[a_ptr0]]\n" - "movi v17.4s, #0\n" - "ldr q1, [a_ptr1]\n" - "movi v18.4s, #0\n" - "ldr q2, [a_ptr2]\n" - "movi v19.4s, #0\n" - "ldr q3, [a_ptr3]\n" - "movi v20.4s, #0\n" - "ldr q8, [%[b_ptr0]]\n" - "movi v21.4s, #0\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "movi v22.4s, #0\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "movi v23.4s, #0\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "movi v24.4s, #0\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "movi v25.4s, #0\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "movi v26.4s, #0\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "movi v27.4s, #0\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "movi v28.4s, #0\n" - "add a_ptr1, a_ptr1, #0x10\n" - "movi v29.4s, #0\n" - "add a_ptr2, a_ptr2, #0x10\n" - "movi v30.4s, #0\n" - "add a_ptr3, a_ptr3, #0x10\n" - "movi v31.4s, #0\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ldr q16, [%[c_ptr0]]\n" - "ldr q17, [%[c_ptr0], #0x10]\n" - "ldr q18, [%[c_ptr0], #0x20]\n" - "ldr q19, [%[c_ptr0], #0x30]\n" - "ldr q20, [c_ptr1]\n" - "ldr q21, [c_ptr1, #0x10]\n" - "ldr q22, [c_ptr1, #0x20]\n" - "ldr q23, [c_ptr1, #0x30]\n" - "ldr q24, [c_ptr2]\n" - "ldr q25, [c_ptr2, #0x10]\n" - "ldr q26, [c_ptr2, #0x20]\n" - "ldr q27, [c_ptr2, #0x30]\n" - "ldr q28, [c_ptr3]\n" - "ldr q29, [c_ptr3, #0x10]\n" - "ldr q30, [c_ptr3, #0x20]\n" - "ldr q31, [c_ptr3, #0x30]\n" - "ldr q0, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ldr q1, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ldr q2, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ldr q3, [a_ptr3]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q4, [%[a_ptr0]]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "ldr q5, [a_ptr1]\n" - ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" - "ldr q6, [a_ptr2]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr q7, [a_ptr3]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" - ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - 
".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" - ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" - ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" - ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" - ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" - ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" - ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" - ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" - ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" - ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" - ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" - ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" - ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - "ldr q0, [%[a_ptr0], #-0x10]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - "ldr q1, [a_ptr1, #-0x10]\n" - ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" - "ldr q2, [a_ptr2, #-0x10]\n" - ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" - "ldr q3, [a_ptr3, #-0x10]\n" - ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" - ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" - 
".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" - ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" - ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" - ".inst 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" - ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" - ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" - ".inst 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" - ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" - ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" - ".inst 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" - ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" - ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" - ".inst 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" - ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" - ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" - ".inst 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" - ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" - ".inst 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" - ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" - ".inst 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" - ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" - ".inst 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" - ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" - ".inst 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" - ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" - ".inst 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" - ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" - ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" - ".inst 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" - ".inst 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" - ".inst 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x6fa4e9d2 // udot v18.4s, 
v14.16b, v4.4b[3]\n" - ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" - ".inst 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" - ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" - ".inst 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n" - "b.ne 3b\n" - "2:\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - "prfm PSTL1KEEP, [%[c_ptr0]]\n" - "prfm PSTL1KEEP, [c_ptr1]\n" - "prfm PSTL1KEEP, [c_ptr2]\n" - "prfm PSTL1KEEP, [c_ptr3]\n" - "cbz %[regs], 4f\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr q4, [%[a_ptr0]]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "ldr q5, [a_ptr1]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "ldr q6, [a_ptr2]\n" - ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" - "ldr q7, [a_ptr3]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - "add a_ptr1, a_ptr1, #0x10\n" - ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "add a_ptr2, a_ptr2, #0x10\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - "add a_ptr3, a_ptr3, #0x10\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" - ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" - ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" - ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" - ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x100\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" - ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n" - "ldr q8, [%[b_ptr0], #-0x80]\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" - ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n" - "ldr q9, [%[b_ptr0], #-0x70]\n" - ".inst 0x6f80e952 // udot 
v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" - ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n" - "ldr q10, [%[b_ptr0], #-0x60]\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" - ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n" - "ldr q11, [%[b_ptr0], #-0x50]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" - ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n" - "ldr q12, [%[b_ptr0], #-0x40]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" - ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n" - "ldr q13, [%[b_ptr0], #-0x30]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" - ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n" - "ldr q14, [%[b_ptr0], #-0x20]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" - ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n" - "ldr q15, [%[b_ptr0], #-0x10]\n" - ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" - ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" - ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" - ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" - ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" - ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" - ".inst 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" - ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" - ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" - ".inst 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" - ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" - ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" - ".inst 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" - ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" - ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" - ".inst 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" - ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" - ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" - ".inst 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" - ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" - ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" - ".inst 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" - ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" - ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" - ".inst 
0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" - ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" - ".inst 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n" - ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" - ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" - ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" - ".inst 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n" - ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" - ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" - ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" - ".inst 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n" - ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" - ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" - ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" - ".inst 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n" - ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" - ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" - ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" - ".inst 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n" - ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" - ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" - ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" - ".inst 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n" - ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" - ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" - ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" - ".inst 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n" - ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" - ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" - ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" - ".inst 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" - "ldr q8, [%[b_ptr0]]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" - ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" - ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" - ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n" - "ldr q12, [%[b_ptr0], #0x40]\n" - ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" - ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" - ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" - ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n" - "ldr q13, [%[b_ptr0], #0x50]\n" - ".inst 0x6fa0e1d2 // udot v18.4s, 
v14.16b, v0.4b[1]\n" - ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" - ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" - ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n" - "ldr q14, [%[b_ptr0], #0x60]\n" - ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" - ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" - ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" - ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n" - "ldr q15, [%[b_ptr0], #0x70]\n" - ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" - ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" - ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n" - ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" - ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" - ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" - ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n" - ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" - ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" - ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" - ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n" - ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" - ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" - ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" - ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n" - ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" - ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" - ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" - ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n" - ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" - ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" - ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" - ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n" - ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" - ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" - ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" - ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n" - ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" - ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" - ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" - ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n" - "5:\n" - "cbz %[blocks], 6f\n" - "7:\n" - "ldr q8, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr s0, [%[a_ptr0]]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "add %[a_ptr0], %[a_ptr0], #0x4\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - "ldr s1, [a_ptr1]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - "add a_ptr1, a_ptr1, #0x4\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - "ldr s2, [a_ptr2]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - "add a_ptr2, a_ptr2, #0x4\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - "ldr s3, [a_ptr3]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - "add a_ptr3, a_ptr3, #0x4\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" - ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 
0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" - "b.ne 7b\n" - "6:\n" - "cbz %[odds], 8f\n" - "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[0], [a_ptr1], #1\n" - "ld1 {v2.b}[0], [a_ptr2], #1\n" - "ld1 {v3.b}[0], [a_ptr3], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" - "ld1 {v1.b}[1], [a_ptr1], #1\n" - "ld1 {v2.b}[1], [a_ptr2], #1\n" - "ld1 {v3.b}[1], [a_ptr3], #1\n" - "subs %[odds], %[odds], #0x1\n" - "b.eq 9f\n" - "ld1 {v0.b}[2], [%[a_ptr0]]\n" - "ld1 {v1.b}[2], [a_ptr1]\n" - "ld1 {v2.b}[2], [a_ptr2]\n" - "ld1 {v3.b}[2], [a_ptr3]\n" - "9:\n" - "ldr q8, [%[b_ptr0]]\n" - "ldr q9, [%[b_ptr0], #0x10]\n" - "ldr q10, [%[b_ptr0], #0x20]\n" - "ldr q11, [%[b_ptr0], #0x30]\n" - ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" - ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" - ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" - ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" - ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" - ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" - ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" - ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" - ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" - ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" - ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" - ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" - ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" - ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" - ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" - ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" - "8:\n" - "str q16, [%[c_ptr0]]\n" - "str q17, [%[c_ptr0], #0x10]\n" - "str q18, [%[c_ptr0], #0x20]\n" - "str q19, [%[c_ptr0], #0x30]\n" - "add %[c_ptr0], %[c_ptr0], #0x40\n" - "str q20, [c_ptr1]\n" - "str q21, [c_ptr1, #0x10]\n" - "str q22, [c_ptr1, #0x20]\n" - "str q23, [c_ptr1, #0x30]\n" - "str q24, [c_ptr2]\n" - "str q25, [c_ptr2, #0x10]\n" - "str q26, [c_ptr2, #0x20]\n" - "str q27, [c_ptr2, #0x30]\n" - "str q28, [c_ptr3]\n" - "str q29, [c_ptr3, #0x10]\n" - "str q30, [c_ptr3, #0x20]\n" - "str q31, [c_ptr3, #0x30]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - } - if (use_result_buffer) { - for(int cy=0; cy, \ + size_t, size_t, \ + const uint8_t *, \ + IndirectOutputArg<uint32_t>, \ + const uint32_t *, Activation, bool + +namespace arm_gemm +{ + +// Actual kernel implementations +void a64_hybrid_u8u32_dot_6x16( ARGLIST ); + +class cls_a64_hybrid_u8u32_dot_6x16 +{ +public: + typedef uint8_t operand_type; + typedef uint32_t result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 6; + } + + static unsigned int out_width() + { + return 16; + } + + static constexpr unsigned int 
k_unroll() + { + return 4; + } + + static constexpr bool supports_accumulate() + { + return true; + } + + StdTransformsFixed<operand_type, result_type, 6, 16, 4> transforms = {}; + + // Default to the generic kernel + kern_type kernel=a64_hybrid_u8u32_dot_6x16; + + cls_a64_hybrid_u8u32_dot_6x16(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp new file mode 100644 index 0000000000..3c8654147a --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp @@ -0,0 +1,3335 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#ifdef __aarch64__ + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> + +namespace arm_gemm { + +void a64_hybrid_u8u32_dot_6x16 ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg, + size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint32_t> output_arg, + const uint32_t *, Activation, bool accumulate +) +{ + struct KernelArgs { + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const uint8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + __asm__ __volatile__( + + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 176f\n" + "cmp %x[M], #0x4\n" + "bgt 141f\n" + "beq 106f\n" + "cmp %x[M], #0x2\n" + "bgt 71f\n" + "beq 36f\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x13, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "tbz %x[flags], #0, 13f\n" + "cmp x15, #0x10\n" + "bge 12f\n" + "tbz x15, #3, 7f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "tbz x15, #2, 5f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "tbz x15, #1, 4f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "tbz x15, #0, 11f\n" + "ld1 { v11.s }[2], [x13]\n" + "b 11f\n" + "4:" // Height 1: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x15, #0, 11f\n" + "ldr s11, [x13, #0x0]\n" + "b 11f\n" + "5:" // Height 1: Partial accumulate: partial_2_8 + "tbz x15, #1, 6f\n" + "ldr d10, [x13], #0x8\n" + "mov x19, #0x28\n" + "tbz x15, #0, 11f\n" + "ld1 { v10.s }[2], [x13]\n" + "b 11f\n" + "6:" // Height 1: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x15, #0, 11f\n" + "ldr s10, [x13, #0x0]\n" + "b 11f\n" + "7:" // Height 1: Partial accumulate: partial_4_0 + "tbz x15, #2, 9f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "tbz x15, #1, 8f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "tbz x15, #0, 11f\n" + "ld1 { v9.s }[2], [x13]\n" + "b 11f\n" + "8:" // Height 1: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x15, #0, 11f\n" + "ldr s9, [x13, #0x0]\n" + "b 11f\n" + "9:" // Height 1: Partial accumulate: partial_2_0 + "tbz x15, #1, 10f\n" + "ldr d8, [x13], #0x8\n" + "mov x19, #0x8\n" + "tbz x15, #0, 11f\n" + "ld1 { v8.s }[2], [x13]\n" + "b 11f\n" + "10:" // Height 1: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "11:" // Height 1: Partial accumulate: Done + "sub x13, x13, x19\n" + "b 14f\n" + "12:" // Height 1: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + 
"ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "b 14f\n" + "13:" // Height 1: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "14:" // Height 1: setup done + "mov x12, #0x0\n" + "15:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 16f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "cbnz x12, 17f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "b 17f\n" + "16:" // Height 1: setup direct input + "mov x10, %x[input_ptr]\n" + "17:" // Height 1: input setup done + "cmp x11, #0x10\n" + "blt 20f\n" + "cmp x11, #0x20\n" + "blt 19f\n" + "18:" // Height 1: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + "add x10, x10, #0x10\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + "sub x11, x11, #0x10\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + "cmp x11, #0x20\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + "bge 18b\n" + "19:" // Height 1: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + "add x10, x10, #0x10\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x6f80e8eb 
// udot v11.4s, v7.16b, v0.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + "20:" // Height 1: Multiply loop: Main loop skip + "cbz x11, 25f\n" + "cmp x11, #0x4\n" + "blt 22f\n" + "21:" // Height 1: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "sub x11, x11, #0x4\n" + "add x14, x14, #0x40\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + "cmp x11, #0x4\n" + "bge 21b\n" + "cbz x11, 25f\n" + "22:" // Height 1: Multiply loop: Skip odd blocks + "tbz x11, #1, 23f\n" + "ldr h0, [x10], #0x2\n" + "tbz x11, #0, 24f\n" + "ld1 { v0.b }[2], [x10]\n" + "b 24f\n" + "23:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "24:" // Height 1: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + "25:" // Height 1: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 15b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "cmp x15, #0x10\n" + "bge 34f\n" + "tbz x15, #3, 29f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "tbz x15, #2, 27f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "tbz x15, #1, 26f\n" + "str d11, [x13], #0x8\n" + "tbz x15, #0, 33f\n" + "st1 { v11.s }[2], [x13]\n" + "b 33f\n" + "26:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x15, #0, 33f\n" + "str s11, [x13, #0x0]\n" + "b 33f\n" + "27:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x15, #1, 28f\n" + "str d10, [x13], #0x8\n" + "tbz x15, #0, 33f\n" + "st1 { v10.s }[2], [x13]\n" + "b 33f\n" + "28:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x15, #0, 33f\n" + "str s10, [x13, #0x0]\n" + "b 33f\n" + "29:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x15, #2, 31f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "tbz x15, #1, 30f\n" + "str d9, [x13], #0x8\n" + "tbz x15, #0, 33f\n" + "st1 { v9.s }[2], [x13]\n" + "b 33f\n" + "30:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x15, #0, 33f\n" + "str s9, [x13, #0x0]\n" + "b 33f\n" + "31:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x15, #1, 32f\n" + "str d8, [x13], #0x8\n" + "tbz x15, #0, 33f\n" + "st1 { v8.s }[2], [x13]\n" + "b 33f\n" + "32:" // Height 1: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "33:" // Height 1: Partial direct writeback: Done + "b 35f\n" + "34:" // Height 1: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "add x13, x13, #0x40\n" + "35:" // Height 1: Writeback done + "subs x15, x15, #0x10\n" + "bgt 3b\n" + "b 212f\n" + "36:" // Height 2 + "ldr x15, [%x[args_ptr], 
%[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 37f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19, LSL #2\n" + "b 38f\n" + "37:" // Height 2: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "38:" // Height 2: Column loop + "tbz %x[flags], #0, 48f\n" + "cmp x15, #0x10\n" + "bge 47f\n" + "tbz x15, #3, 42f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "tbz x15, #2, 40f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "tbz x15, #1, 39f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "tbz x15, #0, 46f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "b 46f\n" + "39:" // Height 2: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x15, #0, 46f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "b 46f\n" + "40:" // Height 2: Partial accumulate: partial_2_8 + "tbz x15, #1, 41f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "mov x19, #0x28\n" + "tbz x15, #0, 46f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "b 46f\n" + "41:" // Height 2: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x15, #0, 46f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "b 46f\n" + "42:" // Height 2: Partial accumulate: partial_4_0 + "tbz x15, #2, 44f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "tbz x15, #1, 43f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "tbz x15, #0, 46f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "b 46f\n" + "43:" // Height 2: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x15, #0, 46f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "b 46f\n" + "44:" // Height 2: Partial accumulate: partial_2_0 + "tbz x15, #1, 45f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "mov x19, #0x8\n" + "tbz x15, #0, 46f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "b 46f\n" + "45:" // Height 2: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "46:" // Height 2: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "b 49f\n" + "47:" // Height 2: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "b 49f\n" + "48:" // Height 2: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "49:" // Height 2: setup done + "mov x12, #0x0\n" + "50:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 51f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x12, 52f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "b 52f\n" + "51:" // Height 2: setup direct input + "mov x10, 
%x[input_ptr]\n" + "add x28, x10, x19\n" + "52:" // Height 2: input setup done + "cmp x11, #0x10\n" + "blt 55f\n" + "cmp x11, #0x20\n" + "blt 54f\n" + "53:" // Height 2: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + "add x10, x10, #0x10\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "sub x11, x11, #0x10\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + "cmp x11, #0x20\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + "bge 53b\n" + "54:" // Height 2: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + "add x10, x10, #0x10\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 
0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + "55:" // Height 2: Multiply loop: Main loop skip + "cbz x11, 60f\n" + "cmp x11, #0x4\n" + "blt 57f\n" + "56:" // Height 2: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + "sub x11, x11, #0x4\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "cmp x11, #0x4\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + "bge 56b\n" + "cbz x11, 60f\n" + "57:" // Height 2: Multiply loop: Skip odd blocks + "tbz x11, #1, 58f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "tbz x11, #0, 59f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "b 59f\n" + "58:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "59:" // Height 2: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + "60:" // Height 2: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], 
%[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 50b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "cmp x15, #0x10\n" + "bge 69f\n" + "tbz x15, #3, 64f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "tbz x15, #2, 62f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "tbz x15, #1, 61f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "tbz x15, #0, 68f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "b 68f\n" + "61:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x15, #0, 68f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "b 68f\n" + "62:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x15, #1, 63f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "tbz x15, #0, 68f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "b 68f\n" + "63:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x15, #0, 68f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "b 68f\n" + "64:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x15, #2, 66f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "tbz x15, #1, 65f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "tbz x15, #0, 68f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "b 68f\n" + "65:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x15, #0, 68f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "b 68f\n" + "66:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x15, #1, 67f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "tbz x15, #0, 68f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "b 68f\n" + "67:" // Height 2: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "68:" // Height 2: Partial direct writeback: Done + "b 70f\n" + "69:" // Height 2: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "70:" // Height 2: Writeback done + "subs x15, x15, #0x10\n" + "bgt 38b\n" + "b 212f\n" + "71:" // Height 3 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 72f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "b 73f\n" + "72:" // Height 3: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "73:" // Height 3: Column loop + "tbz %x[flags], #0, 83f\n" + "cmp x15, #0x10\n" + "bge 82f\n" + "tbz x15, #3, 77f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "tbz x15, #2, 75f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "tbz x15, #1, 74f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "tbz x15, #0, 81f\n" + "ld1 { 
v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "b 81f\n" + "74:" // Height 3: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x15, #0, 81f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "b 81f\n" + "75:" // Height 3: Partial accumulate: partial_2_8 + "tbz x15, #1, 76f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "mov x19, #0x28\n" + "tbz x15, #0, 81f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "b 81f\n" + "76:" // Height 3: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x15, #0, 81f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "b 81f\n" + "77:" // Height 3: Partial accumulate: partial_4_0 + "tbz x15, #2, 79f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "tbz x15, #1, 78f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "tbz x15, #0, 81f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "b 81f\n" + "78:" // Height 3: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x15, #0, 81f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "b 81f\n" + "79:" // Height 3: Partial accumulate: partial_2_0 + "tbz x15, #1, 80f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "mov x19, #0x8\n" + "tbz x15, #0, 81f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "b 81f\n" + "80:" // Height 3: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "81:" // Height 3: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "b 84f\n" + "82:" // Height 3: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "b 84f\n" + "83:" // Height 3: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "84:" // Height 3: setup done + "mov x12, #0x0\n" + "85:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 86f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x12, 87f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "b 87f\n" + "86:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "87:" // Height 3: input setup done + "cmp x11, #0x10\n" + "blt 90f\n" + "cmp x11, #0x20\n" + "blt 89f\n" + "88:" // Height 3: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + 
"ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "add x28, x28, #0x10\n" + "prfm pldl1keep, [x28, #0x80]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "sub x11, x11, #0x10\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + "cmp x11, #0x20\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + "bge 88b\n" + "89:" // Height 3: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, 
[x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "add x28, x28, #0x10\n" + "prfm pldl1keep, [x28, #0x80]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + "90:" // Height 3: Multiply loop: Main loop skip + "cbz x11, 95f\n" + "cmp x11, #0x4\n" + "blt 92f\n" + "91:" // Height 3: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, 
[x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "sub x11, x11, #0x4\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + "cmp x11, #0x4\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + "bge 91b\n" + "cbz x11, 95f\n" + "92:" // Height 3: Multiply loop: Skip odd blocks + "tbz x11, #1, 93f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "tbz x11, #0, 94f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x26]\n" + "b 94f\n" + "93:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x26, #0x0]\n" + "94:" // Height 3: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + "95:" // Height 3: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 85b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "cmp x15, #0x10\n" + "prfm pstl1keep, [x27, #0x0]\n" + "bge 104f\n" + "tbz x15, #3, 99f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "tbz x15, #2, 97f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "tbz x15, #1, 96f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "tbz x15, #0, 103f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "b 103f\n" + "96:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x15, #0, 103f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "b 103f\n" + "97:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x15, #1, 98f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "tbz x15, #0, 103f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "b 
103f\n" + "98:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x15, #0, 103f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "b 103f\n" + "99:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x15, #2, 101f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "tbz x15, #1, 100f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "tbz x15, #0, 103f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "b 103f\n" + "100:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x15, #0, 103f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "b 103f\n" + "101:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x15, #1, 102f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "tbz x15, #0, 103f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "b 103f\n" + "102:" // Height 3: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "103:" // Height 3: Partial direct writeback: Done + "b 105f\n" + "104:" // Height 3: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "105:" // Height 3: Writeback done + "subs x15, x15, #0x10\n" + "bgt 73b\n" + "b 212f\n" + "106:" // Height 4 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 107f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "b 108f\n" + "107:" // Height 4: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "108:" // Height 4: Column loop + "tbz %x[flags], #0, 118f\n" + "cmp x15, #0x10\n" + "bge 117f\n" + "tbz x15, #3, 112f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "ld1 { v21.4s }, [x25], #0x10\n" + "tbz x15, #2, 110f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "ld1 { v22.4s }, [x25], #0x10\n" + "tbz x15, #1, 109f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "tbz x15, #0, 116f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "ld1 { v23.s }[2], [x25]\n" + "b 116f\n" + "109:" // Height 4: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x15, #0, 116f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "ldr s23, [x25, #0x0]\n" + "b 116f\n" + 
"110:" // Height 4: Partial accumulate: partial_2_8 + "tbz x15, #1, 111f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "mov x19, #0x28\n" + "tbz x15, #0, 116f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x25]\n" + "b 116f\n" + "111:" // Height 4: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x15, #0, 116f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "b 116f\n" + "112:" // Height 4: Partial accumulate: partial_4_0 + "tbz x15, #2, 114f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "tbz x15, #1, 113f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "tbz x15, #0, 116f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "ld1 { v21.s }[2], [x25]\n" + "b 116f\n" + "113:" // Height 4: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x15, #0, 116f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "b 116f\n" + "114:" // Height 4: Partial accumulate: partial_2_0 + "tbz x15, #1, 115f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "mov x19, #0x8\n" + "tbz x15, #0, 116f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "ld1 { v20.s }[2], [x25]\n" + "b 116f\n" + "115:" // Height 4: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "ldr s20, [x25, #0x0]\n" + "116:" // Height 4: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "b 119f\n" + "117:" // Height 4: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "b 119f\n" + "118:" // Height 4: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "119:" // Height 4: setup done + "mov x12, #0x0\n" + "120:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 121f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x12, 122f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + 
"b 122f\n" + "121:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "122:" // Height 4: input setup done + "cmp x11, #0x10\n" + "blt 125f\n" + "cmp x11, #0x20\n" + "blt 124f\n" + "123:" // Height 4: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sub x11, x11, #0x10\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "cmp x11, #0x20\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, 
v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" + "bge 123b\n" + "124:" // Height 4: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "add x26, x26, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, 
v1.4b[1]\n" + ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + "add x14, x14, #0x100\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" + "125:" // Height 4: Multiply loop: Main loop skip + "cbz x11, 130f\n" + "cmp x11, #0x4\n" + "blt 127f\n" + "126:" // Height 4: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "sub x11, x11, #0x4\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "cmp x11, #0x4\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + "bge 126b\n" + "cbz x11, 130f\n" + "127:" // Height 4: Multiply loop: Skip odd blocks + 
"tbz x11, #1, 128f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "tbz x11, #0, 129f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x26]\n" + "ld1 { v3.b }[2], [x24]\n" + "b 129f\n" + "128:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x26, #0x0]\n" + "ldr b3, [x24, #0x0]\n" + "129:" // Height 4: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + "130:" // Height 4: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 120b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "cmp x15, #0x10\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "bge 139f\n" + "tbz x15, #3, 134f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "tbz x15, #2, 132f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "st1 { v22.4s }, [x25], #0x10\n" + "tbz x15, #1, 131f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "tbz x15, #0, 138f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "st1 { v23.s }[2], [x25]\n" + "b 138f\n" + "131:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x15, #0, 138f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "str s23, [x25, #0x0]\n" + "b 138f\n" + "132:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x15, #1, 133f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "tbz x15, #0, 138f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "st1 { v22.s }[2], [x25]\n" + "b 138f\n" + "133:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x15, #0, 138f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "str s22, [x25, #0x0]\n" + "b 138f\n" + "134:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x15, #2, 136f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { 
v16.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "tbz x15, #1, 135f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "tbz x15, #0, 138f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "st1 { v21.s }[2], [x25]\n" + "b 138f\n" + "135:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x15, #0, 138f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "str s21, [x25, #0x0]\n" + "b 138f\n" + "136:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x15, #1, 137f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "tbz x15, #0, 138f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "st1 { v20.s }[2], [x25]\n" + "b 138f\n" + "137:" // Height 4: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "str s20, [x25, #0x0]\n" + "138:" // Height 4: Partial direct writeback: Done + "b 140f\n" + "139:" // Height 4: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "140:" // Height 4: Writeback done + "subs x15, x15, #0x10\n" + "bgt 108b\n" + "b 212f\n" + "141:" // Height 5 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 142f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 143f\n" + "142:" // Height 5: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "143:" // Height 5: Column loop + "tbz %x[flags], #0, 153f\n" + "cmp x15, #0x10\n" + "bge 152f\n" + "tbz x15, #3, 147f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "ld1 { v21.4s }, [x25], #0x10\n" + "ld1 { v25.4s }, [x23], #0x10\n" + "tbz x15, #2, 145f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "ld1 { v22.4s }, [x25], #0x10\n" + "ld1 { v26.4s }, [x23], #0x10\n" + "tbz x15, #1, 144f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "ldr d27, [x23], #0x8\n" + "tbz x15, #0, 151f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "ld1 { v23.s }[2], [x25]\n" + "ld1 
{ v27.s }[2], [x23]\n" + "b 151f\n" + "144:" // Height 5: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x15, #0, 151f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "ldr s23, [x25, #0x0]\n" + "ldr s27, [x23, #0x0]\n" + "b 151f\n" + "145:" // Height 5: Partial accumulate: partial_2_8 + "tbz x15, #1, 146f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "ldr d26, [x23], #0x8\n" + "mov x19, #0x28\n" + "tbz x15, #0, 151f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x25]\n" + "ld1 { v26.s }[2], [x23]\n" + "b 151f\n" + "146:" // Height 5: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x15, #0, 151f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "ldr s26, [x23, #0x0]\n" + "b 151f\n" + "147:" // Height 5: Partial accumulate: partial_4_0 + "tbz x15, #2, 149f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "tbz x15, #1, 148f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "ldr d25, [x23], #0x8\n" + "tbz x15, #0, 151f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "ld1 { v21.s }[2], [x25]\n" + "ld1 { v25.s }[2], [x23]\n" + "b 151f\n" + "148:" // Height 5: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x15, #0, 151f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "ldr s25, [x23, #0x0]\n" + "b 151f\n" + "149:" // Height 5: Partial accumulate: partial_2_0 + "tbz x15, #1, 150f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "ldr d24, [x23], #0x8\n" + "mov x19, #0x8\n" + "tbz x15, #0, 151f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "ld1 { v20.s }[2], [x25]\n" + "ld1 { v24.s }[2], [x23]\n" + "b 151f\n" + "150:" // Height 5: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "ldr s20, [x25, #0x0]\n" + "ldr s24, [x23, #0x0]\n" + "151:" // Height 5: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "sub x23, x23, x19\n" + "b 154f\n" + "152:" // Height 5: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "ldr q24, [x23, #0x0]\n" + "ldr q25, [x23, #0x10]\n" + "ldr q26, [x23, #0x20]\n" + "ldr q27, [x23, #0x30]\n" + "b 154f\n" + "153:" // Height 5: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" 
+ "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "154:" // Height 5: setup done + "mov x12, #0x0\n" + "155:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 156f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x12, 157f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "b 157f\n" + "156:" // Height 5: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "157:" // Height 5: input setup done + "cmp x11, #0x10\n" + "blt 160f\n" + "cmp x11, #0x20\n" + "blt 159f\n" + "158:" // Height 5: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sub x11, x11, #0x10\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + "cmp x11, #0x20\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n" + "ldr q7, 
[x14, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + "add x14, x14, #0x100\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n" + "bge 158b\n" + "159:" // Height 5: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, 
v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, 
v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + "add x14, x14, #0x100\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n" + "160:" // Height 5: Multiply loop: Main loop skip + "cbz x11, 165f\n" + "cmp x11, #0x4\n" + "blt 162f\n" + "161:" // Height 5: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "sub x11, x11, #0x4\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "cmp x11, #0x4\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + "bge 161b\n" + "cbz x11, 165f\n" + "162:" // Height 5: Multiply loop: Skip odd blocks + "tbz x11, #1, 163f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "ldr h4, [x22], #0x2\n" + "tbz x11, #0, 164f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x26]\n" + "ld1 { v3.b }[2], [x24]\n" + "ld1 { v4.b }[2], [x22]\n" + "b 164f\n" + "163:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr 
b2, [x26, #0x0]\n" + "ldr b3, [x24, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "164:" // Height 5: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + "165:" // Height 5: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 155b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "cmp x15, #0x10\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "bge 174f\n" + "tbz x15, #3, 169f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v25.4s }, [x23], #0x10\n" + "tbz x15, #2, 167f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "st1 { v22.4s }, [x25], #0x10\n" + "st1 { v26.4s }, [x23], #0x10\n" + "tbz x15, #1, 166f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "str d27, [x23], #0x8\n" + "tbz x15, #0, 173f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "st1 { v23.s }[2], [x25]\n" + "st1 { v27.s }[2], [x23]\n" + "b 173f\n" + "166:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x15, #0, 173f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "str s23, [x25, #0x0]\n" + "str s27, [x23, #0x0]\n" + "b 173f\n" + "167:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x15, #1, 168f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "str d26, [x23], #0x8\n" + "tbz x15, #0, 173f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "st1 { v22.s }[2], [x25]\n" + "st1 { v26.s }[2], [x23]\n" + "b 173f\n" + "168:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x15, #0, 173f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "str s22, [x25, #0x0]\n" + "str s26, [x23, #0x0]\n" + "b 173f\n" + "169:" // 
Height 5: Partial direct writeback: partial_4_0 + "tbz x15, #2, 171f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "tbz x15, #1, 170f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "str d25, [x23], #0x8\n" + "tbz x15, #0, 173f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "st1 { v21.s }[2], [x25]\n" + "st1 { v25.s }[2], [x23]\n" + "b 173f\n" + "170:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x15, #0, 173f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "str s21, [x25, #0x0]\n" + "str s25, [x23, #0x0]\n" + "b 173f\n" + "171:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x15, #1, 172f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "tbz x15, #0, 173f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "st1 { v20.s }[2], [x25]\n" + "st1 { v24.s }[2], [x23]\n" + "b 173f\n" + "172:" // Height 5: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "str s20, [x25, #0x0]\n" + "str s24, [x23, #0x0]\n" + "173:" // Height 5: Partial direct writeback: Done + "b 175f\n" + "174:" // Height 5: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "str q24, [x23, #0x0]\n" + "str q25, [x23, #0x10]\n" + "str q26, [x23, #0x20]\n" + "str q27, [x23, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "add x23, x23, #0x40\n" + "175:" // Height 5: Writeback done + "subs x15, x15, #0x10\n" + "bgt 143b\n" + "b 212f\n" + "176:" // Height 6 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 177f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "ldr x21, [%x[output_ptr], #0x28]\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 178f\n" + "177:" // Height 6: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "add x21, x23, x19, LSL #2\n" + "add %x[output_ptr], x21, x19, LSL #2\n" + "178:" // Height 6: Column loop + "tbz %x[flags], #0, 188f\n" + "cmp x15, #0x10\n" + "bge 187f\n" + "tbz x15, #3, 182f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "ld1 { v28.4s 
}, [x21], #0x10\n" + "ld1 { v9.4s }, [x13], #0x10\n" + "ld1 { v13.4s }, [x9], #0x10\n" + "ld1 { v17.4s }, [x27], #0x10\n" + "ld1 { v21.4s }, [x25], #0x10\n" + "ld1 { v25.4s }, [x23], #0x10\n" + "ld1 { v29.4s }, [x21], #0x10\n" + "tbz x15, #2, 180f\n" + "ld1 { v10.4s }, [x13], #0x10\n" + "ld1 { v14.4s }, [x9], #0x10\n" + "ld1 { v18.4s }, [x27], #0x10\n" + "ld1 { v22.4s }, [x25], #0x10\n" + "ld1 { v26.4s }, [x23], #0x10\n" + "ld1 { v30.4s }, [x21], #0x10\n" + "tbz x15, #1, 179f\n" + "mov x19, #0x38\n" + "ldr d11, [x13], #0x8\n" + "ldr d15, [x9], #0x8\n" + "ldr d19, [x27], #0x8\n" + "ldr d23, [x25], #0x8\n" + "ldr d27, [x23], #0x8\n" + "ldr d31, [x21], #0x8\n" + "tbz x15, #0, 186f\n" + "ld1 { v11.s }[2], [x13]\n" + "ld1 { v15.s }[2], [x9]\n" + "ld1 { v19.s }[2], [x27]\n" + "ld1 { v23.s }[2], [x25]\n" + "ld1 { v27.s }[2], [x23]\n" + "ld1 { v31.s }[2], [x21]\n" + "b 186f\n" + "179:" // Height 6: Partial accumulate: partial_1_12 + "mov x19, #0x30\n" + "tbz x15, #0, 186f\n" + "ldr s11, [x13, #0x0]\n" + "ldr s15, [x9, #0x0]\n" + "ldr s19, [x27, #0x0]\n" + "ldr s23, [x25, #0x0]\n" + "ldr s27, [x23, #0x0]\n" + "ldr s31, [x21, #0x0]\n" + "b 186f\n" + "180:" // Height 6: Partial accumulate: partial_2_8 + "tbz x15, #1, 181f\n" + "ldr d10, [x13], #0x8\n" + "ldr d14, [x9], #0x8\n" + "ldr d18, [x27], #0x8\n" + "ldr d22, [x25], #0x8\n" + "ldr d26, [x23], #0x8\n" + "ldr d30, [x21], #0x8\n" + "mov x19, #0x28\n" + "tbz x15, #0, 186f\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v14.s }[2], [x9]\n" + "ld1 { v18.s }[2], [x27]\n" + "ld1 { v22.s }[2], [x25]\n" + "ld1 { v26.s }[2], [x23]\n" + "ld1 { v30.s }[2], [x21]\n" + "b 186f\n" + "181:" // Height 6: Partial accumulate: partial_1_8 + "mov x19, #0x20\n" + "tbz x15, #0, 186f\n" + "ldr s10, [x13, #0x0]\n" + "ldr s14, [x9, #0x0]\n" + "ldr s18, [x27, #0x0]\n" + "ldr s22, [x25, #0x0]\n" + "ldr s26, [x23, #0x0]\n" + "ldr s30, [x21, #0x0]\n" + "b 186f\n" + "182:" // Height 6: Partial accumulate: partial_4_0 + "tbz x15, #2, 184f\n" + "ld1 { v8.4s }, [x13], #0x10\n" + "ld1 { v12.4s }, [x9], #0x10\n" + "ld1 { v16.4s }, [x27], #0x10\n" + "ld1 { v20.4s }, [x25], #0x10\n" + "ld1 { v24.4s }, [x23], #0x10\n" + "ld1 { v28.4s }, [x21], #0x10\n" + "tbz x15, #1, 183f\n" + "mov x19, #0x18\n" + "ldr d9, [x13], #0x8\n" + "ldr d13, [x9], #0x8\n" + "ldr d17, [x27], #0x8\n" + "ldr d21, [x25], #0x8\n" + "ldr d25, [x23], #0x8\n" + "ldr d29, [x21], #0x8\n" + "tbz x15, #0, 186f\n" + "ld1 { v9.s }[2], [x13]\n" + "ld1 { v13.s }[2], [x9]\n" + "ld1 { v17.s }[2], [x27]\n" + "ld1 { v21.s }[2], [x25]\n" + "ld1 { v25.s }[2], [x23]\n" + "ld1 { v29.s }[2], [x21]\n" + "b 186f\n" + "183:" // Height 6: Partial accumulate: partial_1_4 + "mov x19, #0x10\n" + "tbz x15, #0, 186f\n" + "ldr s9, [x13, #0x0]\n" + "ldr s13, [x9, #0x0]\n" + "ldr s17, [x27, #0x0]\n" + "ldr s21, [x25, #0x0]\n" + "ldr s25, [x23, #0x0]\n" + "ldr s29, [x21, #0x0]\n" + "b 186f\n" + "184:" // Height 6: Partial accumulate: partial_2_0 + "tbz x15, #1, 185f\n" + "ldr d8, [x13], #0x8\n" + "ldr d12, [x9], #0x8\n" + "ldr d16, [x27], #0x8\n" + "ldr d20, [x25], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d28, [x21], #0x8\n" + "mov x19, #0x8\n" + "tbz x15, #0, 186f\n" + "ld1 { v8.s }[2], [x13]\n" + "ld1 { v12.s }[2], [x9]\n" + "ld1 { v16.s }[2], [x27]\n" + "ld1 { v20.s }[2], [x25]\n" + "ld1 { v24.s }[2], [x23]\n" + "ld1 { v28.s }[2], [x21]\n" + "b 186f\n" + "185:" // Height 6: Partial accumulate: partial_1_0 + "mov x19, #0x0\n" + "ldr s8, [x13, #0x0]\n" + "ldr s12, [x9, #0x0]\n" + "ldr s16, [x27, #0x0]\n" + "ldr s20, [x25, #0x0]\n" + "ldr s24, [x23, 
#0x0]\n" + "ldr s28, [x21, #0x0]\n" + "186:" // Height 6: Partial accumulate: Done + "sub x13, x13, x19\n" + "sub x9, x9, x19\n" + "sub x27, x27, x19\n" + "sub x25, x25, x19\n" + "sub x23, x23, x19\n" + "sub x21, x21, x19\n" + "b 189f\n" + "187:" // Height 6: full accumulate + "ldr q8, [x13, #0x0]\n" + "ldr q9, [x13, #0x10]\n" + "ldr q10, [x13, #0x20]\n" + "ldr q11, [x13, #0x30]\n" + "ldr q12, [x9, #0x0]\n" + "ldr q13, [x9, #0x10]\n" + "ldr q14, [x9, #0x20]\n" + "ldr q15, [x9, #0x30]\n" + "ldr q16, [x27, #0x0]\n" + "ldr q17, [x27, #0x10]\n" + "ldr q18, [x27, #0x20]\n" + "ldr q19, [x27, #0x30]\n" + "ldr q20, [x25, #0x0]\n" + "ldr q21, [x25, #0x10]\n" + "ldr q22, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "ldr q24, [x23, #0x0]\n" + "ldr q25, [x23, #0x10]\n" + "ldr q26, [x23, #0x20]\n" + "ldr q27, [x23, #0x30]\n" + "ldr q28, [x21, #0x0]\n" + "ldr q29, [x21, #0x10]\n" + "ldr q30, [x21, #0x20]\n" + "ldr q31, [x21, #0x30]\n" + "b 189f\n" + "188:" // Height 6: no accumulate + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "189:" // Height 6: setup done + "mov x12, #0x0\n" + "190:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 191f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x12, 192f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 192f\n" + "191:" // Height 6: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "add x20, x22, x19\n" + "192:" // Height 6: input setup done + "cmp x11, #0x10\n" + "blt 195f\n" + "cmp x11, #0x20\n" + "blt 194f\n" + "193:" // Height 6: Multiply loop: Main loop head + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "add x22, x22, 
#0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "sub x11, x11, #0x10\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + "cmp x11, #0x20\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n" + ".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n" + ".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n" + ".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n" + ".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n" + ".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d6 // udot 
v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n" + ".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n" + ".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n" + ".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n" + ".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + "add x14, x14, #0x100\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x6fa5e8de // udot v30.4s, v6.16b, v5.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n" + "bge 193b\n" + "194:" // Height 6: Multiply loop: Single iteration only + "sub x11, x11, #0x10\n" + "ldr q0, [x10, #0x0]\n" + "ldr q1, [x28, #0x0]\n" + "ldr q2, [x26, #0x0]\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x20, #0x0]\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "add x10, x10, #0x10\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x24, x24, #0x10\n" + ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + "add x22, x22, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot 
v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x14, #0x40]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x50]\n" + ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n" + ".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n" + "ldr q6, [x14, #0x60]\n" + ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n" + ".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n" + "ldr q7, [x14, #0x70]\n" + ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n" + ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n" + ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n" + ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n" + ".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n" + "ldr q6, [x14, #0x80]\n" + ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n" + ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n" + ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n" + ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n" + ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n" + ".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n" + "ldr q7, [x14, #0x90]\n" + ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n" + ".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n" + "ldr q6, [x14, #0xa0]\n" + ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n" + "ldr q7, [x14, #0xb0]\n" + ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n" + ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n" + ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n" + ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n" + ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n" + ".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n" + "ldr q6, [x14, #0xc0]\n" + ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n" + ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n" + ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n" + ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n" + ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n" + ".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n" + "ldr q7, [x14, #0xd0]\n" + ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n" + ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n" + ".inst 
0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n" + ".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n" + "ldr q6, [x14, #0xe0]\n" + ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n" + ".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n" + "ldr q7, [x14, #0xf0]\n" + ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n" + "add x14, x14, #0x100\n" + ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n" + ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n" + ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n" + ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n" + ".inst 0x6fa5e8de // udot v30.4s, v6.16b, v5.4b[3]\n" + ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n" + ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n" + ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n" + ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n" + ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n" + ".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n" + "195:" // Height 6: Multiply loop: Main loop skip + "cbz x11, 200f\n" + "cmp x11, #0x4\n" + "blt 197f\n" + "196:" // Height 6: Multiply loop: Odd block loop + "ldr s0, [x10], #0x4\n" + "ldr s1, [x28], #0x4\n" + "ldr s2, [x26], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x22], #0x4\n" + "ldr s5, [x20], #0x4\n" + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + "sub x11, x11, #0x4\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + "cmp x11, #0x4\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n" + "bge 196b\n" + "cbz x11, 200f\n" + "197:" // Height 6: Multiply loop: Skip odd blocks + "tbz x11, #1, 198f\n" + "ldr h0, [x10], #0x2\n" + "ldr h1, [x28], #0x2\n" + "ldr h2, [x26], #0x2\n" + "ldr h3, [x24], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr h5, [x20], #0x2\n" + "tbz x11, #0, 199f\n" + "ld1 { v0.b }[2], [x10]\n" + "ld1 { v1.b }[2], [x28]\n" + "ld1 { v2.b }[2], [x26]\n" + "ld1 { v3.b }[2], [x24]\n" + "ld1 { 
v4.b }[2], [x22]\n" + "ld1 { v5.b }[2], [x20]\n" + "b 199f\n" + "198:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "ldr b0, [x10, #0x0]\n" + "ldr b1, [x28, #0x0]\n" + "ldr b2, [x26, #0x0]\n" + "ldr b3, [x24, #0x0]\n" + "ldr b4, [x22, #0x0]\n" + "ldr b5, [x20, #0x0]\n" + "199:" // Height 6: Multiply loop: Ragged operand read: Done + "ldr q6, [x14, #0x0]\n" + ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n" + "ldr q7, [x14, #0x10]\n" + ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n" + "ldr q6, [x14, #0x20]\n" + ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n" + "ldr q7, [x14, #0x30]\n" + ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n" + "add x14, x14, #0x40\n" + ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n" + ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n" + ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n" + ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n" + ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n" + ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n" + ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n" + ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n" + ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n" + ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n" + "200:" // Height 6: Multiply loop: No odd multiplies + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x12, x12, #0x1\n" + "cmp x12, x19\n" + "bne 190b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "cmp x15, #0x10\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "bge 209f\n" + "tbz x15, #3, 204f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v9.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v17.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v25.4s }, [x23], #0x10\n" + "st1 { v28.4s }, [x21], #0x10\n" + "st1 { v29.4s }, [x21], #0x10\n" + "tbz x15, #2, 202f\n" + "st1 { v10.4s }, [x13], #0x10\n" + "st1 { v14.4s }, [x9], #0x10\n" + "st1 { v18.4s }, [x27], #0x10\n" + "st1 { v22.4s }, [x25], #0x10\n" + "st1 { v26.4s }, [x23], #0x10\n" + "st1 { v30.4s }, [x21], #0x10\n" + "tbz x15, #1, 201f\n" + "str d11, [x13], #0x8\n" + "str d15, [x9], #0x8\n" + "str d19, [x27], #0x8\n" + "str d23, [x25], #0x8\n" + "str d27, [x23], #0x8\n" + "str d31, [x21], #0x8\n" + "tbz x15, #0, 208f\n" + "st1 { v11.s }[2], [x13]\n" + "st1 { v15.s }[2], [x9]\n" + "st1 { v19.s }[2], [x27]\n" + "st1 { v23.s }[2], [x25]\n" + "st1 { v27.s }[2], [x23]\n" + "st1 { v31.s }[2], [x21]\n" + "b 208f\n" + "201:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x15, #0, 208f\n" + "str s11, [x13, #0x0]\n" + "str s15, [x9, #0x0]\n" + "str s19, [x27, #0x0]\n" + "str s23, [x25, #0x0]\n" + "str s27, [x23, #0x0]\n" + "str s31, [x21, #0x0]\n" 
+ "b 208f\n" + "202:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x15, #1, 203f\n" + "str d10, [x13], #0x8\n" + "str d14, [x9], #0x8\n" + "str d18, [x27], #0x8\n" + "str d22, [x25], #0x8\n" + "str d26, [x23], #0x8\n" + "str d30, [x21], #0x8\n" + "tbz x15, #0, 208f\n" + "st1 { v10.s }[2], [x13]\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v18.s }[2], [x27]\n" + "st1 { v22.s }[2], [x25]\n" + "st1 { v26.s }[2], [x23]\n" + "st1 { v30.s }[2], [x21]\n" + "b 208f\n" + "203:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x15, #0, 208f\n" + "str s10, [x13, #0x0]\n" + "str s14, [x9, #0x0]\n" + "str s18, [x27, #0x0]\n" + "str s22, [x25, #0x0]\n" + "str s26, [x23, #0x0]\n" + "str s30, [x21, #0x0]\n" + "b 208f\n" + "204:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x15, #2, 206f\n" + "st1 { v8.4s }, [x13], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v16.4s }, [x27], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v28.4s }, [x21], #0x10\n" + "tbz x15, #1, 205f\n" + "str d9, [x13], #0x8\n" + "str d13, [x9], #0x8\n" + "str d17, [x27], #0x8\n" + "str d21, [x25], #0x8\n" + "str d25, [x23], #0x8\n" + "str d29, [x21], #0x8\n" + "tbz x15, #0, 208f\n" + "st1 { v9.s }[2], [x13]\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v17.s }[2], [x27]\n" + "st1 { v21.s }[2], [x25]\n" + "st1 { v25.s }[2], [x23]\n" + "st1 { v29.s }[2], [x21]\n" + "b 208f\n" + "205:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x15, #0, 208f\n" + "str s9, [x13, #0x0]\n" + "str s13, [x9, #0x0]\n" + "str s17, [x27, #0x0]\n" + "str s21, [x25, #0x0]\n" + "str s25, [x23, #0x0]\n" + "str s29, [x21, #0x0]\n" + "b 208f\n" + "206:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x15, #1, 207f\n" + "str d8, [x13], #0x8\n" + "str d12, [x9], #0x8\n" + "str d16, [x27], #0x8\n" + "str d20, [x25], #0x8\n" + "str d24, [x23], #0x8\n" + "str d28, [x21], #0x8\n" + "tbz x15, #0, 208f\n" + "st1 { v8.s }[2], [x13]\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v16.s }[2], [x27]\n" + "st1 { v20.s }[2], [x25]\n" + "st1 { v24.s }[2], [x23]\n" + "st1 { v28.s }[2], [x21]\n" + "b 208f\n" + "207:" // Height 6: Partial direct writeback: partial_1_0 + "str s8, [x13, #0x0]\n" + "str s12, [x9, #0x0]\n" + "str s16, [x27, #0x0]\n" + "str s20, [x25, #0x0]\n" + "str s24, [x23, #0x0]\n" + "str s28, [x21, #0x0]\n" + "208:" // Height 6: Partial direct writeback: Done + "b 210f\n" + "209:" // Height 6: Full writeback + "str q8, [x13, #0x0]\n" + "str q9, [x13, #0x10]\n" + "str q10, [x13, #0x20]\n" + "str q11, [x13, #0x30]\n" + "str q12, [x9, #0x0]\n" + "str q13, [x9, #0x10]\n" + "str q14, [x9, #0x20]\n" + "str q15, [x9, #0x30]\n" + "str q16, [x27, #0x0]\n" + "str q17, [x27, #0x10]\n" + "str q18, [x27, #0x20]\n" + "str q19, [x27, #0x30]\n" + "str q20, [x25, #0x0]\n" + "str q21, [x25, #0x10]\n" + "str q22, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "str q24, [x23, #0x0]\n" + "str q25, [x23, #0x10]\n" + "str q26, [x23, #0x20]\n" + "str q27, [x23, #0x30]\n" + "str q28, [x21, #0x0]\n" + "str q29, [x21, #0x10]\n" + "str q30, [x21, #0x20]\n" + "str q31, [x21, #0x30]\n" + "add x13, x13, #0x40\n" + "add x9, x9, #0x40\n" + "add x27, x27, #0x40\n" + "add x25, x25, #0x40\n" + "add x23, x23, #0x40\n" + "add x21, x21, #0x40\n" + "210:" // Height 6: Writeback done + "subs x15, x15, #0x10\n" + "bgt 178b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 212f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 211f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], 
%[offsetof_input_offset]]\n" + "b 1b\n" + "211:" // Update direct input + "mov x19, #0x6\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "212:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp deleted file mode 100644 index 58a51432fd..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp +++ /dev/null @@ -1,328 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
-
-namespace arm_gemm {
-
-void a64_interleaved_bf16fp32_dot_12x8_x1(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
-    const bfloat16 *a_ptr = Apanel;
-    float *c_ptr = Cpanel;
-
-    K /= 2;
-    const long loops_count = (K / 2) - 1;
-    const long tails_count = K % 2;
-
-    for (int yb=0; yb
 transforms = {};
-    kern_type kernel=a64_interleaved_bf16fp32_dot_12x8;
+    kern_type kernel=a64_interleaved_bf16fp32_dot_8x12;
-    interleaved_bf16fp32_dot_12x8(const CPUInfo *ci)
+    cls_a64_interleaved_bf16fp32_dot_8x12(const CPUInfo *)
     {
-        if (ci->get_cpu_model() == CPUModel::X1) {
-            kernel = a64_interleaved_bf16fp32_dot_12x8_x1;
-        }
+    }
 };
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp
similarity index 99%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp
index 7ffae524dc..92149a5579 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp
@@ -28,7 +28,7 @@ namespace arm_gemm {
-void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void a64_interleaved_bf16fp32_dot_8x12(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
     const bfloat16 *a_ptr = Apanel;
     float *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
similarity index 89%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
index 7fac59947e..b2c2407b28 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
@@ -31,9 +31,9 @@ namespace arm_gemm {
 // Actual kernel implementations
-void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+void a64_interleaved_bf16fp32_mmla_8x12(const bfloat16 *, const bfloat16 *, float *, int, int, int);
-class interleaved_bf16fp32_mmla_12x8 {
+class cls_a64_interleaved_bf16fp32_mmla_8x12 {
 public:
     typedef bfloat16 operand_type;
     typedef float result_type;
@@ -59,9 +59,9 @@ class interleaved_bf16fp32_mmla_12x8 {
     // Use the standard fixed size transforms.
     StdTransformsFixed transforms = {};
-    kern_type kernel=a64_interleaved_bf16fp32_mmla_12x8;
+    kern_type kernel=a64_interleaved_bf16fp32_mmla_8x12;
-    interleaved_bf16fp32_mmla_12x8(const CPUInfo *)
+    cls_a64_interleaved_bf16fp32_mmla_8x12(const CPUInfo *)
     {
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
similarity index 96%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
index 7f0eff29af..c476fcf171 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
@@ -28,7 +28,7 @@ namespace arm_gemm {
-void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void a64_interleaved_bf16fp32_mmla_8x12(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
     const bfloat16 *a_ptr = Apanel;
     float *c_ptr = Cpanel;
@@ -87,13 +87,23 @@ void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *
             "movi v27.4s, #0\n"
             "prfm PLDL1KEEP, [%[b_ptr], #0x1c0]\n"
             "movi v28.4s, #0\n"
-            "prfm PLDL1KEEP, [%[b_ptr], #0x200]\n"
+            "prfm PLDL1KEEP, [%[a_ptr], #0x200]\n"
             "movi v29.4s, #0\n"
-            "prfm PLDL1KEEP, [%[b_ptr], #0x240]\n"
+            "prfm PLDL1KEEP, [%[b_ptr], #0x200]\n"
             "movi v30.4s, #0\n"
-            "prfm PLDL1KEEP, [%[b_ptr], #0x280]\n"
+            "prfm PLDL1KEEP, [%[a_ptr], #0x240]\n"
             "movi v31.4s, #0\n"
+            "prfm PLDL1KEEP, [%[b_ptr], #0x240]\n"
+            "prfm PLDL1KEEP, [%[a_ptr], #0x280]\n"
+            "prfm PLDL1KEEP, [%[b_ptr], #0x280]\n"
+            "prfm PLDL1KEEP, [%[a_ptr], #0x2c0]\n"
             "prfm PLDL1KEEP, [%[b_ptr], #0x2c0]\n"
+            "prfm PLDL1KEEP, [%[b_ptr], #0x300]\n"
+            "prfm PLDL1KEEP, [%[b_ptr], #0x340]\n"
+            "prfm PLDL1KEEP, [%[b_ptr], #0x380]\n"
+            "prfm PLDL1KEEP, [%[b_ptr], #0x3c0]\n"
+            "prfm PLDL1KEEP, [%[b_ptr], #0x400]\n"
+            "prfm PLDL1KEEP, [%[b_ptr], #0x440]\n"
             "add %[a_ptr], %[a_ptr], #0x40\n"
             "add %[b_ptr], %[b_ptr], #0x40\n"
             "cbz %[loops], 1f\n"
@@ -105,19 +115,19 @@
             ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
             "subs %[loops], %[loops], #0x1\n"
             ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n"
-            "prfm PLDL1KEEP, [%[a_ptr], #0x1c0]\n"
+            "prfm PLDL1KEEP, [%[a_ptr], #0x2c0]\n"
             ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
             "ldr q4, [%[b_ptr]]\n"
             ".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n"
-            "prfm PLDL1KEEP, [%[b_ptr], #0x2c0]\n"
+            "prfm PLDL1KEEP, [%[b_ptr], #0x440]\n"
             ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
-            "prfm PLDL1KEEP, [%[a_ptr], #0x200]\n"
+            "prfm PLDL1KEEP, [%[a_ptr], #0x300]\n"
            ".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n"
             "ldr q5, [%[b_ptr], #0x10]\n"
             ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
-            "prfm PLDL1KEEP, [%[b_ptr], #0x300]\n"
+            "prfm PLDL1KEEP, [%[b_ptr], #0x480]\n"
             ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
-            "prfm PLDL1KEEP, [%[b_ptr], #0x340]\n"
+            "prfm PLDL1KEEP, [%[b_ptr], #0x4c0]\n"
             ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
             ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
             "ldr q6, [%[b_ptr], #0x20]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
similarity index 86%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
index 7bfb2291a9..b17b76f170 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
@@ -31,9 +31,9 @@ namespace arm_gemm {
 // Actual kernel implementations
-void a64_interleaved_s8s32_mmla_12x8(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void a64_interleaved_s8s32_mmla_8x12(const int8_t *, const int8_t *, int32_t *, int, int, int);
-class interleaved_s8s32_mmla_12x8 {
+class cls_a64_interleaved_s8s32_mmla_8x12 {
 public:
     typedef int8_t operand_type;
     typedef int32_t result_type;
@@ -58,10 +58,11 @@ class interleaved_s8s32_mmla_12x8 {
     // Use the standard fixed size transforms.
     StdTransformsFixed transforms = {};
+    StdTransformsFixed transforms_quantized = {};
-    kern_type kernel=a64_interleaved_s8s32_mmla_12x8;
+    kern_type kernel=a64_interleaved_s8s32_mmla_8x12;
-    interleaved_s8s32_mmla_12x8(const CPUInfo *)
+    cls_a64_interleaved_s8s32_mmla_8x12(const CPUInfo *)
     {
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp
similarity index 99%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp
index 7953510aa7..2093e75b8e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp
@@ -28,7 +28,7 @@ namespace arm_gemm {
-void a64_interleaved_s8s32_mmla_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+void a64_interleaved_s8s32_mmla_8x12(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
     const int8_t *a_ptr = Apanel;
     int32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
similarity index 86%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
index d493517cf1..99dd0be0d9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
@@ -31,9 +31,9 @@ namespace arm_gemm {
 // Actual kernel implementations
-void a64_interleaved_u8u32_mmla_12x8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void a64_interleaved_u8u32_mmla_8x12(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
-class interleaved_u8u32_mmla_12x8 {
+class cls_a64_interleaved_u8u32_mmla_8x12 {
 public:
     typedef uint8_t operand_type;
     typedef uint32_t result_type;
@@ -58,10 +58,11 @@ class interleaved_u8u32_mmla_12x8 {
     // Use the standard fixed size transforms.
     StdTransformsFixed transforms = {};
+    StdTransformsFixed transforms_quantized = {};
-    kern_type kernel=a64_interleaved_u8u32_mmla_12x8;
+    kern_type kernel=a64_interleaved_u8u32_mmla_8x12;
-    interleaved_u8u32_mmla_12x8(const CPUInfo *)
+    cls_a64_interleaved_u8u32_mmla_8x12(const CPUInfo *)
     {
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
similarity index 99%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
index dcd15f0345..568e5d1098 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
@@ -28,7 +28,7 @@ namespace arm_gemm {
-void a64_interleaved_u8u32_mmla_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+void a64_interleaved_u8u32_mmla_8x12(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
     const uint8_t *a_ptr = Apanel;
     uint32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp
similarity index 79%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp
index 981ce34b49..a3daf079d9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp
@@ -30,13 +30,13 @@ namespace arm_gemm {
 // Actual kernel implementations
-void a64_sgemm_asimd_12x8(const float *, const float *, float *, int, int, int);
-void a64_sgemm_asimd_12x8_a53(const float *, const float *, float *, int, int, int);
-void a64_sgemm_asimd_12x8_a55(const float *, const float *, float *, int, int, int);
-void a64_sgemm_asimd_12x8_a55r1(const float *, const float *, float *, int, int, int);
-void a64_sgemm_asimd_12x8_x1(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_8x12(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_8x12_a53(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_8x12_a55(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_8x12_a55r1(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_8x12_x1(const float *, const float *, float *, int, int, int);
-// 12x8 SGEMM "strategy" class.
+// 8x12 SGEMM "strategy" class.
 //
 // This describes the characteristics of a family of kernels, in terms of
 // the required interleave properties and the output block size.
@@ -44,7 +44,7 @@ void a64_sgemm_asimd_12x8_x1(const float *, const float *, float *, int, int, in
 // All kernels in the family must share these characteristics. The actual
 // kernel to be used can be chosen at runtime, based on the CPU_type
 // structure.
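Note on the strategy hunk that follows: it renames the class itself (sgemm_12x8 becomes cls_a64_sgemm_8x12) and retunes two PerformanceParameters entries, but the dispatch mechanism is untouched. The pattern the comment above describes, a strategy class that fixes the kernel family's block shape and whose constructor picks the best entry point for the detected core, boils down to the minimal self-contained C++ sketch below. The enum, stub kernels, and class names in the sketch are illustrative stand-ins, not the library's real CPUInfo/CPUModel API.

    // Sketch of the arm_gemm "strategy" dispatch pattern (assumed, simplified names).
    #include <cstdio>

    enum class CpuModel { GENERIC, A53, A55r1, X1 };

    // Every tuned variant shares one signature, so a plain function pointer
    // (kern_type) can hold whichever kernel the constructor selects.
    static void sgemm_generic(const float *, const float *, float *, int, int, int) { std::puts("generic kernel"); }
    static void sgemm_a53(const float *, const float *, float *, int, int, int)     { std::puts("A53-tuned kernel"); }
    static void sgemm_x1(const float *, const float *, float *, int, int, int)      { std::puts("X1-tuned kernel"); }

    class sgemm_strategy {
    public:
        typedef void (*kern_type)(const float *, const float *, float *, int, int, int);

        // Fixed output block shape shared by the whole kernel family (8x12 here).
        static unsigned int out_height() { return 8; }
        static unsigned int out_width()  { return 12; }

        kern_type kernel = sgemm_generic;  // default entry point

        explicit sgemm_strategy(CpuModel model) {
            // Runtime selection, mirroring the constructor switch in the hunk below.
            switch (model) {
                case CpuModel::A53: kernel = sgemm_a53; break;
                case CpuModel::X1:  kernel = sgemm_x1;  break;
                default:            break;
            }
        }
    };

    int main() {
        sgemm_strategy strat(CpuModel::X1);
        strat.kernel(nullptr, nullptr, nullptr, 0, 0, 0);  // the stubs ignore their arguments
        return 0;
    }

Holding the kernel as a bare function pointer keeps the hot call site trivial while still allowing per-CPU tuning, which is why the real constructor switches on ci->get_cpu_model() as shown next.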
-class sgemm_12x8 { +class cls_a64_sgemm_8x12 { public: typedef float operand_type; typedef float result_type; @@ -70,7 +70,7 @@ class sgemm_12x8 { static PerformanceParameters get_performance_parameters(const CPUInfo *ci) { switch (ci->get_cpu_model()) { case CPUModel::A55r1: - return { 3.724, 1.416, 1.113 }; + return { 3.954, 1.252, 1.141 }; case CPUModel::A53: return { 2.777, 0.987, 0.898 }; @@ -79,29 +79,29 @@ class sgemm_12x8 { return { 2.885, 1.429, 1.163 }; default: - return { 6.949, 4.149, 2.826 }; + return { 7.2307, 3.876, 2.932 }; } } - kern_type kernel=a64_sgemm_asimd_12x8; + kern_type kernel=a64_sgemm_asimd_8x12; - sgemm_12x8(const CPUInfo *ci) { + cls_a64_sgemm_8x12(const CPUInfo *ci) { // Select specific kernel if available switch(ci->get_cpu_model()) { case CPUModel::A53: - kernel = a64_sgemm_asimd_12x8_a53; + kernel = a64_sgemm_asimd_8x12_a53; break; case CPUModel::A55r0: - kernel = a64_sgemm_asimd_12x8_a55; + kernel = a64_sgemm_asimd_8x12_a55; break; case CPUModel::A55r1: - kernel = a64_sgemm_asimd_12x8_a55r1; + kernel = a64_sgemm_asimd_8x12_a55r1; break; case CPUModel::X1: - kernel = a64_sgemm_asimd_12x8_x1; + kernel = a64_sgemm_asimd_8x12_x1; break; default: diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp similarity index 99% rename from src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp index 5532485efb..f4b6e7b70f 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp @@ -29,7 +29,7 @@ namespace arm_gemm { -void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { +void a64_sgemm_asimd_8x12_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { const float *a_ptr = Apanel; float *c_ptr = Cpanel; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp similarity index 99% rename from src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp index e9f071f7f4..5f86da8ef3 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp @@ -29,7 +29,7 @@ namespace arm_gemm { -void a64_sgemm_asimd_12x8_a55(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { +void a64_sgemm_asimd_8x12_a55(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { const float *a_ptr = Apanel; float *c_ptr = Cpanel; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp similarity index 99% rename from src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp index 8a6fbacfad..7709ad1be6 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp @@ -29,7 +29,7 @@ namespace arm_gemm { -void a64_sgemm_asimd_12x8_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, const int ablocks, const int bblocks, const int K) { +void a64_sgemm_asimd_8x12_a55r1(const float *Apanel, 
const float *Bpanel, float *Cpanel, const int ablocks, const int bblocks, const int K) { const float *a_ptr = Apanel; float *c_ptr = Cpanel; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp similarity index 99% rename from src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp index 48dc46785e..dc72095a9b 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp @@ -39,7 +39,7 @@ namespace arm_gemm { -void a64_sgemm_asimd_12x8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { +void a64_sgemm_asimd_8x12(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { const float *a_ptr = Apanel; float *c_ptr = Cpanel; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp similarity index 99% rename from src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp index 63fdf4df9f..89f8ac2d6c 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp @@ -39,7 +39,7 @@ namespace arm_gemm { -void a64_sgemm_asimd_12x8_x1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { +void a64_sgemm_asimd_8x12_x1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { const float *a_ptr = Apanel; float *c_ptr = Cpanel; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp similarity index 91% rename from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp rename to src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp index 6f31efe6cb..5f7252f019 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp @@ -25,13 +25,15 @@ #ifdef __aarch64__ + + namespace arm_gemm { // Actual kernel implementations -void a64_smallK_hybrid_fp32_mla_4x6(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); +void a64_smallK_hybrid_fp32_mla_6x4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); -class smallK_hybrid_fp32_mla_4x6 +class cls_a64_smallK_hybrid_fp32_mla_6x4 { public: typedef float operand_type; @@ -73,9 +75,9 @@ class smallK_hybrid_fp32_mla_4x6 StdTransformsFixed transforms = {}; // Default to the generic kernel - kern_type kernel=a64_smallK_hybrid_fp32_mla_4x6; + kern_type kernel=a64_smallK_hybrid_fp32_mla_6x4; - smallK_hybrid_fp32_mla_4x6(const CPUInfo *) + cls_a64_smallK_hybrid_fp32_mla_6x4(const CPUInfo *) { } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp similarity index 99% rename from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp index e2fec6af16..52548b462c 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp
similarity index 99%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp
index e2fec6af16..52548b462c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,7 +33,7 @@
 namespace arm_gemm {
-void a64_smallK_hybrid_fp32_mla_4x6(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
+void a64_smallK_hybrid_fp32_mla_6x4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
     const long loops_count = iceildiv(N, (int)4) - 1;
     const long ldab = lda * sizeof(float);
     const long ldcb = ldc * sizeof(float);
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp
similarity index 91%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp
index e9a094855a..a8e0c24eae 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp
@@ -25,13 +25,15 @@
 #ifdef __aarch64__
+
+
 namespace arm_gemm
 {
 // Actual kernel implementations
-void a64_smallK_hybrid_fp32_mla_4x8(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void a64_smallK_hybrid_fp32_mla_8x4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
-class smallK_hybrid_fp32_mla_4x8
+class cls_a64_smallK_hybrid_fp32_mla_8x4
 {
 public:
     typedef float operand_type;
@@ -73,9 +75,9 @@ class smallK_hybrid_fp32_mla_4x8
     StdTransformsFixed transforms = {};
     // Default to the generic kernel
-    kern_type kernel=a64_smallK_hybrid_fp32_mla_4x8;
+    kern_type kernel=a64_smallK_hybrid_fp32_mla_8x4;
-    smallK_hybrid_fp32_mla_4x8(const CPUInfo *)
+    cls_a64_smallK_hybrid_fp32_mla_8x4(const CPUInfo *)
     {
     }
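The generic.cpp hunks around this point all open with loops_count = iceildiv(N, (int)4) - 1: output columns are processed four at a time, and one group is peeled off the counted loop, which is also why the restructured a55 assembly later in this diff handles a peeled block and an odd K remainder (%[odds]) outside the main loop body. A small sketch of that arithmetic follows, assuming iceildiv is the usual integer ceiling division; the claim about peeling is an interpretation of the visible code, not a statement from the patch.

// Loop-count arithmetic sketch, under the assumptions stated above.
#include <cassert>

inline long iceildiv(long a, long b) {
    return (a + b - 1) / b;  // ceil(a / b) for positive operands
}

int main() {
    long N = 11;
    // 11 columns in groups of 4 gives 3 groups; one is peeled off the
    // counted loop, hence the "- 1".
    long loops_count = iceildiv(N, 4) - 1;  // 2 counted trips + 1 peeled group
    assert(loops_count == 2);
    return 0;
}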
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp
similarity index 99%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8/generic.cpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp
index 11888bce74..deaef27ee9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,7 +33,7 @@
 namespace arm_gemm {
-void a64_smallK_hybrid_fp32_mla_4x8(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
+void a64_smallK_hybrid_fp32_mla_8x4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
     const long loops_count = iceildiv(N, (int)4) - 1;
     const long ldab = lda * sizeof(float);
     const long ldcb = ldc * sizeof(float);
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp
similarity index 87%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp
index fc087b73db..abf0eda008 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp
@@ -31,10 +31,10 @@
 namespace arm_gemm
 {
 // Actual kernel implementations
-void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
-void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+void a64_smallK_hybrid_s8s32_dot_6x4(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+void a64_smallK_hybrid_s8s32_dot_6x4_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
-class smallK_hybrid_s8s32_dot_4x6
+class cls_a64_smallK_hybrid_s8s32_dot_6x4
 {
 public:
     typedef int8_t operand_type;
@@ -76,12 +76,12 @@ class smallK_hybrid_s8s32_dot_4x6
     StdTransformsFixed transforms = {};
     // Default to the generic kernel
-    kern_type kernel=a64_smallK_hybrid_s8s32_dot_4x6;
+    kern_type kernel=a64_smallK_hybrid_s8s32_dot_6x4;
-    smallK_hybrid_s8s32_dot_4x6(const CPUInfo *ci)
+    cls_a64_smallK_hybrid_s8s32_dot_6x4(const CPUInfo *ci)
     {
         if (ci->get_cpu_model() == CPUModel::A55r1) {
-            kernel = a64_smallK_hybrid_s8s32_dot_4x6_a55;
+            kernel = a64_smallK_hybrid_s8s32_dot_6x4_a55;
         }
     }
 };
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp
similarity index 80%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp
rename to src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp
index 2d6d2f064c..a9926602fc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -33,7 +33,7 @@ namespace arm_gemm { -void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) { +void a64_smallK_hybrid_s8s32_dot_6x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) { const long loops_count = iceildiv(N, (int)4) - 1; const long ldab = lda * sizeof(int8_t); const long ldcb = ldc * sizeof(int32_t); @@ -97,6 +97,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q3, [a_ptr1], #0x10\n" "ldr q6, [a_ptr2], #0x10\n" @@ -107,18 +108,29 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t "ldr q4, [a_ptr1], #0x10\n" "ldr q7, [a_ptr2], #0x10\n" "ldr q10, [a_ptr3], #0x10\n" - "ldr q13, [a_ptr4], #0x10\n" - "ldr q16, [a_ptr5], #0x10\n" - "cbnz %[odds], 2f\n" "ldr s2, [%[a_ptr0]]\n" + "ldr q13, [a_ptr4], #0x10\n" "ldr s5, [a_ptr1]\n" + "ldr q16, [a_ptr5], #0x10\n" "ldr s8, [a_ptr2]\n" "ldr s11, [a_ptr3]\n" "ldr s14, [a_ptr4]\n" "ldr s17, [a_ptr5]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" "subs %[odds], %[odds], #0x1\n" + "ldr q3, [a_ptr1], #0x10\n" + "ldr q6, [a_ptr2], #0x10\n" + "ldr q9, [a_ptr3], #0x10\n" + "ldr q12, [a_ptr4], #0x10\n" + "ldr q15, [a_ptr5], #0x10\n" + "ldr q1, [%[a_ptr0]], #0x10\n" + "ldr q4, [a_ptr1], #0x10\n" + "ldr q7, [a_ptr2], #0x10\n" + "ldr q10, [a_ptr3], #0x10\n" + "ldr q13, [a_ptr4], #0x10\n" + "ldr q16, [a_ptr5], #0x10\n" "b.ne 4f\n" "ldr b2, [%[a_ptr0]]\n" "ldr b5, [a_ptr1]\n" @@ -145,40 +157,42 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t "ld1 {v14.b}[2], [a_ptr4]\n" "ld1 {v17.b}[2], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q18, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q19, [%[b_ptr0], #0x10]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "ldr q20, [%[b_ptr0], #0x20]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "ldr q21, [%[b_ptr0], #0x30]\n" - "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" "ldr q22, [%[b_ptr0], #0x40]\n" - "movi v31.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" "ldr q23, [%[b_ptr0], #0x50]\n" - ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" "ldr q24, [%[b_ptr0], #0x60]\n" - ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" "ldr q25, [%[b_ptr0], #0x70]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" ".inst 0x4fa3e27b // sdot 
v27.4s, v19.16b, v3.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" - "ldr q18, [%[b_ptr0]]\n" ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n" ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n" ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n" @@ -222,173 +236,219 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n" ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n" ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n" - "cbz %[loops], 6f\n" - "ldr d18, [%[b_ptr0]]\n" + "b.eq 7f\n" + "8:\n" + "str q26, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" + "movi v26.4s, #0\n" + "ldr d18, [%[b_ptr0]]\n" "ldr temploadreg2, [%[b_ptr0], #0x8]\n" + "add %[c_ptr0], %[c_ptr0], #0x10\n" + "str q27, [c_ptr1]\n" + "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" "ldr d19, [%[b_ptr0], #0x10]\n" "ldr temploadreg3, [%[b_ptr0], #0x18]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "str q28, [c_ptr2]\n" + "add c_ptr2, c_ptr2, #0x10\n" + "movi v28.4s, #0\n" "ldr d20, [%[b_ptr0], #0x20]\n" "ldr temploadreg0, [%[b_ptr0], #0x28]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "str q29, [c_ptr3]\n" + "add c_ptr3, c_ptr3, #0x10\n" + "movi v29.4s, #0\n" "ldr d21, [%[b_ptr0], #0x30]\n" "ldr temploadreg1, [%[b_ptr0], #0x38]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "str q30, [c_ptr4]\n" + "add c_ptr4, c_ptr4, #0x10\n" + "movi v30.4s, #0\n" "ldr d22, [%[b_ptr0], #0x40]\n" "ins v18.d[1], temploadreg2\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "str q31, [c_ptr5]\n" + "add c_ptr5, c_ptr5, #0x10\n" + "movi v31.4s, #0\n" "ldr temploadreg2, [%[b_ptr0], #0x48]\n" + ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" "ldr d23, [%[b_ptr0], #0x50]\n" + ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" "ins v19.d[1], temploadreg3\n" + ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" "ldr temploadreg3, [%[b_ptr0], #0x58]\n" + ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" "ldr d24, [%[b_ptr0], #0x60]\n" + ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" "ins v20.d[1], temploadreg0\n" + ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" "ldr temploadreg0, [%[b_ptr0], #0x68]\n" + ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" "ldr d25, [%[b_ptr0], #0x70]\n" + ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" "ins v21.d[1], temploadreg1\n" + ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" "ldr temploadreg1, [%[b_ptr0], #0x78]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" + ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" "ins v22.d[1], temploadreg2\n" + ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n" "ins v23.d[1], temploadreg3\n" + ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n" "ins v24.d[1], temploadreg0\n" + ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n" "ins v25.d[1], temploadreg1\n" - "b.eq 7f\n" - "8:\n" - "str q26, [%[c_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "movi v26.4s, #0\n" + ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n" + "ldr d18, [%[b_ptr0]]\n" + ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n" "ldr temploadreg2, [%[b_ptr0], #0x8]\n" + ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, 
v12.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" + ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n" + "ins v18.d[1], temploadreg2\n" + ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n" + ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n" + ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n" + ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n" + ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n" + ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n" + ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n" + ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n" + ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n" + ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n" + ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n" + ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n" + ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n" + ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n" + ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n" + ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n" + ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n" + ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n" + ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n" + ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n" + ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n" + ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n" + ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n" + ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n" + ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n" + ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n" + ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n" + ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n" + ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n" + ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n" + ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n" + "b.ne 8b\n" + "7:\n" + "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" + "movi v26.4s, #0\n" + "ldr q18, [%[b_ptr0]]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + "ldr q22, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" + "ldr q24, [%[b_ptr0], #0x60]\n" + ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" - "prfm PSTL1KEEP, [c_ptr1, 
#0x40]\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" - "ldr d18, [%[b_ptr0]]\n" - ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" - ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" - "ins v18.d[1], temploadreg2\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x8]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x18]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x28]\n" ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n" - "ldr d19, [%[b_ptr0], #0x10]\n" ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x38]\n" ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n" - "ins v19.d[1], temploadreg3\n" ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x58]\n" ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n" - "ldr d20, [%[b_ptr0], #0x20]\n" ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n" - "ins v20.d[1], temploadreg0\n" ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #0x68]\n" ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n" ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n" - "ldr d21, [%[b_ptr0], #0x30]\n" ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n" ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n" ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n" - "ins v21.d[1], temploadreg1\n" ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x78]\n" ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n" ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n" - "ldr d22, [%[b_ptr0], #0x40]\n" ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n" ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n" ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n" ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n" ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n" ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n" - "ldr d23, [%[b_ptr0], #0x50]\n" ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n" ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n" ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n" - "ins v23.d[1], temploadreg3\n" ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n" ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n" ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n" - "ldr d24, [%[b_ptr0], #0x60]\n" ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n" ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n" ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n" - "ins v24.d[1], temploadreg0\n" ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n" ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n" ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n" - "ldr d25, [%[b_ptr0], #0x70]\n" ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n" ".inst 
0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n" ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n" - "ins v25.d[1], temploadreg1\n" ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n" ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n" ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n" - "ldr d18, [%[b_ptr0]]\n" - "ins v18.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #0x48]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "ins v22.d[1], temploadreg2\n" - "b.ne 8b\n" - "7:\n" - "str q26, [%[c_ptr0]]\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" + "b 9f\n" + "6:\n" "movi v26.4s, #0\n" - "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" - ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" - "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" - "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" - "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" - "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" + ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" + ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" - ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" "ldr q18, [%[b_ptr0]]\n" - ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" + ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" "add %[b_ptr0], %[b_ptr0], #0x10\n" + ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n" @@ -435,19 +495,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n" ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n" ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n" - "6:\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -514,6 +569,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q3, [a_ptr1], #0x10\n" "ldr q6, [a_ptr2], #0x10\n" @@ -524,24 +580,35 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t "ldr q4, [a_ptr1], #0x10\n" "ldr q7, [a_ptr2], #0x10\n" "ldr q10, [a_ptr3], #0x10\n" - "ldr q13, [a_ptr4], #0x10\n" - "ldr q16, [a_ptr5], #0x10\n" - "cbnz %[odds], 2f\n" "ldr d2, [%[a_ptr0]]\n" + "ldr q13, [a_ptr4], #0x10\n" "ldr d5, [a_ptr1]\n" + "ldr q16, [a_ptr5], #0x10\n" "ldr d8, [a_ptr2]\n" "ldr d11, [a_ptr3]\n" "ldr d14, [a_ptr4]\n" "ldr d17, [a_ptr5]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" + "subs %[odds], %[odds], #0x1\n" + "ldr q3, [a_ptr1], #0x10\n" + "ldr q6, 
[a_ptr2], #0x10\n" + "ldr q9, [a_ptr3], #0x10\n" + "ldr q12, [a_ptr4], #0x10\n" + "ldr q15, [a_ptr5], #0x10\n" + "ldr q1, [%[a_ptr0]], #0x10\n" + "ldr q4, [a_ptr1], #0x10\n" + "ldr q7, [a_ptr2], #0x10\n" + "ldr q10, [a_ptr3], #0x10\n" "ldr s2, [%[a_ptr0]], #0x4\n" + "ldr q13, [a_ptr4], #0x10\n" "ldr s5, [a_ptr1], #0x4\n" + "ldr q16, [a_ptr5], #0x10\n" "ldr s8, [a_ptr2], #0x4\n" "ldr s11, [a_ptr3], #0x4\n" "ldr s14, [a_ptr4], #0x4\n" "ldr s17, [a_ptr5], #0x4\n" - "subs %[odds], %[odds], #0x1\n" "b.ne 4f\n" "ld1 {v2.b}[4], [%[a_ptr0]]\n" "ld1 {v5.b}[4], [a_ptr1]\n" @@ -568,38 +635,40 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t "ld1 {v14.b}[6], [a_ptr4]\n" "ld1 {v17.b}[6], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q18, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q19, [%[b_ptr0], #0x10]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "ldr q20, [%[b_ptr0], #0x20]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "ldr q21, [%[b_ptr0], #0x30]\n" - "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" "ldr q22, [%[b_ptr0], #0x40]\n" - "movi v31.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" "ldr q23, [%[b_ptr0], #0x50]\n" - ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" "ldr q24, [%[b_ptr0], #0x60]\n" - ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" "ldr q25, [%[b_ptr0], #0x70]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" - "ldr q18, [%[b_ptr0]]\n" ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n" ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n" "ldr q19, [%[b_ptr0], #0x10]\n" @@ -652,180 +721,233 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n" ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n" ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n" - "cbz %[loops], 6f\n" - "ldr d18, [%[b_ptr0]]\n" + "b.eq 7f\n" + "8:\n" + "str q26, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" + "movi v26.4s, #0\n" + "ldr d18, [%[b_ptr0]]\n" "ldr temploadreg2, [%[b_ptr0], #0x8]\n" + "add %[c_ptr0], %[c_ptr0], #0x10\n" + "str q27, [c_ptr1]\n" + "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" "ldr d19, [%[b_ptr0], #0x10]\n" "ldr temploadreg3, [%[b_ptr0], #0x18]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "str q28, [c_ptr2]\n" + "add c_ptr2, c_ptr2, #0x10\n" + "movi v28.4s, #0\n" "ldr d20, [%[b_ptr0], #0x20]\n" "ldr 
temploadreg0, [%[b_ptr0], #0x28]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "str q29, [c_ptr3]\n" + "add c_ptr3, c_ptr3, #0x10\n" + "movi v29.4s, #0\n" "ldr d21, [%[b_ptr0], #0x30]\n" "ldr temploadreg1, [%[b_ptr0], #0x38]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "str q30, [c_ptr4]\n" + "add c_ptr4, c_ptr4, #0x10\n" + "movi v30.4s, #0\n" "ldr d22, [%[b_ptr0], #0x40]\n" "ins v18.d[1], temploadreg2\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "str q31, [c_ptr5]\n" + "add c_ptr5, c_ptr5, #0x10\n" + "movi v31.4s, #0\n" "ldr temploadreg2, [%[b_ptr0], #0x48]\n" + ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" "ldr d23, [%[b_ptr0], #0x50]\n" + ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" "ins v19.d[1], temploadreg3\n" + ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" "ldr temploadreg3, [%[b_ptr0], #0x58]\n" + ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" "ldr d24, [%[b_ptr0], #0x60]\n" + ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" "ins v20.d[1], temploadreg0\n" + ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" "ldr temploadreg0, [%[b_ptr0], #0x68]\n" + ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" "ldr d25, [%[b_ptr0], #0x70]\n" + ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" "ins v21.d[1], temploadreg1\n" + ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" "ldr temploadreg1, [%[b_ptr0], #0x78]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" + ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" "ins v22.d[1], temploadreg2\n" + ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n" "ins v23.d[1], temploadreg3\n" + ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n" "ins v24.d[1], temploadreg0\n" + ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n" "ins v25.d[1], temploadreg1\n" - "b.eq 7f\n" - "8:\n" - "str q26, [%[c_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "movi v26.4s, #0\n" + ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n" + "ldr d18, [%[b_ptr0]]\n" + ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n" "ldr temploadreg2, [%[b_ptr0], #0x8]\n" + ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n" + "ldr d19, [%[b_ptr0], #0x10]\n" + ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n" "ldr temploadreg3, [%[b_ptr0], #0x18]\n" + ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n" + "ins v18.d[1], temploadreg2\n" + ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n" + "ins v19.d[1], temploadreg3\n" + ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n" + ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n" + ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n" + ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n" + ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n" + ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n" + ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n" + ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n" + ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n" + ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n" + ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n" + ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n" + ".inst 0x4fb0e2ff // sdot v31.4s, 
v23.16b, v16.4b[1]\n" + ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n" + ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n" + ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n" + ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n" + ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n" + ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n" + ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n" + ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n" + ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n" + ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n" + ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n" + ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n" + ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n" + ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n" + ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n" + ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n" + ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n" + ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n" + ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n" + ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n" + ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n" + "b.ne 8b\n" + "7:\n" + "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" + "movi v26.4s, #0\n" + "ldr q18, [%[b_ptr0]]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + "ldr q22, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" + "ldr q24, [%[b_ptr0], #0x60]\n" + ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" - "ldr d18, [%[b_ptr0]]\n" - ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" - ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" - "ins v18.d[1], temploadreg2\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n" - "ldr d19, [%[b_ptr0], #0x10]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n" "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x4f83ea9b // 
sdot v27.4s, v20.16b, v3.4b[2]\n" - "ins v19.d[1], temploadreg3\n" ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x8]\n" ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x18]\n" ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #0x28]\n" ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n" - "ldr d20, [%[b_ptr0], #0x20]\n" ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #0x38]\n" ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n" ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n" - "ins v20.d[1], temploadreg0\n" ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #0x68]\n" ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n" ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n" - "ldr d21, [%[b_ptr0], #0x30]\n" ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n" ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n" ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n" - "ins v21.d[1], temploadreg1\n" ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x78]\n" ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n" ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n" - "ldr d22, [%[b_ptr0], #0x40]\n" ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n" ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n" ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n" ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n" ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n" ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n" - "ldr d23, [%[b_ptr0], #0x50]\n" ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n" ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n" ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n" ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n" ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n" ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n" - "ldr d24, [%[b_ptr0], #0x60]\n" ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n" ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n" ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n" - "ins v24.d[1], temploadreg0\n" ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n" ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n" ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n" - "ldr d25, [%[b_ptr0], #0x70]\n" ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n" ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n" ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n" - "ins v25.d[1], temploadreg1\n" ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n" - ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n" - ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n" - "ldr d18, [%[b_ptr0]]\n" - ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n" - ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n" - ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n" - "ins v18.d[1], temploadreg2\n" - ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x48]\n" - ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n" - ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n" - "ldr d19, [%[b_ptr0], #0x10]\n" - "ins v22.d[1], temploadreg2\n" - "ins v19.d[1], temploadreg3\n" - "ldr temploadreg3, [%[b_ptr0], #0x58]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "ins v23.d[1], temploadreg3\n" - "b.ne 
8b\n" - "7:\n" - "str q26, [%[c_ptr0]]\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" + ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n" + ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n" + ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n" + ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n" + ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n" + ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n" + ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n" + ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n" + "b 9f\n" + "6:\n" "movi v26.4s, #0\n" - "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" - ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" - "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" - "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" - "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" - "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" + ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" + ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" - ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" "ldr q18, [%[b_ptr0]]\n" + ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" @@ -881,19 +1003,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n" ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n" ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n" - "6:\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -1014,38 +1131,40 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t "ld1 {v14.b}[10], [a_ptr4]\n" "ld1 {v17.b}[10], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q18, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q19, [%[b_ptr0], #0x10]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "ldr q20, [%[b_ptr0], #0x20]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "ldr q21, [%[b_ptr0], #0x30]\n" - "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" "ldr q22, [%[b_ptr0], #0x40]\n" - "movi v31.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" "ldr q23, [%[b_ptr0], #0x50]\n" - ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" "ldr q24, [%[b_ptr0], #0x60]\n" - ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" "ldr q25, [%[b_ptr0], #0x70]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" + "movi 
v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" - "ldr q18, [%[b_ptr0]]\n" ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n" ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n" "ldr q19, [%[b_ptr0], #0x10]\n" @@ -1105,189 +1224,249 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n" ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n" ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n" - "cbz %[loops], 6f\n" - "ldr d18, [%[b_ptr0]]\n" + "b.eq 7f\n" + "8:\n" + "str q26, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" + "movi v26.4s, #0\n" + "ldr d18, [%[b_ptr0]]\n" "ldr temploadreg2, [%[b_ptr0], #0x8]\n" + "add %[c_ptr0], %[c_ptr0], #0x10\n" + "str q27, [c_ptr1]\n" + "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" "ldr d19, [%[b_ptr0], #0x10]\n" "ldr temploadreg3, [%[b_ptr0], #0x18]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "str q28, [c_ptr2]\n" + "add c_ptr2, c_ptr2, #0x10\n" + "movi v28.4s, #0\n" "ldr d20, [%[b_ptr0], #0x20]\n" "ldr temploadreg0, [%[b_ptr0], #0x28]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "str q29, [c_ptr3]\n" + "add c_ptr3, c_ptr3, #0x10\n" + "movi v29.4s, #0\n" "ldr d21, [%[b_ptr0], #0x30]\n" "ldr temploadreg1, [%[b_ptr0], #0x38]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "str q30, [c_ptr4]\n" + "add c_ptr4, c_ptr4, #0x10\n" + "movi v30.4s, #0\n" "ldr d22, [%[b_ptr0], #0x40]\n" "ins v18.d[1], temploadreg2\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "str q31, [c_ptr5]\n" + "add c_ptr5, c_ptr5, #0x10\n" + "movi v31.4s, #0\n" "ldr temploadreg2, [%[b_ptr0], #0x48]\n" + ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" "ldr d23, [%[b_ptr0], #0x50]\n" + ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" "ins v19.d[1], temploadreg3\n" + ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" "ldr temploadreg3, [%[b_ptr0], #0x58]\n" + ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" "ldr d24, [%[b_ptr0], #0x60]\n" + ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" "ins v20.d[1], temploadreg0\n" + ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" "ldr temploadreg0, [%[b_ptr0], #0x68]\n" + ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" "ldr d25, [%[b_ptr0], #0x70]\n" + ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" "ins v21.d[1], temploadreg1\n" + ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" "ldr temploadreg1, [%[b_ptr0], #0x78]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" + ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" "ins v22.d[1], temploadreg2\n" + ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n" "ins v23.d[1], temploadreg3\n" + 
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n" "ins v24.d[1], temploadreg0\n" + ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n" "ins v25.d[1], temploadreg1\n" - "b.eq 7f\n" - "8:\n" - "str q26, [%[c_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "movi v26.4s, #0\n" + ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n" + "ldr d18, [%[b_ptr0]]\n" + ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n" "ldr temploadreg2, [%[b_ptr0], #0x8]\n" + ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n" + "ldr d19, [%[b_ptr0], #0x10]\n" + ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n" "ldr temploadreg3, [%[b_ptr0], #0x18]\n" + ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n" + "ldr d20, [%[b_ptr0], #0x20]\n" + ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #0x28]\n" + ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n" + "ins v18.d[1], temploadreg2\n" + ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n" + "ins v19.d[1], temploadreg3\n" + ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n" + "ins v20.d[1], temploadreg0\n" + ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n" + "add %[b_ptr0], %[b_ptr0], #0x30\n" + ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n" + ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n" + ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n" + ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n" + ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n" + ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n" + ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n" + ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n" + ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n" + ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n" + ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n" + ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n" + ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n" + ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n" + ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n" + ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n" + ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n" + ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n" + ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n" + ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n" + ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n" + ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n" + ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n" + ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n" + ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n" + ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n" + ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n" + ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n" + ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n" + ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n" + ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n" + ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n" + ".inst 0x4f85ea9b // sdot v27.4s, 
v20.16b, v5.4b[2]\n" + ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n" + ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n" + ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n" + ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n" + "b.ne 8b\n" + "7:\n" + "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" + "movi v26.4s, #0\n" + "ldr q18, [%[b_ptr0]]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" - "ldr temploadreg0, [%[b_ptr0], #0x28]\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + "ldr q22, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" + "ldr q24, [%[b_ptr0], #0x60]\n" + ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" - "ldr d18, [%[b_ptr0]]\n" - ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" - ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" - "ins v18.d[1], temploadreg2\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n" - "ldr d19, [%[b_ptr0], #0x10]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n" - "ins v19.d[1], temploadreg3\n" ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n" ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n" ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n" ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n" - "ldr d20, [%[b_ptr0], #0x20]\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n" "add %[b_ptr0], %[b_ptr0], #0x30\n" ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n" - "ins v20.d[1], temploadreg0\n" ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #0x8]\n" ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #0x18]\n" ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #0x28]\n" ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n" - "ldr d21, [%[b_ptr0], #0x30]\n" ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x38]\n" ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n" ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n" ".inst 
0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
- "ins v21.d[1], temploadreg1\n"
 ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
 ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr d22, [%[b_ptr0], #0x40]\n"
 ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
 ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
 ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
 ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
 ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
 ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr d23, [%[b_ptr0], #0x50]\n"
 ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
 ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
 ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
 ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
 ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
 ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr d24, [%[b_ptr0], #0x60]\n"
 ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
 ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
 ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
 ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
 ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
 ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x70]\n"
 ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
 ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
 ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
- "ins v25.d[1], temploadreg1\n"
 ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
 ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
 ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
 ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
 ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
 ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
- ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
- ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
- ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
- "ins v22.d[1], temploadreg2\n"
- ".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
- ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
- ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
- ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
- ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
- "ins v23.d[1], temploadreg3\n"
- "ins v20.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v24.d[1], temploadreg0\n"
- "b.ne 8b\n"
- "7:\n"
- "str q26, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
+ ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
+ ".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
+ ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
+ ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
+ ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
+ ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
+ "b 9f\n"
+ "6:\n"
 "movi v26.4s, #0\n"
- "str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
 "movi v27.4s, #0\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
- "str q28, [c_ptr2]\n"
 "movi v28.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
- "str q29, [c_ptr3]\n"
 "movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "str q30, [c_ptr4]\n"
 "movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "str q31, [c_ptr5]\n"
 "movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
 ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
 ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
 "ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
 ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
 ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
 ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1350,19 +1529,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
 ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
 ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
 ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
- "6:\n"
+ "9:\n"
 "str q26, [%[c_ptr0]]\n"
 "add %[c_ptr0], %[c_ptr0], #0x10\n"
 "str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
 "str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
 "str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
 "str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
 "str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
 ".unreq a_ptr1\n"
 ".unreq a_ptr2\n"
 ".unreq a_ptr3\n"
@@ -1429,6 +1603,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
 "add c_ptr1, %[c_ptr0], #0x0\n"
 "add a_ptr1, %[a_ptr0], #0x0\n"
 "1:\n"
+ "cbnz %[odds], 2f\n"
 "ldr q0, [%[a_ptr0]], #0x10\n"
 "ldr q3, [a_ptr1], #0x10\n"
 "ldr q6, [a_ptr2], #0x10\n"
@@ -1441,7 +1616,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
 "ldr q10, [a_ptr3], #0x10\n"
 "ldr q13, [a_ptr4], #0x10\n"
 "ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
 "ldr q2, [%[a_ptr0]]\n"
 "ldr q5, [a_ptr1]\n"
 "ldr q8, [a_ptr2]\n"
@@ -1450,8 +1624,21 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
 "ldr q17, [a_ptr5]\n"
 "b 3f\n"
 "2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
 "ldr d2, [%[a_ptr0]], #0x8\n"
+ "ldr q13, [a_ptr4], #0x10\n"
 "ldr d5, [a_ptr1], #0x8\n"
+ "ldr q16, [a_ptr5], #0x10\n"
 "ldr d8, [a_ptr2], #0x8\n"
 "ldr d11, [a_ptr3], #0x8\n"
 "ldr d14, [a_ptr4], #0x8\n"
@@ -1462,7 +1649,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
 "ld1 {v11.s}[2], [a_ptr3], #4\n"
 "ld1 {v14.s}[2], [a_ptr4], #4\n"
 "ld1 {v17.s}[2], [a_ptr5], #4\n"
- "subs %[odds], %[odds], #0x1\n"
 "b.ne 4f\n"
 "ld1 {v2.b}[12], [%[a_ptr0]]\n"
 "ld1 {v5.b}[12], [a_ptr1]\n"
@@ -1489,38 +1675,40 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
 "ld1 {v14.b}[14], [a_ptr4]\n"
 "ld1 {v17.b}[14], [a_ptr5]\n"
 "3:\n"
- "movi v26.4s, #0\n"
 "ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
 "ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
 "ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
 "ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
 "ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
 "ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
 "ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
 "ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
 ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
 ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
 ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
 ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
 ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
 ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
 ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
 ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
 ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
 ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
 "ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1587,198 +1775,265 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
 ".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
 ".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
 ".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr d18, [%[b_ptr0]]\n"
+ "b.eq 7f\n"
+ "8:\n"
+ "str q26, [%[c_ptr0]]\n"
 "subs %[loops], %[loops], #0x1\n"
+ "movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0]]\n"
 "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "str q27, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
 "ldr d19, [%[b_ptr0], #0x10]\n"
 "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
+ "movi v28.4s, #0\n"
 "ldr d20, [%[b_ptr0], #0x20]\n"
 "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ "movi v29.4s, #0\n"
 "ldr d21, [%[b_ptr0], #0x30]\n"
 "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "str q30, [c_ptr4]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ "movi v30.4s, #0\n"
 "ldr d22, [%[b_ptr0], #0x40]\n"
 "ins v18.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "str q31, [c_ptr5]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
+ "movi v31.4s, #0\n"
 "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
 "ldr d23, [%[b_ptr0], #0x50]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
 "ins v19.d[1], temploadreg3\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
 "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
 "ldr d24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
 "ins v20.d[1], temploadreg0\n"
+ ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
 "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
 "ldr d25, [%[b_ptr0], #0x70]\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
 "ins v21.d[1], temploadreg1\n"
+ ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
 "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
 "ins v22.d[1], temploadreg2\n"
+ ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
 "ins v23.d[1], temploadreg3\n"
+ ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
 "ins v24.d[1], temploadreg0\n"
- "b.eq 7f\n"
- "8:\n"
- "str q26, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "movi v26.4s, #0\n"
+ ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
 "ins v25.d[1], temploadreg1\n"
+ ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ "ldr d18, [%[b_ptr0]]\n"
+ ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
 "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+ "ldr d19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+ "ldr d20, [%[b_ptr0], #0x20]\n"
+ ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+ "ldr d21, [%[b_ptr0], #0x30]\n"
+ ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+ "ins v20.d[1], temploadreg0\n"
+ ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+ "ins v21.d[1], temploadreg1\n"
+ ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
+ ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
+ ".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
+ ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
+ ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
+ ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
+ ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
+ ".inst 0x4fa2eaba // sdot v26.4s, v21.16b, v2.4b[3]\n"
+ ".inst 0x4fa5eabb // sdot v27.4s, v21.16b, v5.4b[3]\n"
+ ".inst 0x4fa8eabc // sdot v28.4s, v21.16b, v8.4b[3]\n"
+ ".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
+ ".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
+ ".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
+ "b.ne 8b\n"
+ "7:\n"
+ "str q26, [%[c_ptr0]]\n"
 "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
 "str q27, [c_ptr1]\n"
 "add c_ptr1, c_ptr1, #0x10\n"
 "movi v27.4s, #0\n"
- "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
 ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
 "str q28, [c_ptr2]\n"
 "movi v28.4s, #0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
 "add c_ptr2, c_ptr2, #0x10\n"
 ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
 "str q29, [c_ptr3]\n"
 "movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
 ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
 "str q30, [c_ptr4]\n"
 "movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
 ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
 "str q31, [c_ptr5]\n"
 "movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
 ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
 ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
 ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
 ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
 ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
 ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
 ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
 ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
 ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
 ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
 ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
 ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
 ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
 ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
 ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
- "ins v20.d[1], temploadreg0\n"
 ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
 ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
 ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr d21, [%[b_ptr0], #0x30]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
 ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
 "add %[b_ptr0], %[b_ptr0], #0x40\n"
 ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
- "ins v21.d[1], temploadreg1\n"
 ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
 ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
 ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
 ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
 ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
- "ldr d22, [%[b_ptr0], #0x40]\n"
 ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
 ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
 ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
 ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
 ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr d23, [%[b_ptr0], #0x50]\n"
 ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
 ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
 ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
 ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
 ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
 ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr d24, [%[b_ptr0], #0x60]\n"
 ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
 ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
 ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
 ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
 ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
 ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x70]\n"
 ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
 ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
 ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
 ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
 ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
 ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
 ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
 ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
 ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
 ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
 ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
 ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
 ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
- "ins v22.d[1], temploadreg2\n"
 ".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
 ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
 ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
 ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
 ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
 ".inst 0x4fa2eaba // sdot v26.4s, v21.16b, v2.4b[3]\n"
- "ins v23.d[1], temploadreg3\n"
 ".inst 0x4fa5eabb // sdot v27.4s, v21.16b, v5.4b[3]\n"
 ".inst 0x4fa8eabc // sdot v28.4s, v21.16b, v8.4b[3]\n"
- "ins v20.d[1], temploadreg0\n"
 ".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
 ".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
 ".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
- "ldr d21, [%[b_ptr0], #0x30]\n"
- "ins v24.d[1], temploadreg0\n"
- "ins v21.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "b.ne 8b\n"
- "7:\n"
- "str q26, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "b 9f\n"
+ "6:\n"
 "movi v26.4s, #0\n"
- "ins v25.d[1], temploadreg1\n"
- "str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
 "movi v27.4s, #0\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
- "str q28, [c_ptr2]\n"
 "movi v28.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
- "str q29, [c_ptr3]\n"
 "movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "str q30, [c_ptr4]\n"
 "movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "str q31, [c_ptr5]\n"
 "movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
 ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
 ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
 "ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
 ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
 ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
 ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1848,19 +2103,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
 ".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
 ".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
 ".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
- "6:\n"
+ "9:\n"
 "str q26, [%[c_ptr0]]\n"
 "add %[c_ptr0], %[c_ptr0], #0x10\n"
 "str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
 "str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
 "str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
 "str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
 "str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
 ".unreq a_ptr1\n"
 ".unreq a_ptr2\n"
 ".unreq a_ptr3\n"
@@ -1927,6 +2177,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
 "add c_ptr1, %[c_ptr0], #0x0\n"
 "add a_ptr1, %[a_ptr0], #0x0\n"
 "1:\n"
+ "cbnz %[odds], 2f\n"
 "ldr q0, [%[a_ptr0]], #0x10\n"
 "ldr q4, [a_ptr1], #0x10\n"
 "ldr q8, [a_ptr2], #0x10\n"
@@ -1943,18 +2194,35 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
 "ldr q6, [a_ptr1], #0x10\n"
 "ldr q10, [a_ptr2], #0x10\n"
 "ldr q14, [a_ptr3], #0x10\n"
- "ldr q18, [a_ptr4], #0x10\n"
- "ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
 "ldr s3, [%[a_ptr0]]\n"
+ "ldr q18, [a_ptr4], #0x10\n"
 "ldr s7, [a_ptr1]\n"
+ "ldr q22, [a_ptr5], #0x10\n"
 "ldr s11, [a_ptr2]\n"
 "ldr s15, [a_ptr3]\n"
 "ldr s19, [a_ptr4]\n"
 "ldr s23, [a_ptr5]\n"
 "b 3f\n"
 "2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
 "subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
 "b.ne 4f\n"
 "ldr b3, [%[a_ptr0]]\n"
 "ldr b7, [a_ptr1]\n"
@@ -1981,24 +2249,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
 "ld1 {v19.b}[2], [a_ptr4]\n"
 "ld1 {v23.b}[2], [a_ptr5]\n"
 "3:\n"
- "movi v26.4s, #0\n"
 "ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
 "ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
 "movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
 "movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
 "movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
 "movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
 ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
 ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
 ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
 ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
 ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
 ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2091,57 +2361,55 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
 ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
 ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
 ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr d24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
- "ins v25.d[1], temploadreg1\n"
 "b.eq 7f\n"
 "8:\n"
 "str q26, [%[c_ptr0]]\n"
 "subs %[loops], %[loops], #0x1\n"
 "movi v26.4s, #0\n"
+ "ldr d24, [%[b_ptr0]]\n"
 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
 "add %[c_ptr0], %[c_ptr0], #0x10\n"
 "str q27, [c_ptr1]\n"
 "add c_ptr1, c_ptr1, #0x10\n"
 "movi v27.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ "ldr d25, [%[b_ptr0], #0x10]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
 "str q28, [c_ptr2]\n"
- "movi v28.4s, #0\n"
 "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ins v24.d[1], temploadreg0\n"
+ "ins v25.d[1], temploadreg1\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
 "str q29, [c_ptr3]\n"
- "movi v29.4s, #0\n"
 "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
 "str q30, [c_ptr4]\n"
 "movi v30.4s, #0\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
 "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
 "str q31, [c_ptr5]\n"
 "movi v31.4s, #0\n"
 "add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
 ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
 "ldr d24, [%[b_ptr0]]\n"
 ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
 ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
 "ins v24.d[1], temploadreg0\n"
 ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
 ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
 "ldr d25, [%[b_ptr0], #0x10]\n"
 ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
@@ -2235,50 +2503,149 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
 ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
 "ins v24.d[1], temploadreg0\n"
 ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
 ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
 ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
 ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
 ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
 ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
 ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
- "ins v25.d[1], temploadreg1\n"
 ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
 ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
 ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
- "ldr d24, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
 "b.ne 8b\n"
 "7:\n"
 "str q26, [%[c_ptr0]]\n"
 "add %[c_ptr0], %[c_ptr0], #0x10\n"
 "movi v26.4s, #0\n"
- "str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "str q27, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ "str q28, [c_ptr2]\n"
+ "movi v28.4s, #0\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ "str q29, [c_ptr3]\n"
+ "movi v29.4s, #0\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ "str q30, [c_ptr4]\n"
+ "movi v30.4s, #0\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ "str q31, [c_ptr5]\n"
+ "movi v31.4s, #0\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+ ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+ ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v26.4s, #0\n"
 "movi v27.4s, #0\n"
- ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
- "str q28, [c_ptr2]\n"
 "movi v28.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
- "str q29, [c_ptr3]\n"
 "movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
- "str q30, [c_ptr4]\n"
 "movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
- "str q31, [c_ptr5]\n"
 "movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
 ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
- ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
 ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
 "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
 ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
 ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
 ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
@@ -2366,19 +2733,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
 ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
 ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
 ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
- "6:\n"
+ "9:\n"
 "str q26, [%[c_ptr0]]\n"
 "add %[c_ptr0], %[c_ptr0], #0x10\n"
 "str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
 "str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
 "str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
 "str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
 "str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
 ".unreq a_ptr1\n"
 ".unreq a_ptr2\n"
 ".unreq a_ptr3\n"
@@ -2445,6 +2807,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
 "add c_ptr1, %[c_ptr0], #0x0\n"
 "add a_ptr1, %[a_ptr0], #0x0\n"
 "1:\n"
+ "cbnz %[odds], 2f\n"
 "ldr q0, [%[a_ptr0]], #0x10\n"
 "ldr q4, [a_ptr1], #0x10\n"
 "ldr q8, [a_ptr2], #0x10\n"
@@ -2461,24 +2824,41 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
 "ldr q6, [a_ptr1], #0x10\n"
 "ldr q10, [a_ptr2], #0x10\n"
 "ldr q14, [a_ptr3], #0x10\n"
- "ldr q18, [a_ptr4], #0x10\n"
- "ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
 "ldr d3, [%[a_ptr0]]\n"
+ "ldr q18, [a_ptr4], #0x10\n"
 "ldr d7, [a_ptr1]\n"
+ "ldr q22, [a_ptr5], #0x10\n"
 "ldr d11, [a_ptr2]\n"
 "ldr d15, [a_ptr3]\n"
 "ldr d19, [a_ptr4]\n"
 "ldr d23, [a_ptr5]\n"
 "b 3f\n"
 "2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
 "ldr s3, [%[a_ptr0]], #0x4\n"
+ "ldr q18, [a_ptr4], #0x10\n"
 "ldr s7, [a_ptr1], #0x4\n"
+ "ldr q22, [a_ptr5], #0x10\n"
 "ldr s11, [a_ptr2], #0x4\n"
 "ldr s15, [a_ptr3], #0x4\n"
 "ldr s19, [a_ptr4], #0x4\n"
 "ldr s23, [a_ptr5], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
 "b.ne 4f\n"
 "ld1 {v3.b}[4], [%[a_ptr0]]\n"
 "ld1 {v7.b}[4], [a_ptr1]\n"
@@ -2505,24 +2885,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
 "ld1 {v19.b}[6], [a_ptr4]\n"
 "ld1 {v23.b}[6], [a_ptr5]\n"
 "3:\n"
- "movi v26.4s, #0\n"
 "ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
 "ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
 "movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
 "movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
 "movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
 "movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
 ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
 ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
 ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
 ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
 ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
 ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2622,68 +3004,66 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
 ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
 ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
 ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr d24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
 "b.eq 7f\n"
 "8:\n"
 "str q26, [%[c_ptr0]]\n"
 "subs %[loops], %[loops], #0x1\n"
 "movi v26.4s, #0\n"
- "ins v25.d[1], temploadreg1\n"
+ "ldr d24, [%[b_ptr0]]\n"
 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
 "add %[c_ptr0], %[c_ptr0], #0x10\n"
 "str q27, [c_ptr1]\n"
 "add c_ptr1, c_ptr1, #0x10\n"
 "movi v27.4s, #0\n"
+ "ldr d25, [%[b_ptr0], #0x10]\n"
 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
 "str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
 "movi v28.4s, #0\n"
+ "ins v24.d[1], temploadreg0\n"
+ "ins v25.d[1], temploadreg1\n"
 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
 "str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
 "movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
 "str q30, [c_ptr4]\n"
 "movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
 "str q31, [c_ptr5]\n"
 "movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
 "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
 ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
 "ldr d24, [%[b_ptr0]]\n"
 ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
 ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
 "ins v24.d[1], temploadreg0\n"
 ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
 ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
 ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
 ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
 "ldr d25, [%[b_ptr0], #0x10]\n"
 ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
 ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
 "ins v25.d[1], temploadreg1\n"
 ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
 ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
 "ldr d24, [%[b_ptr0]]\n"
 ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
@@ -2727,98 +3107,204 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
 ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
 ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
 ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
- "ins v24.d[1], temploadreg0\n"
+ "ins v24.d[1], temploadreg0\n"
+ ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr d25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+ "ins v25.d[1], temploadreg1\n"
+ ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr d24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+ "ins v24.d[1], temploadreg0\n"
+ ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr d25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+ "ins v25.d[1], temploadreg1\n"
+ ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr d24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+ "ins v24.d[1], temploadreg0\n"
+ ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr d25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+ "ins v25.d[1], temploadreg1\n"
+ ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+ ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
+ ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+ "b.ne 8b\n"
+ "7:\n"
+ "str q26, [%[c_ptr0]]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "str q27, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ "str q28, [c_ptr2]\n"
+ "movi v28.4s, #0\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ "str q29, [c_ptr3]\n"
+ "movi v29.4s, #0\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ "str q30, [c_ptr4]\n"
+ "movi v30.4s, #0\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ "str q31, [c_ptr5]\n"
+ "movi v31.4s, #0\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" + ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n" + ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n" + ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n" + ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n" + ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n" + ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n" + ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n" + ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n" + ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n" + ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n" + ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n" + ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n" + ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n" + ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n" + ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n" + ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n" + ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n" + ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n" + ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n" + ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n" + ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n" + ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n" + ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n" + ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n" + ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n" ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n" ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n" ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n" - "ldr d25, [%[b_ptr0], #0x10]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n" "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n" - "ins v25.d[1], temploadreg1\n" ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n" ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n" - "ldr d24, [%[b_ptr0]]\n" + "ldr q24, [%[b_ptr0]]\n" ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n" ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, 
v6.4b[1]\n" ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n" - "ins v24.d[1], temploadreg0\n" ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n" ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n" ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n" - "ldr d25, [%[b_ptr0], #0x10]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n" "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n" - "ins v25.d[1], temploadreg1\n" ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n" ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n" - "ldr d24, [%[b_ptr0]]\n" + "ldr q24, [%[b_ptr0]]\n" ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n" ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n" ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n" - "ins v24.d[1], temploadreg0\n" ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n" ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n" ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n" - "ldr d25, [%[b_ptr0], #0x10]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n" "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n" - "ins v25.d[1], temploadreg1\n" ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n" ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n" - "ldr d24, [%[b_ptr0]]\n" ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n" ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n" ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n" - "ins v24.d[1], temploadreg0\n" ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n" ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n" ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n" - "ldr d25, [%[b_ptr0], #0x10]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - "b.ne 8b\n" - "7:\n" - "str q26, [%[c_ptr0]]\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" + "b 9f\n" + "6:\n" "movi v26.4s, #0\n" - "ins v25.d[1], temploadreg1\n" - "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" - ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" - "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" - "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" - "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" - "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" + ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" + ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" - ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n" ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n" "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n" ".inst 0x4fa4e33b // sdot v27.4s, 
v25.16b, v4.4b[1]\n" ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n" ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n" @@ -2913,19 +3399,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n" ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n" ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n" - "6:\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -3052,24 +3533,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t "ld1 {v19.b}[10], [a_ptr4]\n" "ld1 {v23.b}[10], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q24, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q25, [%[b_ptr0], #0x10]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n" @@ -3177,57 +3660,55 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n" ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n" ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n" - "cbz %[loops], 6f\n" - "ldr d24, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "ldr d25, [%[b_ptr0], #0x10]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - "ins v24.d[1], temploadreg0\n" - "ins v25.d[1], temploadreg1\n" "b.eq 7f\n" "8:\n" "str q26, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v26.4s, #0\n" + "ldr d24, [%[b_ptr0]]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" - ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" + "ldr d25, [%[b_ptr0], #0x10]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" "str q28, [c_ptr2]\n" - "movi v28.4s, #0\n" "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" + "movi v28.4s, #0\n" + "ins v24.d[1], temploadreg0\n" + "ins v25.d[1], temploadreg1\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "str q29, [c_ptr3]\n" - "movi v29.4s, #0\n" "add 
c_ptr3, c_ptr3, #0x10\n" - ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" + "movi v29.4s, #0\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" + ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" "add c_ptr5, c_ptr5, #0x10\n" - ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" + ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n" "ldr d24, [%[b_ptr0]]\n" ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n" "ins v24.d[1], temploadreg0\n" ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" - ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n" "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" - ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n" + ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n" "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n" ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n" "ldr d25, [%[b_ptr0], #0x10]\n" ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n" @@ -3340,50 +3821,164 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n" "ins v24.d[1], temploadreg0\n" ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n" ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n" - "ldr d25, [%[b_ptr0], #0x10]\n" ".inst 0x4f83eb1a // sdot v26.4s, v24.16b, v3.4b[2]\n" ".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n" ".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n" - "ins v25.d[1], temploadreg1\n" ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n" ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n" ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n" - "ldr d24, [%[b_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - "ins v24.d[1], temploadreg0\n" "b.ne 8b\n" "7:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v26.4s, #0\n" - "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" + "ldr q24, [%[b_ptr0]]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + "str q27, [c_ptr1]\n" + "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" + ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" + "str q28, [c_ptr2]\n" + "movi v28.4s, #0\n" + "add c_ptr2, c_ptr2, #0x10\n" + ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" + "str q29, [c_ptr3]\n" + "movi v29.4s, #0\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" + "str q30, [c_ptr4]\n" + "movi v30.4s, #0\n" + "add c_ptr4, c_ptr4, #0x10\n" + ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" + "str q31, [c_ptr5]\n" + "movi v31.4s, #0\n" + "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x4f90e31e // sdot 
v30.4s, v24.16b, v16.4b[0]\n" + ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n" + ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n" + ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n" + ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n" + ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n" + ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n" + ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n" + ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n" + ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n" + ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n" + ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n" + ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n" + ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n" + ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n" + ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n" + ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n" + ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n" + ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n" + ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n" + ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n" + ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n" + ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n" + ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n" + ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n" + ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n" + ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n" + ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n" + ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n" + ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n" + ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n" + ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n" + ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n" + ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n" + ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n" + ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n" + ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, 
v18.4b[1]\n" + ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n" + ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n" + ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n" + ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n" + ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n" + ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n" + ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n" + ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n" + ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n" + ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n" + ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n" + ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n" + ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n" + ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" + ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n" + ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n" + ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n" + ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n" + ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n" + ".inst 0x4f83eb1a // sdot v26.4s, v24.16b, v3.4b[2]\n" + ".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n" + ".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n" + ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n" + ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n" + ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n" + "b 9f\n" + "6:\n" + "movi v26.4s, #0\n" "movi v27.4s, #0\n" - ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" - "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" - "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" - "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" - "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" + ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" + ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" - ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n" ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n" "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n" ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n" ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n" ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n" @@ -3486,19 +4081,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n" ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n" ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n" - 
"6:\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -3566,6 +4156,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q4, [a_ptr1], #0x10\n" "ldr q8, [a_ptr2], #0x10\n" @@ -3584,7 +4175,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t "ldr q14, [a_ptr3], #0x10\n" "ldr q18, [a_ptr4], #0x10\n" "ldr q22, [a_ptr5], #0x10\n" - "cbnz %[odds], 2f\n" "ldr q3, [%[a_ptr0]]\n" "ldr q7, [a_ptr1]\n" "ldr q11, [a_ptr2]\n" @@ -3593,8 +4183,27 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t "ldr q23, [a_ptr5]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" + "subs %[odds], %[odds], #0x1\n" + "ldr q4, [a_ptr1], #0x10\n" + "ldr q8, [a_ptr2], #0x10\n" + "ldr q12, [a_ptr3], #0x10\n" + "ldr q16, [a_ptr4], #0x10\n" + "ldr q20, [a_ptr5], #0x10\n" + "ldr q1, [%[a_ptr0]], #0x10\n" + "ldr q5, [a_ptr1], #0x10\n" + "ldr q9, [a_ptr2], #0x10\n" + "ldr q13, [a_ptr3], #0x10\n" + "ldr q17, [a_ptr4], #0x10\n" + "ldr q21, [a_ptr5], #0x10\n" + "ldr q2, [%[a_ptr0]], #0x10\n" + "ldr q6, [a_ptr1], #0x10\n" + "ldr q10, [a_ptr2], #0x10\n" + "ldr q14, [a_ptr3], #0x10\n" "ldr d3, [%[a_ptr0]], #0x8\n" + "ldr q18, [a_ptr4], #0x10\n" "ldr d7, [a_ptr1], #0x8\n" + "ldr q22, [a_ptr5], #0x10\n" "ldr d11, [a_ptr2], #0x8\n" "ldr d15, [a_ptr3], #0x8\n" "ldr d19, [a_ptr4], #0x8\n" @@ -3605,7 +4214,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t "ld1 {v15.s}[2], [a_ptr3], #4\n" "ld1 {v19.s}[2], [a_ptr4], #4\n" "ld1 {v23.s}[2], [a_ptr5], #4\n" - "subs %[odds], %[odds], #0x1\n" "b.ne 4f\n" "ld1 {v3.b}[12], [%[a_ptr0]]\n" "ld1 {v7.b}[12], [a_ptr1]\n" @@ -3632,24 +4240,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t "ld1 {v19.b}[14], [a_ptr4]\n" "ld1 {v23.b}[14], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q24, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q25, [%[b_ptr0], #0x10]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n" @@ -3764,68 +4374,66 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, 
int lda, const int8_t ".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n" ".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n" ".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n" - "cbz %[loops], 6f\n" - "ldr d24, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "ldr d25, [%[b_ptr0], #0x10]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - "ins v24.d[1], temploadreg0\n" "b.eq 7f\n" "8:\n" "str q26, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v26.4s, #0\n" - "ins v25.d[1], temploadreg1\n" + "ldr d24, [%[b_ptr0]]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" + "ldr d25, [%[b_ptr0], #0x10]\n" "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" "str q28, [c_ptr2]\n" + "add c_ptr2, c_ptr2, #0x10\n" "movi v28.4s, #0\n" + "ins v24.d[1], temploadreg0\n" + "ins v25.d[1], temploadreg1\n" "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" - ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" "str q29, [c_ptr3]\n" + "add c_ptr3, c_ptr3, #0x10\n" "movi v29.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" + "add c_ptr4, c_ptr4, #0x10\n" + ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n" "ldr d24, [%[b_ptr0]]\n" ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n" "ins v24.d[1], temploadreg0\n" ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n" "ldr d25, [%[b_ptr0], #0x10]\n" ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n" "ins v25.d[1], temploadreg1\n" ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n" + ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n" "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n" ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n" "ldr d24, 
[%[b_ptr0]]\n" ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n" @@ -3936,27 +4544,23 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t ".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n" "ins v25.d[1], temploadreg1\n" ".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n" ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n" - "ldr d24, [%[b_ptr0]]\n" ".inst 0x4fa3eb3a // sdot v26.4s, v25.16b, v3.4b[3]\n" ".inst 0x4fa7eb3b // sdot v27.4s, v25.16b, v7.4b[3]\n" ".inst 0x4fabeb3c // sdot v28.4s, v25.16b, v11.4b[3]\n" - "ins v24.d[1], temploadreg0\n" ".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n" ".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n" ".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n" - "ldr d25, [%[b_ptr0], #0x10]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" "b.ne 8b\n" "7:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v26.4s, #0\n" - "ins v25.d[1], temploadreg1\n" + "ldr q24, [%[b_ptr0]]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" @@ -4089,19 +4693,139 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t ".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n" ".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n" ".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n" + "b 9f\n" "6:\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" + ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" + ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" + ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" + ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n" + ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n" + ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n" + ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n" + ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n" + ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n" + ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n" + ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n" + ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n" + ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n" + ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n" + ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n" + ".inst 0x4f8de31d // sdot v29.4s, 
v24.16b, v13.4b[0]\n" + ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n" + ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n" + ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n" + ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n" + ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n" + ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n" + ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n" + ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n" + ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n" + ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n" + ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n" + ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n" + ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n" + ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n" + ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n" + ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n" + ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n" + ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n" + ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n" + ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n" + ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n" + ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n" + ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n" + ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n" + ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n" + ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n" + ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n" + ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n" + ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n" + ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n" + ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n" + ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n" + ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n" + ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n" + ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n" + ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n" + ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n" + ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n" + ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n" + ".inst 0x4fafe33d // sdot v29.4s, v25.16b, 
v15.4b[1]\n" + ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n" + ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f83eb1a // sdot v26.4s, v24.16b, v3.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n" + ".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n" + ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n" + ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n" + ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n" + ".inst 0x4fa3eb3a // sdot v26.4s, v25.16b, v3.4b[3]\n" + ".inst 0x4fa7eb3b // sdot v27.4s, v25.16b, v7.4b[3]\n" + ".inst 0x4fabeb3c // sdot v28.4s, v25.16b, v11.4b[3]\n" + ".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n" + ".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n" + ".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/generic.cpp similarity index 80% rename from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/generic.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/generic.cpp index 88ad36a27a..9ff39719f7 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -33,7 +33,7 @@ namespace arm_gemm { -void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) { +void a64_smallK_hybrid_s8s32_dot_6x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) { const long loops_count = iceildiv(N, (int)4) - 1; const long ldab = lda * sizeof(int8_t); const long ldcb = ldc * sizeof(int32_t); @@ -93,6 +93,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q3, [a_ptr1], #0x10\n" "ldr q6, [a_ptr2], #0x10\n" @@ -103,18 +104,29 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "ldr q4, [a_ptr1], #0x10\n" "ldr q7, [a_ptr2], #0x10\n" "ldr q10, [a_ptr3], #0x10\n" - "ldr q13, [a_ptr4], #0x10\n" - "ldr q16, [a_ptr5], #0x10\n" - "cbnz %[odds], 2f\n" "ldr s2, [%[a_ptr0]]\n" + "ldr q13, [a_ptr4], #0x10\n" "ldr s5, [a_ptr1]\n" + "ldr q16, [a_ptr5], #0x10\n" "ldr s8, [a_ptr2]\n" "ldr s11, [a_ptr3]\n" "ldr s14, [a_ptr4]\n" "ldr s17, [a_ptr5]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" "subs %[odds], %[odds], #0x1\n" + "ldr q3, [a_ptr1], #0x10\n" + "ldr q6, [a_ptr2], #0x10\n" + "ldr q9, [a_ptr3], #0x10\n" + "ldr q12, [a_ptr4], #0x10\n" + "ldr q15, [a_ptr5], #0x10\n" + "ldr q1, [%[a_ptr0]], #0x10\n" + "ldr q4, [a_ptr1], #0x10\n" + "ldr q7, [a_ptr2], #0x10\n" + "ldr q10, [a_ptr3], #0x10\n" + "ldr q13, [a_ptr4], #0x10\n" + "ldr q16, [a_ptr5], #0x10\n" "b.ne 4f\n" "ldr b2, [%[a_ptr0]]\n" "ldr b5, [a_ptr1]\n" @@ -141,40 +153,42 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "ld1 {v14.b}[2], [a_ptr4]\n" "ld1 {v17.b}[2], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q18, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q19, [%[b_ptr0], #0x10]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "ldr q20, [%[b_ptr0], #0x20]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "ldr q21, [%[b_ptr0], #0x30]\n" - "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" "ldr q22, [%[b_ptr0], #0x40]\n" - "movi v31.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" "ldr q23, [%[b_ptr0], #0x50]\n" - ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" "ldr q24, [%[b_ptr0], #0x60]\n" - ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" "ldr q25, [%[b_ptr0], #0x70]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" ".inst 0x4fa3e27b // sdot v27.4s, 
v19.16b, v3.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" - "ldr q18, [%[b_ptr0]]\n" ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n" ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n" ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n" @@ -218,139 +232,201 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n" ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n" ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n" - "cbz %[loops], 6f\n" - "ldr q18, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr q19, [%[b_ptr0], #0x10]\n" - "ldr q20, [%[b_ptr0], #0x20]\n" - "ldr q21, [%[b_ptr0], #0x30]\n" - "ldr q22, [%[b_ptr0], #0x40]\n" - "ldr q23, [%[b_ptr0], #0x50]\n" - "ldr q24, [%[b_ptr0], #0x60]\n" - "ldr q25, [%[b_ptr0], #0x70]\n" "b.eq 7f\n" "8:\n" "str q26, [%[c_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "movi v26.4s, #0\n" "subs %[loops], %[loops], #0x1\n" - "str q27, [c_ptr1]\n" + "movi v26.4s, #0\n" + "ldr q18, [%[b_ptr0]]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v27.4s, #0\n" + "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + "ldr q22, [%[b_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" + "ldr q24, [%[b_ptr0], #0x60]\n" + ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" - "ldr q18, [%[b_ptr0]]\n" - ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" - ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n" - "ldr q19, [%[b_ptr0], #0x10]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, 
v6.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n" ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n" - "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n" ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n" ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n" ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n" ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n" ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n" - "ldr q21, [%[b_ptr0], #0x30]\n" ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n" ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n" ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n" ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n" ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n" ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n" - "ldr q22, [%[b_ptr0], #0x40]\n" ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n" ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n" ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n" ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n" ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n" ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n" - "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n" ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n" ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n" ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n" ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n" ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n" - "ldr q24, [%[b_ptr0], #0x60]\n" ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n" ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n" ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n" ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n" ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n" ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n" - "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n" ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n" ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n" ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n" ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n" ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n" - "ldr q18, [%[b_ptr0]]\n" "b.ne 8b\n" "7:\n" "str q26, [%[c_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "movi v26.4s, #0\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" + "movi v26.4s, #0\n" + "ldr q18, [%[b_ptr0]]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + "ldr q22, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" + "ldr q24, [%[b_ptr0], #0x60]\n" + ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x4fa3e27b // sdot v27.4s, 
v19.16b, v3.4b[1]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" + "add c_ptr4, c_ptr4, #0x10\n" + ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" + "ldr q18, [%[b_ptr0]]\n" + ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" + ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n" + ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n" + ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n" + ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n" + ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n" + ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n" + ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n" + ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n" + ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n" + ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n" + ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n" + ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n" + ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n" + ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n" + ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n" + ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n" + ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n" + ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n" + ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n" + ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n" + ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n" + ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n" + ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n" + ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n" + ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n" + ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n" + ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n" + ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n" + ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n" + ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n" + ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n" + ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n" + ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n" + ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n" + ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n" + ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n" + ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n" + ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n" + ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n" + ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n" + ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n" + "b 9f\n" + "6:\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" + ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" + ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" - ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" "ldr q18, 
[%[b_ptr0]]\n" - ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" + ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" "add %[b_ptr0], %[b_ptr0], #0x10\n" + ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n" @@ -397,19 +473,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n" ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n" ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n" - "6:\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -468,6 +539,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q3, [a_ptr1], #0x10\n" "ldr q6, [a_ptr2], #0x10\n" @@ -478,24 +550,35 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "ldr q4, [a_ptr1], #0x10\n" "ldr q7, [a_ptr2], #0x10\n" "ldr q10, [a_ptr3], #0x10\n" - "ldr q13, [a_ptr4], #0x10\n" - "ldr q16, [a_ptr5], #0x10\n" - "cbnz %[odds], 2f\n" "ldr d2, [%[a_ptr0]]\n" + "ldr q13, [a_ptr4], #0x10\n" "ldr d5, [a_ptr1]\n" + "ldr q16, [a_ptr5], #0x10\n" "ldr d8, [a_ptr2]\n" "ldr d11, [a_ptr3]\n" "ldr d14, [a_ptr4]\n" "ldr d17, [a_ptr5]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" + "subs %[odds], %[odds], #0x1\n" + "ldr q3, [a_ptr1], #0x10\n" + "ldr q6, [a_ptr2], #0x10\n" + "ldr q9, [a_ptr3], #0x10\n" + "ldr q12, [a_ptr4], #0x10\n" + "ldr q15, [a_ptr5], #0x10\n" + "ldr q1, [%[a_ptr0]], #0x10\n" + "ldr q4, [a_ptr1], #0x10\n" + "ldr q7, [a_ptr2], #0x10\n" + "ldr q10, [a_ptr3], #0x10\n" "ldr s2, [%[a_ptr0]], #0x4\n" + "ldr q13, [a_ptr4], #0x10\n" "ldr s5, [a_ptr1], #0x4\n" + "ldr q16, [a_ptr5], #0x10\n" "ldr s8, [a_ptr2], #0x4\n" "ldr s11, [a_ptr3], #0x4\n" "ldr s14, [a_ptr4], #0x4\n" "ldr s17, [a_ptr5], #0x4\n" - "subs %[odds], %[odds], #0x1\n" "b.ne 4f\n" "ld1 {v2.b}[4], [%[a_ptr0]]\n" "ld1 {v5.b}[4], [a_ptr1]\n" @@ -522,38 +605,40 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "ld1 {v14.b}[6], [a_ptr4]\n" "ld1 {v17.b}[6], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q18, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q19, [%[b_ptr0], #0x10]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "ldr q20, [%[b_ptr0], #0x20]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "ldr q21, [%[b_ptr0], #0x30]\n" - "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" "ldr q22, [%[b_ptr0], #0x40]\n" - "movi v31.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" "ldr q23, [%[b_ptr0], #0x50]\n" - ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" "ldr q24, [%[b_ptr0], #0x60]\n" - ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" "ldr q25, [%[b_ptr0], #0x70]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" + "movi v28.4s, 
#0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" - "ldr q18, [%[b_ptr0]]\n" ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n" ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n" "ldr q19, [%[b_ptr0], #0x10]\n" @@ -606,144 +691,213 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n" ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n" ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n" - "cbz %[loops], 6f\n" - "ldr q18, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr q19, [%[b_ptr0], #0x10]\n" - "ldr q20, [%[b_ptr0], #0x20]\n" - "ldr q21, [%[b_ptr0], #0x30]\n" - "ldr q22, [%[b_ptr0], #0x40]\n" - "ldr q23, [%[b_ptr0], #0x50]\n" - "ldr q24, [%[b_ptr0], #0x60]\n" - "ldr q25, [%[b_ptr0], #0x70]\n" "b.eq 7f\n" "8:\n" "str q26, [%[c_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "movi v26.4s, #0\n" "subs %[loops], %[loops], #0x1\n" - "str q27, [c_ptr1]\n" + "movi v26.4s, #0\n" + "ldr q18, [%[b_ptr0]]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v27.4s, #0\n" + "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + "ldr q22, [%[b_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" + "ldr q24, [%[b_ptr0], #0x60]\n" + ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" - "ldr q18, [%[b_ptr0]]\n" - ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" - ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" - "prfm 
PSTL1KEEP, [c_ptr4, #0x40]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n" "ldr q19, [%[b_ptr0], #0x10]\n" ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n" "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n" - "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n" ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n" ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n" ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n" ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n" ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n" - "ldr q21, [%[b_ptr0], #0x30]\n" ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n" ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n" ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n" ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n" ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n" ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n" - "ldr q22, [%[b_ptr0], #0x40]\n" ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n" ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n" ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n" ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n" ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n" ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n" - "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n" ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n" ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n" ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n" ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n" ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n" - "ldr q24, [%[b_ptr0], #0x60]\n" ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n" ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n" ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n" ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n" ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n" ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n" - "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n" ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n" ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n" ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n" ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n" ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n" - "ldr q18, [%[b_ptr0]]\n" ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n" ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n" ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n" ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n" ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n" ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n" - "ldr q19, [%[b_ptr0], #0x10]\n" "b.ne 8b\n" "7:\n" "str q26, [%[c_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "movi v26.4s, #0\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" + "movi 
v26.4s, #0\n" + "ldr q18, [%[b_ptr0]]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + "ldr q22, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" + "ldr q24, [%[b_ptr0], #0x60]\n" + ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" + "add c_ptr4, c_ptr4, #0x10\n" + ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" + "ldr q18, [%[b_ptr0]]\n" + ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" + ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n" + ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" + ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n" + ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n" + ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n" + ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n" + ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n" + ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n" + ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n" + ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n" + ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n" + ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n" + ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n" + ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n" + ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n" + ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n" + ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n" + ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n" + ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n" + ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n" + ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n" + ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n" + ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n" + ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n" + ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n" + ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n" + ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n" + ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n" + ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n" + ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n" + ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n" + ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n" + ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n" + ".inst 0x4fb0eb3f 
// sdot v31.4s, v25.16b, v16.4b[3]\n" + ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n" + ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n" + ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n" + ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n" + ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n" + ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n" + ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n" + ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n" + ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n" + ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n" + ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n" + ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n" + "b 9f\n" + "6:\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" + ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" + ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" - ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" "ldr q18, [%[b_ptr0]]\n" + ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" @@ -799,19 +953,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n" ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n" ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n" - "6:\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -924,38 +1073,40 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "ld1 {v14.b}[10], [a_ptr4]\n" "ld1 {v17.b}[10], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q18, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q19, [%[b_ptr0], #0x10]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "ldr q20, [%[b_ptr0], #0x20]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "ldr q21, [%[b_ptr0], #0x30]\n" - "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" "ldr q22, [%[b_ptr0], #0x40]\n" - "movi v31.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" "ldr q23, [%[b_ptr0], #0x50]\n" - ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" "ldr q24, [%[b_ptr0], #0x60]\n" - ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" "ldr q25, [%[b_ptr0], #0x70]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" ".inst 0x4f89e25d // sdot 
v29.4s, v18.16b, v9.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" - "ldr q18, [%[b_ptr0]]\n" ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n" ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n" "ldr q19, [%[b_ptr0], #0x10]\n" @@ -1015,62 +1166,60 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n" ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n" ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n" - "cbz %[loops], 6f\n" - "ldr q18, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr q19, [%[b_ptr0], #0x10]\n" - "ldr q20, [%[b_ptr0], #0x20]\n" - "ldr q21, [%[b_ptr0], #0x30]\n" - "ldr q22, [%[b_ptr0], #0x40]\n" - "ldr q23, [%[b_ptr0], #0x50]\n" - "ldr q24, [%[b_ptr0], #0x60]\n" - "ldr q25, [%[b_ptr0], #0x70]\n" "b.eq 7f\n" "8:\n" "str q26, [%[c_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "movi v26.4s, #0\n" "subs %[loops], %[loops], #0x1\n" - "str q27, [c_ptr1]\n" + "movi v26.4s, #0\n" + "ldr q18, [%[b_ptr0]]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v27.4s, #0\n" + "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + "ldr q22, [%[b_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" + "ldr q24, [%[b_ptr0], #0x60]\n" + ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" - "ldr q18, [%[b_ptr0]]\n" - ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" - ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n" "ldr q19, [%[b_ptr0], #0x10]\n" ".inst 
0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n" ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n" "ldr q20, [%[b_ptr0], #0x20]\n" @@ -1081,85 +1230,163 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n" ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n" ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n" - "ldr q21, [%[b_ptr0], #0x30]\n" ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n" ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n" ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n" ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n" ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n" ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n" - "ldr q22, [%[b_ptr0], #0x40]\n" ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n" ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n" ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n" ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n" ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n" ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n" - "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n" ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n" ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n" ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n" ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n" ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n" - "ldr q24, [%[b_ptr0], #0x60]\n" ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n" ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n" ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n" ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n" ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n" ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n" - "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n" ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n" ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n" ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n" ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n" ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n" - "ldr q18, [%[b_ptr0]]\n" ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n" ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n" ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n" ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n" ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n" ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n" - "ldr q19, [%[b_ptr0], #0x10]\n" ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n" ".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n" ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n" ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n" ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n" ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n" - "ldr q20, [%[b_ptr0], #0x20]\n" "b.ne 8b\n" "7:\n" "str q26, [%[c_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "movi v26.4s, #0\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" + "movi v26.4s, #0\n" + "ldr q18, 
[%[b_ptr0]]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + "ldr q22, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" + "ldr q24, [%[b_ptr0], #0x60]\n" + ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" + ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" + "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" + "ldr q18, [%[b_ptr0]]\n" + ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" + ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n" + ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" + ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n" + ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n" + ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n" + ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n" + ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n" + ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n" + "ldr q20, [%[b_ptr0], #0x20]\n" + ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n" + "add %[b_ptr0], %[b_ptr0], #0x30\n" + ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n" + ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n" + ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n" + ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n" + ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n" + ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n" + ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n" + ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n" + ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n" + ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n" + ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n" + ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n" + ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n" + ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n" + ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n" + ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n" + ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n" + ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n" + ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n" + ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n" + ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n" + ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n" + ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n" + ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n" + ".inst 0x4fadeb3e // sdot v30.4s, 
v25.16b, v13.4b[3]\n" + ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n" + ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n" + ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n" + ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n" + ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n" + ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n" + ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n" + ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n" + ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n" + ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n" + ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n" + ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n" + ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n" + ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n" + ".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n" + ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n" + ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n" + ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n" + ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n" + "b 9f\n" + "6:\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" + ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" + ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" - ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" "ldr q18, [%[b_ptr0]]\n" + ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" @@ -1222,19 +1449,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n" ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n" ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n" - "6:\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -1293,6 +1515,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q3, [a_ptr1], #0x10\n" "ldr q6, [a_ptr2], #0x10\n" @@ -1305,7 +1528,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "ldr q10, [a_ptr3], #0x10\n" "ldr q13, [a_ptr4], #0x10\n" "ldr q16, [a_ptr5], #0x10\n" - "cbnz %[odds], 2f\n" "ldr q2, [%[a_ptr0]]\n" "ldr q5, [a_ptr1]\n" "ldr q8, [a_ptr2]\n" @@ -1314,8 +1536,21 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "ldr q17, [a_ptr5]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" + "subs %[odds], %[odds], #0x1\n" + "ldr q3, [a_ptr1], #0x10\n" + "ldr q6, [a_ptr2], #0x10\n" + "ldr q9, [a_ptr3], #0x10\n" + "ldr q12, [a_ptr4], #0x10\n" + "ldr q15, [a_ptr5], #0x10\n" + "ldr q1, [%[a_ptr0]], #0x10\n" + "ldr q4, 
[a_ptr1], #0x10\n" + "ldr q7, [a_ptr2], #0x10\n" + "ldr q10, [a_ptr3], #0x10\n" "ldr d2, [%[a_ptr0]], #0x8\n" + "ldr q13, [a_ptr4], #0x10\n" "ldr d5, [a_ptr1], #0x8\n" + "ldr q16, [a_ptr5], #0x10\n" "ldr d8, [a_ptr2], #0x8\n" "ldr d11, [a_ptr3], #0x8\n" "ldr d14, [a_ptr4], #0x8\n" @@ -1326,7 +1561,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "ld1 {v11.s}[2], [a_ptr3], #4\n" "ld1 {v14.s}[2], [a_ptr4], #4\n" "ld1 {v17.s}[2], [a_ptr5], #4\n" - "subs %[odds], %[odds], #0x1\n" "b.ne 4f\n" "ld1 {v2.b}[12], [%[a_ptr0]]\n" "ld1 {v5.b}[12], [a_ptr1]\n" @@ -1353,38 +1587,40 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "ld1 {v14.b}[14], [a_ptr4]\n" "ld1 {v17.b}[14], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q18, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q19, [%[b_ptr0], #0x10]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "ldr q20, [%[b_ptr0], #0x20]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "ldr q21, [%[b_ptr0], #0x30]\n" - "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" "ldr q22, [%[b_ptr0], #0x40]\n" - "movi v31.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" "ldr q23, [%[b_ptr0], #0x50]\n" - ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" "ldr q24, [%[b_ptr0], #0x60]\n" - ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" "ldr q25, [%[b_ptr0], #0x70]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" - "ldr q18, [%[b_ptr0]]\n" ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n" ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n" "ldr q19, [%[b_ptr0], #0x10]\n" @@ -1451,62 +1687,60 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, ".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n" ".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n" ".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n" - "cbz %[loops], 6f\n" - "ldr q18, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr q19, [%[b_ptr0], #0x10]\n" - "ldr q20, [%[b_ptr0], #0x20]\n" - "ldr q21, [%[b_ptr0], #0x30]\n" - "ldr q22, [%[b_ptr0], #0x40]\n" - "ldr q23, [%[b_ptr0], #0x50]\n" - "ldr q24, [%[b_ptr0], #0x60]\n" - "ldr q25, [%[b_ptr0], #0x70]\n" "b.eq 7f\n" "8:\n" "str q26, [%[c_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "movi v26.4s, #0\n" "subs %[loops], %[loops], #0x1\n" - "str q27, [c_ptr1]\n" + "movi v26.4s, #0\n" + "ldr q18, [%[b_ptr0]]\n" + "ldr 
q19, [%[b_ptr0], #0x10]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v27.4s, #0\n" + "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + "ldr q22, [%[b_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" + "ldr q24, [%[b_ptr0], #0x60]\n" + ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" - "ldr q18, [%[b_ptr0]]\n" - ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" - ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n" "ldr q19, [%[b_ptr0], #0x10]\n" ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n" ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n" "ldr q20, [%[b_ptr0], #0x20]\n" @@ -1524,87 +1758,87 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n" ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n" ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n" - "ldr q22, [%[b_ptr0], #0x40]\n" ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n" ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n" ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n" ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n" ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n" ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n" - "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n" ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n" ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n" ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n" ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n" ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n" - "ldr q24, [%[b_ptr0], #0x60]\n" ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n" ".inst 
0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n" ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n" ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n" ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n" ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n" - "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n" ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n" ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n" ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n" ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n" ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n" - "ldr q18, [%[b_ptr0]]\n" ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n" ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n" ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n" ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n" ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n" ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n" - "ldr q19, [%[b_ptr0], #0x10]\n" ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n" ".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n" ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n" ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n" ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n" ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n" - "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x4fa2eaba // sdot v26.4s, v21.16b, v2.4b[3]\n" ".inst 0x4fa5eabb // sdot v27.4s, v21.16b, v5.4b[3]\n" ".inst 0x4fa8eabc // sdot v28.4s, v21.16b, v8.4b[3]\n" ".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n" ".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n" ".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n" - "ldr q21, [%[b_ptr0], #0x30]\n" "b.ne 8b\n" "7:\n" "str q26, [%[c_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "movi v26.4s, #0\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" + "movi v26.4s, #0\n" + "ldr q18, [%[b_ptr0]]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + "ldr q22, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" + "ldr q24, [%[b_ptr0], #0x60]\n" + ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" - ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" - "ldr q18, [%[b_ptr0]]\n" - ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n" ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n" @@ -1672,47 +1906,127 @@ 
void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, ".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n" ".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n" ".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n" + "b 9f\n" "6:\n" - "str q26, [%[c_ptr0]]\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" - "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" - "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" - "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" - "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" - "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [odds] "+r" (odds) - : [lda] "r" (ldab), [ldc] "r" (ldcb) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" - ); - break; - case 13: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "c_ptr1 .req X5\n" - "c_ptr2 .req X6\n" - "c_ptr3 .req X7\n" - "c_ptr4 .req X8\n" - "c_ptr5 .req X9\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n" + ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n" + ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n" + ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n" + ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n" + "ldr q18, [%[b_ptr0]]\n" + ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n" + ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n" + ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n" + ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n" + ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n" + ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" + ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n" + ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n" + ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n" + ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n" + ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n" + ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n" + "ldr q20, [%[b_ptr0], #0x20]\n" + ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n" + ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n" + ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n" + ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n" + ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n" + ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n" + ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n" + ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n" + ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n" + ".inst 
0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n" + ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n" + ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n" + ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n" + ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n" + ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n" + ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n" + ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n" + ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n" + ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n" + ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n" + ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n" + ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n" + ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n" + ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n" + ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n" + ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n" + ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n" + ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n" + ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n" + ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n" + ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n" + ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n" + ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n" + ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n" + ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n" + ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n" + ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n" + ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n" + ".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n" + ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n" + ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n" + ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n" + ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n" + ".inst 0x4fa2eaba // sdot v26.4s, v21.16b, v2.4b[3]\n" + ".inst 0x4fa5eabb // sdot v27.4s, v21.16b, v5.4b[3]\n" + ".inst 0x4fa8eabc // sdot v28.4s, v21.16b, v8.4b[3]\n" + ".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n" + ".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n" + ".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n" + "9:\n" + "str q26, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], #0x10\n" + "str q27, [c_ptr1]\n" + "str q28, [c_ptr2]\n" + "str q29, [c_ptr3]\n" + "str q30, [c_ptr4]\n" + "str q31, [c_ptr5]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [odds] "+r" (odds) + : [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" + ); + break; + case 13: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "c_ptr1 .req X5\n" + "c_ptr2 .req X6\n" + "c_ptr3 .req X7\n" + 
"c_ptr4 .req X8\n" + "c_ptr5 .req X9\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" "add c_ptr1, %[c_ptr0], %[ldc]\n" "add a_ptr2, a_ptr1, %[lda]\n" "add c_ptr2, c_ptr1, %[ldc]\n" @@ -1743,6 +2057,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q4, [a_ptr1], #0x10\n" "ldr q8, [a_ptr2], #0x10\n" @@ -1759,18 +2074,35 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "ldr q6, [a_ptr1], #0x10\n" "ldr q10, [a_ptr2], #0x10\n" "ldr q14, [a_ptr3], #0x10\n" - "ldr q18, [a_ptr4], #0x10\n" - "ldr q22, [a_ptr5], #0x10\n" - "cbnz %[odds], 2f\n" "ldr s3, [%[a_ptr0]]\n" + "ldr q18, [a_ptr4], #0x10\n" "ldr s7, [a_ptr1]\n" + "ldr q22, [a_ptr5], #0x10\n" "ldr s11, [a_ptr2]\n" "ldr s15, [a_ptr3]\n" "ldr s19, [a_ptr4]\n" "ldr s23, [a_ptr5]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" "subs %[odds], %[odds], #0x1\n" + "ldr q4, [a_ptr1], #0x10\n" + "ldr q8, [a_ptr2], #0x10\n" + "ldr q12, [a_ptr3], #0x10\n" + "ldr q16, [a_ptr4], #0x10\n" + "ldr q20, [a_ptr5], #0x10\n" + "ldr q1, [%[a_ptr0]], #0x10\n" + "ldr q5, [a_ptr1], #0x10\n" + "ldr q9, [a_ptr2], #0x10\n" + "ldr q13, [a_ptr3], #0x10\n" + "ldr q17, [a_ptr4], #0x10\n" + "ldr q21, [a_ptr5], #0x10\n" + "ldr q2, [%[a_ptr0]], #0x10\n" + "ldr q6, [a_ptr1], #0x10\n" + "ldr q10, [a_ptr2], #0x10\n" + "ldr q14, [a_ptr3], #0x10\n" + "ldr q18, [a_ptr4], #0x10\n" + "ldr q22, [a_ptr5], #0x10\n" "b.ne 4f\n" "ldr b3, [%[a_ptr0]]\n" "ldr b7, [a_ptr1]\n" @@ -1797,24 +2129,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "ld1 {v19.b}[2], [a_ptr4]\n" "ld1 {v23.b}[2], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q24, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q25, [%[b_ptr0], #0x10]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n" @@ -1907,38 +2241,36 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n" ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n" ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n" - "cbz %[loops], 6f\n" - "ldr q24, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr q25, [%[b_ptr0], #0x10]\n" "b.eq 7f\n" "8:\n" "str q26, [%[c_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - "movi v26.4s, #0\n" "subs %[loops], %[loops], #0x1\n" - "str q27, [c_ptr1]\n" + "movi v26.4s, #0\n" + "ldr q24, [%[b_ptr0]]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" "add 
%[c_ptr0], %[c_ptr0], #0x10\n" - "movi v27.4s, #0\n" + "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "add c_ptr3, c_ptr3, #0x10\n" ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n" "ldr q24, [%[b_ptr0]]\n" ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n" @@ -2028,20 +2360,20 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n" ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n" ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n" - "ldr q25, [%[b_ptr0], #0x10]\n" ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n" ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n" ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n" ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n" ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n" ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n" - "ldr q24, [%[b_ptr0]]\n" "b.ne 8b\n" "7:\n" "str q26, [%[c_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - "movi v26.4s, #0\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" + "movi v26.4s, #0\n" + "ldr q24, [%[b_ptr0]]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" @@ -2152,19 +2484,117 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n" ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n" ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n" + "b 9f\n" "6:\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" + ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" + ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" + ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" + ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n" + ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n" + ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n" + ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n" + ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n" + ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n" + 
".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n" + ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n" + ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n" + ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n" + ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n" + ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n" + ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n" + ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n" + ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n" + ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n" + ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n" + ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n" + ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n" + ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n" + ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n" + ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n" + ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n" + ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n" + ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n" + ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n" + ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n" + ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n" + ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n" + ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n" + ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n" + ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n" + ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n" + ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n" + ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n" + ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n" + ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n" + ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n" + ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n" + ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n" + ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" + ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n" + ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n" + ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, 
v14.4b[3]\n" + ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n" + ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n" + ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n" + ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n" + ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n" + ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n" + ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n" + ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -2223,6 +2653,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q4, [a_ptr1], #0x10\n" "ldr q8, [a_ptr2], #0x10\n" @@ -2239,24 +2670,41 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "ldr q6, [a_ptr1], #0x10\n" "ldr q10, [a_ptr2], #0x10\n" "ldr q14, [a_ptr3], #0x10\n" - "ldr q18, [a_ptr4], #0x10\n" - "ldr q22, [a_ptr5], #0x10\n" - "cbnz %[odds], 2f\n" "ldr d3, [%[a_ptr0]]\n" + "ldr q18, [a_ptr4], #0x10\n" "ldr d7, [a_ptr1]\n" + "ldr q22, [a_ptr5], #0x10\n" "ldr d11, [a_ptr2]\n" "ldr d15, [a_ptr3]\n" "ldr d19, [a_ptr4]\n" "ldr d23, [a_ptr5]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" + "subs %[odds], %[odds], #0x1\n" + "ldr q4, [a_ptr1], #0x10\n" + "ldr q8, [a_ptr2], #0x10\n" + "ldr q12, [a_ptr3], #0x10\n" + "ldr q16, [a_ptr4], #0x10\n" + "ldr q20, [a_ptr5], #0x10\n" + "ldr q1, [%[a_ptr0]], #0x10\n" + "ldr q5, [a_ptr1], #0x10\n" + "ldr q9, [a_ptr2], #0x10\n" + "ldr q13, [a_ptr3], #0x10\n" + "ldr q17, [a_ptr4], #0x10\n" + "ldr q21, [a_ptr5], #0x10\n" + "ldr q2, [%[a_ptr0]], #0x10\n" + "ldr q6, [a_ptr1], #0x10\n" + "ldr q10, [a_ptr2], #0x10\n" + "ldr q14, [a_ptr3], #0x10\n" "ldr s3, [%[a_ptr0]], #0x4\n" + "ldr q18, [a_ptr4], #0x10\n" "ldr s7, [a_ptr1], #0x4\n" + "ldr q22, [a_ptr5], #0x10\n" "ldr s11, [a_ptr2], #0x4\n" "ldr s15, [a_ptr3], #0x4\n" "ldr s19, [a_ptr4], #0x4\n" "ldr s23, [a_ptr5], #0x4\n" - "subs %[odds], %[odds], #0x1\n" "b.ne 4f\n" "ld1 {v3.b}[4], [%[a_ptr0]]\n" "ld1 {v7.b}[4], [a_ptr1]\n" @@ -2283,33 +2731,167 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "ld1 {v19.b}[6], [a_ptr4]\n" "ld1 {v23.b}[6], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q24, [%[b_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" + ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" + ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" + ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" + ".inst 0x4f94e31f // sdot v31.4s, v24.16b, 
v20.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n" + ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n" + ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n" + ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n" + ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n" + ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n" + ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n" + ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n" + ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n" + ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n" + ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n" + ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n" + ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n" + ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n" + ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n" + ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n" + ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n" + ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n" + ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n" + ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n" + ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n" + ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n" + ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n" + ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n" + ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n" + ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n" + ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n" + ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n" + ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n" + ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n" + ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n" + ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n" + ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n" + ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n" + ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n" + ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n" + ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n" + "ldr q25, 
[%[b_ptr0], #0x10]\n" + ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n" + ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n" + ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n" + ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n" + ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n" + ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n" + ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n" + ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n" + ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n" + ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n" + ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n" + ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n" + ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n" + ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n" + ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n" + ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n" + ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n" + ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n" + ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n" + ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n" + "b.eq 7f\n" + "8:\n" + "str q26, [%[c_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "movi v26.4s, #0\n" + "ldr q24, [%[b_ptr0]]\n" "ldr q25, [%[b_ptr0], #0x10]\n" - "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" - "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" - "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" - "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "add %[c_ptr0], %[c_ptr0], #0x10\n" + "str q27, [c_ptr1]\n" + "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" + "str q28, [c_ptr2]\n" + "movi v28.4s, #0\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" + "str q29, [c_ptr3]\n" + "movi v29.4s, #0\n" + "add c_ptr3, c_ptr3, #0x10\n" ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" + "str q30, [c_ptr4]\n" + "movi v30.4s, #0\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" + "str q31, [c_ptr5]\n" + "movi v31.4s, #0\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n" "ldr q24, [%[b_ptr0]]\n" ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n" "ldr q25, [%[b_ptr0], #0x10]\n" ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n" @@ -2400,50 
+2982,41 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n" ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n" ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n" - "cbz %[loops], 6f\n" - "ldr q24, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "b.eq 7f\n" - "8:\n" + "b.ne 8b\n" + "7:\n" "str q26, [%[c_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" + "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v26.4s, #0\n" + "ldr q24, [%[b_ptr0]]\n" "ldr q25, [%[b_ptr0], #0x10]\n" "add %[b_ptr0], %[b_ptr0], #0x20\n" "str q27, [c_ptr1]\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v27.4s, #0\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "add c_ptr3, c_ptr3, #0x10\n" ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" + ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n" ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n" "ldr q24, [%[b_ptr0]]\n" - ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n" "ldr q25, [%[b_ptr0], #0x10]\n" ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n" @@ -2528,43 +3101,28 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n" ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n" ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n" - "ldr q24, [%[b_ptr0]]\n" ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n" ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n" ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n" ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n" ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n" ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n" - "b.ne 8b\n" - "7:\n" - "str q26, [%[c_ptr0]]\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" + "b 9f\n" + "6:\n" "movi v26.4s, #0\n" - "ldr q25, [%[b_ptr0], #0x10]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" - ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" - "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" - "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" - "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, 
c_ptr4, #0x10\n" - ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" - "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" + ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" + ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" - ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n" ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n" "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n" ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n" ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n" ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n" @@ -2659,19 +3217,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n" ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n" ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n" - "6:\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -2790,33 +3343,175 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "ld1 {v19.b}[10], [a_ptr4]\n" "ld1 {v23.b}[10], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q24, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q25, [%[b_ptr0], #0x10]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" + ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" + ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" + ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n" + ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n" + ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n" + ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n" + ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n" "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n" + ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n" + ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n" + ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n" 
+ "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n" + ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n" + ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n" + ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n" + ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n" + ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n" + ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n" + ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n" + ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n" + ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n" + ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n" + ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n" + ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n" + ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n" + ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n" + ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n" + ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n" + ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n" + ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n" + ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n" + ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n" + ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n" + ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n" + ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n" + ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n" + ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n" + ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n" + ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n" + ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n" + ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n" + ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n" + ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n" + ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n" + ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n" + ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n" + ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n" + ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n" + ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n" + ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n" + "ldr q25, 
[%[b_ptr0], #0x10]\n" + ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n" + ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n" + ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n" + ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n" + ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" + ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n" + ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n" + ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n" + ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n" + ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n" + ".inst 0x4f83eb1a // sdot v26.4s, v24.16b, v3.4b[2]\n" + ".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n" + ".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n" + ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n" + ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n" + ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n" + "b.eq 7f\n" + "8:\n" + "str q26, [%[c_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "movi v26.4s, #0\n" + "ldr q24, [%[b_ptr0]]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + "add %[c_ptr0], %[c_ptr0], #0x10\n" + "str q27, [c_ptr1]\n" + "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" + "str q28, [c_ptr2]\n" + "movi v28.4s, #0\n" + "add c_ptr2, c_ptr2, #0x10\n" + ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" + "str q29, [c_ptr3]\n" + "movi v29.4s, #0\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" + "str q30, [c_ptr4]\n" + "movi v30.4s, #0\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" + "str q31, [c_ptr5]\n" + "movi v31.4s, #0\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n" "ldr q24, [%[b_ptr0]]\n" ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n" "ldr q25, [%[b_ptr0], #0x10]\n" ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n" @@ -2915,50 +3610,41 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n" ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n" ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n" - "cbz %[loops], 6f\n" + "b.ne 8b\n" + "7:\n" + "str q26, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], #0x10\n" + "movi v26.4s, #0\n" "ldr q24, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" "ldr q25, [%[b_ptr0], #0x10]\n" - "b.eq 7f\n" - "8:\n" - "str q26, [%[c_ptr0]]\n" "add %[b_ptr0], %[b_ptr0], #0x20\n" - "movi v26.4s, #0\n" - "subs %[loops], %[loops], #0x1\n" "str q27, [c_ptr1]\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v27.4s, #0\n" "add c_ptr1, 
c_ptr1, #0x10\n" + "movi v27.4s, #0\n" ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "add c_ptr3, c_ptr3, #0x10\n" ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" + ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n" ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n" "ldr q24, [%[b_ptr0]]\n" - ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n" "ldr q25, [%[b_ptr0], #0x10]\n" ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n" @@ -3051,43 +3737,28 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n" ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n" ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n" - "ldr q25, [%[b_ptr0], #0x10]\n" ".inst 0x4f83eb1a // sdot v26.4s, v24.16b, v3.4b[2]\n" ".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n" ".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n" ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n" ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n" ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n" - "ldr q24, [%[b_ptr0]]\n" - "b.ne 8b\n" - "7:\n" - "str q26, [%[c_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" + "b 9f\n" + "6:\n" "movi v26.4s, #0\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" - "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" - ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" - "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" - "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" - "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" - "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" + ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" + ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" - ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n" ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n" "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n" ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n" ".inst 0x4fa8e33c // 
sdot v28.4s, v25.16b, v8.4b[1]\n" ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n" @@ -3190,19 +3861,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n" ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n" ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n" - "6:\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -3262,6 +3928,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q4, [a_ptr1], #0x10\n" "ldr q8, [a_ptr2], #0x10\n" @@ -3280,7 +3947,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "ldr q14, [a_ptr3], #0x10\n" "ldr q18, [a_ptr4], #0x10\n" "ldr q22, [a_ptr5], #0x10\n" - "cbnz %[odds], 2f\n" "ldr q3, [%[a_ptr0]]\n" "ldr q7, [a_ptr1]\n" "ldr q11, [a_ptr2]\n" @@ -3289,8 +3955,27 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "ldr q23, [a_ptr5]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" + "subs %[odds], %[odds], #0x1\n" + "ldr q4, [a_ptr1], #0x10\n" + "ldr q8, [a_ptr2], #0x10\n" + "ldr q12, [a_ptr3], #0x10\n" + "ldr q16, [a_ptr4], #0x10\n" + "ldr q20, [a_ptr5], #0x10\n" + "ldr q1, [%[a_ptr0]], #0x10\n" + "ldr q5, [a_ptr1], #0x10\n" + "ldr q9, [a_ptr2], #0x10\n" + "ldr q13, [a_ptr3], #0x10\n" + "ldr q17, [a_ptr4], #0x10\n" + "ldr q21, [a_ptr5], #0x10\n" + "ldr q2, [%[a_ptr0]], #0x10\n" + "ldr q6, [a_ptr1], #0x10\n" + "ldr q10, [a_ptr2], #0x10\n" + "ldr q14, [a_ptr3], #0x10\n" "ldr d3, [%[a_ptr0]], #0x8\n" + "ldr q18, [a_ptr4], #0x10\n" "ldr d7, [a_ptr1], #0x8\n" + "ldr q22, [a_ptr5], #0x10\n" "ldr d11, [a_ptr2], #0x8\n" "ldr d15, [a_ptr3], #0x8\n" "ldr d19, [a_ptr4], #0x8\n" @@ -3301,7 +3986,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "ld1 {v15.s}[2], [a_ptr3], #4\n" "ld1 {v19.s}[2], [a_ptr4], #4\n" "ld1 {v23.s}[2], [a_ptr5], #4\n" - "subs %[odds], %[odds], #0x1\n" "b.ne 4f\n" "ld1 {v3.b}[12], [%[a_ptr0]]\n" "ld1 {v7.b}[12], [a_ptr1]\n" @@ -3328,24 +4012,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "ld1 {v19.b}[14], [a_ptr4]\n" "ld1 {v23.b}[14], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q24, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q25, [%[b_ptr0], #0x10]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, 
[a_ptr5, #0x180]\n" ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n" @@ -3460,38 +4146,36 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, ".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n" ".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n" ".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n" - "cbz %[loops], 6f\n" - "ldr q24, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" "b.eq 7f\n" "8:\n" "str q26, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v26.4s, #0\n" + "ldr q24, [%[b_ptr0]]\n" "ldr q25, [%[b_ptr0], #0x10]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - "str q27, [c_ptr1]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v27.4s, #0\n" + "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "add c_ptr3, c_ptr3, #0x10\n" ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n" "ldr q24, [%[b_ptr0]]\n" ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n" @@ -3603,7 +4287,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n" ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n" ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n" - "ldr q24, [%[b_ptr0]]\n" ".inst 0x4fa3eb3a // sdot v26.4s, v25.16b, v3.4b[3]\n" ".inst 0x4fa7eb3b // sdot v27.4s, v25.16b, v7.4b[3]\n" ".inst 0x4fabeb3c // sdot v28.4s, v25.16b, v11.4b[3]\n" @@ -3615,6 +4298,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v26.4s, #0\n" + "ldr q24, [%[b_ptr0]]\n" "ldr q25, [%[b_ptr0], #0x10]\n" "add %[b_ptr0], %[b_ptr0], #0x20\n" "str q27, [c_ptr1]\n" @@ -3749,19 +4433,139 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, ".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n" ".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n" ".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n" + "b 9f\n" "6:\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n" + ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n" + ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n" + ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n" + ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n" + ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa0e33a 
// sdot v26.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n" + ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n" + ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n" + ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n" + ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n" + ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n" + ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n" + ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n" + ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n" + ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n" + ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n" + ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n" + ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n" + ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n" + ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n" + ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n" + ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n" + ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n" + ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n" + ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n" + ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n" + ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n" + ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n" + ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n" + ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n" + ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n" + ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n" + ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n" + ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n" + ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n" + ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n" + ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n" + ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n" + ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n" + ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n" + ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n" + ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f82eb1a // sdot v26.4s, 
v24.16b, v2.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n" + ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n" + ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n" + ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n" + ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n" + ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n" + ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n" + ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n" + ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n" + ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n" + ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n" + ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n" + ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n" + ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n" + ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n" + ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n" + ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n" + ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n" + ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x4f83eb1a // sdot v26.4s, v24.16b, v3.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n" + ".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n" + ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n" + ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n" + ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n" + ".inst 0x4fa3eb3a // sdot v26.4s, v25.16b, v3.4b[3]\n" + ".inst 0x4fa7eb3b // sdot v27.4s, v25.16b, v7.4b[3]\n" + ".inst 0x4fabeb3c // sdot v28.4s, v25.16b, v11.4b[3]\n" + ".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n" + ".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n" + ".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp similarity index 87% rename from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp rename to src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp index 3de708cc68..9f9c2a49db 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp @@ -31,10 +31,10 @@ namespace arm_gemm { // Actual kernel implementations -void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); -void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, 
const int32_t *, Activation, bool); +void a64_smallK_hybrid_s8s32_dot_8x4(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); +void a64_smallK_hybrid_s8s32_dot_8x4_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); -class smallK_hybrid_s8s32_dot_4x8 +class cls_a64_smallK_hybrid_s8s32_dot_8x4 { public: typedef int8_t operand_type; @@ -76,12 +76,12 @@ class smallK_hybrid_s8s32_dot_4x8 StdTransformsFixed transforms = {}; // Default to the generic kernel - kern_type kernel=a64_smallK_hybrid_s8s32_dot_4x8; + kern_type kernel=a64_smallK_hybrid_s8s32_dot_8x4; - smallK_hybrid_s8s32_dot_4x8(const CPUInfo *ci) + cls_a64_smallK_hybrid_s8s32_dot_8x4(const CPUInfo *ci) { if (ci->get_cpu_model() == CPUModel::A55r1) { - kernel = a64_smallK_hybrid_s8s32_dot_4x8_a55; + kernel = a64_smallK_hybrid_s8s32_dot_8x4_a55; } } }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp similarity index 85% rename from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/a55.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp index 7135f2eee6..aba6e0d100 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/a55.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -33,7 +33,7 @@ namespace arm_gemm { -void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) { +void a64_smallK_hybrid_s8s32_dot_8x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) { const long loops_count = iceildiv(N, (int)4) - 1; const long ldab = lda * sizeof(int8_t); const long ldcb = ldc * sizeof(int32_t); @@ -157,22 +157,24 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t "ld1 {v6.b}[2], [a_ptr6]\n" "ld1 {v7.b}[2], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" - "movi v26.4s, #0\n" "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" - "movi v27.4s, #0\n" "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" - "movi v28.4s, #0\n" "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" - "movi v29.4s, #0\n" "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" - "movi v30.4s, #0\n" "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" - "movi v31.4s, #0\n" "add %[b_ptr0], %[b_ptr0], #0x10\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" @@ -181,55 +183,49 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n" ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" - "cbz %[loops], 6f\n" - "ldr d16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - 
"add %[b_ptr0], %[b_ptr0], #0x10\n" - "ins v16.d[1], temploadreg0\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" + "ldr d16, [%[b_ptr0]]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" - ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" + "ins v16.d[1], temploadreg0\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" "str q26, [c_ptr2]\n" - "movi v26.4s, #0\n" "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" + "movi v26.4s, #0\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" + ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" + ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" "add c_ptr5, c_ptr5, #0x10\n" - ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n" + ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n" + ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" "add c_ptr7, c_ptr7, #0x10\n" - ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" + ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n" "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" - ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" - "ldr d16, [%[b_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" + ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" - "ins v16.d[1], temploadreg0\n" "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" @@ -239,6 +235,8 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" + "ldr q16, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" @@ -268,23 +266,34 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" + ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" + ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" + ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n" + ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n" + ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" + ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, 
c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -423,24 +432,26 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t "ld1 {v6.b}[6], [a_ptr6]\n" "ld1 {v7.b}[6], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" "movi v26.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "movi v27.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" @@ -456,78 +467,72 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n" ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n" ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n" - "cbz %[loops], 6f\n" - "ldr d16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - "ins v16.d[1], temploadreg0\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" - "ins v17.d[1], temploadreg1\n" + "ldr d16, [%[b_ptr0]]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr d17, [%[b_ptr0], #0x10]\n" "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" "str q26, [c_ptr2]\n" + "add c_ptr2, c_ptr2, #0x10\n" "movi v26.4s, #0\n" + "ins v16.d[1], temploadreg0\n" + "ins v17.d[1], temploadreg1\n" "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" - ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" "str q27, [c_ptr3]\n" + "add c_ptr3, c_ptr3, #0x10\n" "movi v27.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" + "add c_ptr4, c_ptr4, #0x10\n" + ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n" + "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" "str q30, 
[c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" - ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n" + "add c_ptr6, c_ptr6, #0x10\n" + ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" "add c_ptr7, c_ptr7, #0x10\n" + ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" - "ldr d16, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n" - "ins v16.d[1], temploadreg0\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" "b.ne 8b\n" "7:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" - "ins v17.d[1], temploadreg1\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" @@ -565,23 +570,42 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n" ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n" ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" + ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" + ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" + ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n" + ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n" + ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" + ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" + ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n" + ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n" + ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n" + ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n" + ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n" + ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n" + ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, 
c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -720,26 +744,28 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t "ld1 {v6.b}[10], [a_ptr6]\n" "ld1 {v7.b}[10], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ldr q18, [%[b_ptr0], #0x20]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x30\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" "movi v27.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x30\n" ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n" @@ -762,95 +788,86 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n" ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n" ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n" - "cbz %[loops], 6f\n" - "ldr d16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "ldr d18, [%[b_ptr0], #0x20]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "add %[b_ptr0], %[b_ptr0], #0x30\n" - "ins v16.d[1], temploadreg0\n" - "ins v17.d[1], temploadreg1\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" - "ins v18.d[1], temploadreg2\n" + "ldr d16, [%[b_ptr0]]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr d17, [%[b_ptr0], #0x10]\n" "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "str q26, [c_ptr2]\n" + "add c_ptr2, c_ptr2, #0x10\n" "movi v26.4s, #0\n" + "ldr d18, [%[b_ptr0], #0x20]\n" "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x30\n" "str q27, [c_ptr3]\n" - "movi v27.4s, #0\n" "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" + "movi v27.4s, #0\n" + "ins v16.d[1], temploadreg0\n" + "ins v17.d[1], temploadreg1\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "str q28, [c_ptr4]\n" - "movi v28.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" + "movi v28.4s, #0\n" + "ins v18.d[1], temploadreg2\n" + ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" 
"str q29, [c_ptr5]\n" "movi v29.4s, #0\n" "add c_ptr5, c_ptr5, #0x10\n" - ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n" + ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n" + ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" "add c_ptr7, c_ptr7, #0x10\n" + ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" - "ldr d16, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n" - "ins v16.d[1], temploadreg0\n" ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n" - "ins v17.d[1], temploadreg1\n" ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n" ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n" ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n" ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n" ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n" ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n" - "ldr d18, [%[b_ptr0], #0x20]\n" - "add %[b_ptr0], %[b_ptr0], #0x30\n" "b.ne 8b\n" "7:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" - "ins v18.d[1], temploadreg2\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" @@ -876,8 +893,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t "movi v31.4s, #0\n" "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" - ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" + "add %[b_ptr0], %[b_ptr0], #0x30\n" ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n" ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n" ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n" @@ -893,23 +911,50 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n" ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n" ".inst 0x4f87ea5f // sdot 
v31.4s, v18.16b, v7.4b[2]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" + ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" + ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" + ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n" + ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n" + ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" + ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" + ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n" + ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n" + ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n" + ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n" + ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n" + ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n" + ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n" + ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n" + ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n" + ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n" + ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n" + ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n" + ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n" + ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n" + ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -1056,28 +1101,30 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t "ld1 {v6.b}[14], [a_ptr6]\n" "ld1 {v7.b}[14], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ldr q18, [%[b_ptr0], #0x20]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ldr q19, [%[b_ptr0], #0x30]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n" ".inst 0x4f85e21d // sdot v29.4s, v16.16b, 
v5.4b[0]\n" @@ -1107,112 +1154,101 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n" ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n" ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n" - "cbz %[loops], 6f\n" - "ldr d16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "ldr d18, [%[b_ptr0], #0x20]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr d19, [%[b_ptr0], #0x30]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "ins v16.d[1], temploadreg0\n" - "ins v17.d[1], temploadreg1\n" - "ins v18.d[1], temploadreg2\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" - "ins v19.d[1], temploadreg3\n" + "ldr d16, [%[b_ptr0]]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr d17, [%[b_ptr0], #0x10]\n" "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "str q26, [c_ptr2]\n" + "add c_ptr2, c_ptr2, #0x10\n" "movi v26.4s, #0\n" + "ldr d18, [%[b_ptr0], #0x20]\n" "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "str q27, [c_ptr3]\n" - "movi v27.4s, #0\n" "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" + "movi v27.4s, #0\n" + "ldr d19, [%[b_ptr0], #0x30]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" "str q28, [c_ptr4]\n" - "movi v28.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" + "movi v28.4s, #0\n" + "ins v16.d[1], temploadreg0\n" + "ins v17.d[1], temploadreg1\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "str q29, [c_ptr5]\n" - "movi v29.4s, #0\n" "add c_ptr5, c_ptr5, #0x10\n" - ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n" + "movi v29.4s, #0\n" + "ins v18.d[1], temploadreg2\n" + ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" + "ins v19.d[1], temploadreg3\n" + ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n" + ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" "add c_ptr7, c_ptr7, #0x10\n" + ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" - "ldr d16, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n" - "ins v16.d[1], temploadreg0\n" ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, 
v3.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n" - "ins v17.d[1], temploadreg1\n" ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n" ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n" ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n" ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n" ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n" ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n" - "ldr d18, [%[b_ptr0], #0x20]\n" ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n" ".inst 0x4fa1ea79 // sdot v25.4s, v19.16b, v1.4b[3]\n" ".inst 0x4fa2ea7a // sdot v26.4s, v19.16b, v2.4b[3]\n" - "ins v18.d[1], temploadreg2\n" ".inst 0x4fa3ea7b // sdot v27.4s, v19.16b, v3.4b[3]\n" ".inst 0x4fa4ea7c // sdot v28.4s, v19.16b, v4.4b[3]\n" ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n" ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n" ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n" - "ldr d19, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" "b.ne 8b\n" "7:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" - "ins v19.d[1], temploadreg3\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" + "ldr q19, [%[b_ptr0], #0x30]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" "str q27, [c_ptr3]\n" @@ -1235,8 +1271,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t "movi v31.4s, #0\n" "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" - ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n" ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n" ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n" @@ -1260,23 +1297,58 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n" ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n" ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" + ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" + ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" + ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n" + ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n" + ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" + ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" + ".inst 
0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" + ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n" + ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n" + ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n" + ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n" + ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n" + ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n" + ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n" + ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n" + ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n" + ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n" + ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n" + ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n" + ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n" + ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n" + ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n" + ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n" + ".inst 0x4fa1ea79 // sdot v25.4s, v19.16b, v1.4b[3]\n" + ".inst 0x4fa2ea7a // sdot v26.4s, v19.16b, v2.4b[3]\n" + ".inst 0x4fa3ea7b // sdot v27.4s, v19.16b, v3.4b[3]\n" + ".inst 0x4fa4ea7c // sdot v28.4s, v19.16b, v4.4b[3]\n" + ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n" + ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n" + ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -1363,26 +1435,34 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q2, [a_ptr1], #0x10\n" "ldr q4, [a_ptr2], #0x10\n" "ldr q6, [a_ptr3], #0x10\n" - "ldr q8, [a_ptr4], #0x10\n" - "ldr q10, [a_ptr5], #0x10\n" - "ldr q12, [a_ptr6], #0x10\n" - "ldr q14, [a_ptr7], #0x10\n" - "cbnz %[odds], 2f\n" "ldr s1, [%[a_ptr0]]\n" + "ldr q8, [a_ptr4], #0x10\n" "ldr s3, [a_ptr1]\n" + "ldr q10, [a_ptr5], #0x10\n" "ldr s5, [a_ptr2]\n" + "ldr q12, [a_ptr6], #0x10\n" "ldr s7, [a_ptr3]\n" + "ldr q14, [a_ptr7], #0x10\n" "ldr s9, [a_ptr4]\n" "ldr s11, [a_ptr5]\n" "ldr s13, [a_ptr6]\n" "ldr s15, [a_ptr7]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" "subs %[odds], %[odds], #0x1\n" + "ldr q2, [a_ptr1], #0x10\n" + "ldr q4, [a_ptr2], #0x10\n" + "ldr q6, [a_ptr3], #0x10\n" + "ldr q8, [a_ptr4], #0x10\n" + "ldr q10, [a_ptr5], #0x10\n" + "ldr q12, [a_ptr6], #0x10\n" + "ldr q14, [a_ptr7], #0x10\n" "b.ne 4f\n" "ldr b1, [%[a_ptr0]]\n" "ldr b3, [a_ptr1]\n" @@ -1415,30 +1495,32 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t "ld1 {v13.b}[2], [a_ptr6]\n" "ld1 {v15.b}[2], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ldr q18, [%[b_ptr0], #0x20]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ldr q19, [%[b_ptr0], #0x30]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ldr q20, [%[b_ptr0], #0x40]\n" + 
"prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x50\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x50\n" ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" @@ -1475,126 +1557,113 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n" ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n" ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n" - "cbz %[loops], 6f\n" - "ldr d16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "ldr d18, [%[b_ptr0], #0x20]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr d19, [%[b_ptr0], #0x30]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "ldr d20, [%[b_ptr0], #0x40]\n" - "ins v16.d[1], temploadreg0\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "add %[b_ptr0], %[b_ptr0], #0x50\n" - "ins v17.d[1], temploadreg1\n" - "ins v18.d[1], temploadreg2\n" - "ins v19.d[1], temploadreg3\n" - "b.eq 7f\n" - "8:\n" - "str q24, [%[c_ptr0]]\n" + "b.eq 7f\n" + "8:\n" + "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" - "ins v20.d[1], temploadreg0\n" + "ldr d16, [%[b_ptr0]]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr d17, [%[b_ptr0], #0x10]\n" "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "str q26, [c_ptr2]\n" + "add c_ptr2, c_ptr2, #0x10\n" "movi v26.4s, #0\n" + "ldr d18, [%[b_ptr0], #0x20]\n" "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "str q27, [c_ptr3]\n" - "movi v27.4s, #0\n" "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" + "movi v27.4s, #0\n" + "ldr d19, [%[b_ptr0], #0x30]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "str q28, [c_ptr4]\n" - "movi v28.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" + "movi v28.4s, #0\n" + "ldr d20, [%[b_ptr0], #0x40]\n" + "ins v16.d[1], temploadreg0\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "str q29, [c_ptr5]\n" - "movi v29.4s, #0\n" "add c_ptr5, c_ptr5, #0x10\n" - ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" + "movi v29.4s, #0\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" 
- "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" + "ins v17.d[1], temploadreg1\n" + ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" + "ins v18.d[1], temploadreg2\n" + ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" + "ins v19.d[1], temploadreg3\n" + ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" + "ins v20.d[1], temploadreg0\n" + ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" + "add c_ptr6, c_ptr6, #0x10\n" + ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "add %[b_ptr0], %[b_ptr0], #0x50\n" ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" - "ldr d16, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" - "ins v16.d[1], temploadreg0\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n" - "ins v17.d[1], temploadreg1\n" ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n" ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n" ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n" ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n" ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n" - "ldr d18, [%[b_ptr0], #0x20]\n" ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n" ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n" ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n" - "ins v18.d[1], temploadreg2\n" ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n" ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n" ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n" ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n" ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n" - "ldr d19, [%[b_ptr0], #0x30]\n" ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n" ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n" ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n" - "ins v19.d[1], temploadreg3\n" ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n" ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n" ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n" ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n" ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n" - "ldr d20, [%[b_ptr0], #0x40]\n" - "add %[b_ptr0], %[b_ptr0], #0x50\n" "b.ne 8b\n" "7:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" - "ins v20.d[1], temploadreg0\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], 
#0x10]\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "ldr q20, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" "str q27, [c_ptr3]\n" @@ -1617,8 +1686,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t "movi v31.4s, #0\n" "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" - ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" + "add %[b_ptr0], %[b_ptr0], #0x50\n" ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n" ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n" @@ -1650,23 +1720,66 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n" ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n" ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" + ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" + ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" + ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" + ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" + ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" + ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" + ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" + ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n" + ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n" + ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n" + ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n" + ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n" + ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n" + ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n" + ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n" + ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n" + ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n" + ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n" + ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n" + ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n" + ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n" + ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n" + ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n" + ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n" + ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n" + ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n" + ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n" + ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n" + ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n" + ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n" + ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n" + ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n" + ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n" + ".inst 0x4f89e29c // sdot v28.4s, v20.16b, 
v9.4b[0]\n" + ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n" + ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n" + ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -1753,34 +1866,42 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q2, [a_ptr1], #0x10\n" "ldr q4, [a_ptr2], #0x10\n" "ldr q6, [a_ptr3], #0x10\n" - "ldr q8, [a_ptr4], #0x10\n" - "ldr q10, [a_ptr5], #0x10\n" - "ldr q12, [a_ptr6], #0x10\n" - "ldr q14, [a_ptr7], #0x10\n" - "cbnz %[odds], 2f\n" "ldr d1, [%[a_ptr0]]\n" + "ldr q8, [a_ptr4], #0x10\n" "ldr d3, [a_ptr1]\n" + "ldr q10, [a_ptr5], #0x10\n" "ldr d5, [a_ptr2]\n" + "ldr q12, [a_ptr6], #0x10\n" "ldr d7, [a_ptr3]\n" + "ldr q14, [a_ptr7], #0x10\n" "ldr d9, [a_ptr4]\n" "ldr d11, [a_ptr5]\n" "ldr d13, [a_ptr6]\n" "ldr d15, [a_ptr7]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" + "subs %[odds], %[odds], #0x1\n" + "ldr q2, [a_ptr1], #0x10\n" + "ldr q4, [a_ptr2], #0x10\n" "ldr s1, [%[a_ptr0]], #0x4\n" + "ldr q6, [a_ptr3], #0x10\n" "ldr s3, [a_ptr1], #0x4\n" + "ldr q8, [a_ptr4], #0x10\n" "ldr s5, [a_ptr2], #0x4\n" + "ldr q10, [a_ptr5], #0x10\n" "ldr s7, [a_ptr3], #0x4\n" + "ldr q12, [a_ptr6], #0x10\n" "ldr s9, [a_ptr4], #0x4\n" + "ldr q14, [a_ptr7], #0x10\n" "ldr s11, [a_ptr5], #0x4\n" "ldr s13, [a_ptr6], #0x4\n" "ldr s15, [a_ptr7], #0x4\n" - "subs %[odds], %[odds], #0x1\n" "b.ne 4f\n" "ld1 {v1.b}[4], [%[a_ptr0]]\n" "ld1 {v3.b}[4], [a_ptr1]\n" @@ -1813,32 +1934,34 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t "ld1 {v13.b}[6], [a_ptr6]\n" "ld1 {v15.b}[6], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ldr q18, [%[b_ptr0], #0x20]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ldr q19, [%[b_ptr0], #0x30]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ldr q20, [%[b_ptr0], #0x40]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ldr q21, [%[b_ptr0], #0x50]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x60\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x4f88e21c // sdot v28.4s, v16.16b, 
v8.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x60\n" ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" @@ -1882,49 +2005,132 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n" ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n" ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n" - "cbz %[loops], 6f\n" - "ldr d16, [%[b_ptr0]]\n" + "b.eq 7f\n" + "8:\n" + "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" + "movi v24.4s, #0\n" + "ldr d16, [%[b_ptr0]]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + "add %[c_ptr0], %[c_ptr0], #0x10\n" + "str q25, [c_ptr1]\n" + "add c_ptr1, c_ptr1, #0x10\n" + "movi v25.4s, #0\n" "ldr d17, [%[b_ptr0], #0x10]\n" "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "str q26, [c_ptr2]\n" + "add c_ptr2, c_ptr2, #0x10\n" + "movi v26.4s, #0\n" "ldr d18, [%[b_ptr0], #0x20]\n" "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "str q27, [c_ptr3]\n" + "add c_ptr3, c_ptr3, #0x10\n" + "movi v27.4s, #0\n" "ldr d19, [%[b_ptr0], #0x30]\n" "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "str q28, [c_ptr4]\n" + "add c_ptr4, c_ptr4, #0x10\n" + "movi v28.4s, #0\n" "ldr d20, [%[b_ptr0], #0x40]\n" "ins v16.d[1], temploadreg0\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "str q29, [c_ptr5]\n" + "add c_ptr5, c_ptr5, #0x10\n" + "movi v29.4s, #0\n" "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" + "str q30, [c_ptr6]\n" + "movi v30.4s, #0\n" "ldr d21, [%[b_ptr0], #0x50]\n" + ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" "ins v17.d[1], temploadreg1\n" + ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" + "str q31, [c_ptr7]\n" + "movi v31.4s, #0\n" "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "add %[b_ptr0], %[b_ptr0], #0x60\n" + ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" "ins v18.d[1], temploadreg2\n" + ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" "ins v19.d[1], temploadreg3\n" + ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" "ins v20.d[1], temploadreg0\n" - "b.eq 7f\n" - "8:\n" - "str q24, [%[c_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "movi v24.4s, #0\n" + ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" "ins v21.d[1], temploadreg1\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" + "add c_ptr6, c_ptr6, #0x10\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" + "add c_ptr7, c_ptr7, #0x10\n" + ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" + "add %[b_ptr0], %[b_ptr0], #0x60\n" + ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" + ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" + ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n" + ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n" + ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n" + ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n" + ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n" + ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n" + ".inst 0x4f88ea5c // 
sdot v28.4s, v18.16b, v8.4b[2]\n" + ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n" + ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n" + ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n" + ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n" + ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n" + ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n" + ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n" + ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n" + ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n" + ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n" + ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n" + ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n" + ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n" + ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n" + ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n" + ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n" + ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n" + ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n" + ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n" + ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n" + ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n" + ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n" + ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n" + ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n" + ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n" + ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n" + ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n" + "b.ne 8b\n" + "7:\n" + "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" + "movi v24.4s, #0\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "ldr q20, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q21, [%[b_ptr0], #0x50]\n" ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" @@ -1941,105 +2147,66 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t "movi v31.4s, #0\n" "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "add %[b_ptr0], %[b_ptr0], #0x60\n" ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" - "ldr d16, [%[b_ptr0]]\n" - ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" - "ins v16.d[1], temploadreg0\n" ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n" - 
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n" - "ins v17.d[1], temploadreg1\n" ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n" ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n" ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n" ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n" - "ldr d18, [%[b_ptr0], #0x20]\n" ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n" ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n" ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n" - "ins v18.d[1], temploadreg2\n" ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n" ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n" ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n" ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n" ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n" - "ldr d19, [%[b_ptr0], #0x30]\n" ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n" ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n" ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n" - "ins v19.d[1], temploadreg3\n" ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n" ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n" ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n" ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n" ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n" - "ldr d20, [%[b_ptr0], #0x40]\n" ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n" ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n" ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n" - "ins v20.d[1], temploadreg0\n" ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n" ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n" ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n" ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n" ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n" - "ldr d21, [%[b_ptr0], #0x50]\n" - "add %[b_ptr0], %[b_ptr0], #0x60\n" - "b.ne 8b\n" - "7:\n" - "str q24, [%[c_ptr0]]\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" + "b 9f\n" + "6:\n" "movi v24.4s, #0\n" - "ins v21.d[1], temploadreg1\n" - "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" - ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" - "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" - "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" - "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" - "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" - ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" - "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" - "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" - "add c_ptr7, c_ptr7, #0x10\n" + ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" + ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" + ".inst 0x4f86e21b // 
sdot v27.4s, v16.16b, v6.4b[0]\n" + ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" + ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" - ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n" ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n" @@ -2079,23 +2246,16 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n" ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n" ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n" - "6:\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -2242,34 +2402,36 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t "ld1 {v13.b}[10], [a_ptr6]\n" "ld1 {v15.b}[10], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ldr q18, [%[b_ptr0], #0x20]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ldr q19, [%[b_ptr0], #0x30]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ldr q20, [%[b_ptr0], #0x40]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ldr q21, [%[b_ptr0], #0x50]\n" - "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ldr q22, [%[b_ptr0], #0x60]\n" + "add %[b_ptr0], %[b_ptr0], #0x70\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x70\n" ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" @@ -2320,178 +2482,162 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n" ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n" ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n" - "cbz %[loops], 6f\n" - "ldr d16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "ldr d17, 
[%[b_ptr0], #0x10]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "ldr d18, [%[b_ptr0], #0x20]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr d19, [%[b_ptr0], #0x30]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "ldr d20, [%[b_ptr0], #0x40]\n" - "ins v16.d[1], temploadreg0\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "ldr d21, [%[b_ptr0], #0x50]\n" - "ins v17.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "ldr d22, [%[b_ptr0], #0x60]\n" - "ins v18.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add %[b_ptr0], %[b_ptr0], #0x70\n" - "ins v19.d[1], temploadreg3\n" - "ins v20.d[1], temploadreg0\n" - "ins v21.d[1], temploadreg1\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" - "ins v22.d[1], temploadreg2\n" + "ldr d16, [%[b_ptr0]]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr d17, [%[b_ptr0], #0x10]\n" "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "str q26, [c_ptr2]\n" + "add c_ptr2, c_ptr2, #0x10\n" "movi v26.4s, #0\n" + "ldr d18, [%[b_ptr0], #0x20]\n" "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "str q27, [c_ptr3]\n" - "movi v27.4s, #0\n" "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" + "movi v27.4s, #0\n" + "ldr d19, [%[b_ptr0], #0x30]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "str q28, [c_ptr4]\n" - "movi v28.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" + "movi v28.4s, #0\n" + "ldr d20, [%[b_ptr0], #0x40]\n" + "ins v16.d[1], temploadreg0\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "str q29, [c_ptr5]\n" - "movi v29.4s, #0\n" "add c_ptr5, c_ptr5, #0x10\n" - ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" + "movi v29.4s, #0\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" + "ldr d21, [%[b_ptr0], #0x50]\n" + ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" + "ins v17.d[1], temploadreg1\n" + ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" - "add c_ptr7, c_ptr7, #0x10\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" + "ldr d22, [%[b_ptr0], #0x60]\n" + ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" + "ins v18.d[1], temploadreg2\n" + ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ins v19.d[1], temploadreg3\n" ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" - "ldr d16, [%[b_ptr0]]\n" + "ins v20.d[1], temploadreg0\n" ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "ins v21.d[1], temploadreg1\n" ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" - "ins v16.d[1], temploadreg0\n" + "ins v22.d[1], temploadreg2\n" ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n" - "ldr temploadreg0, 
[%[b_ptr0], #0x48]\n" + "add c_ptr6, c_ptr6, #0x10\n" ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "add %[b_ptr0], %[b_ptr0], #0x70\n" ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n" "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n" "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" - ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n" "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" + ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n" - "ins v17.d[1], temploadreg1\n" ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n" ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n" ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n" ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n" - "ldr d18, [%[b_ptr0], #0x20]\n" ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n" ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n" ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n" - "ins v18.d[1], temploadreg2\n" ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n" ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n" ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n" ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n" - "ldr d19, [%[b_ptr0], #0x30]\n" ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n" ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n" ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n" - "ins v19.d[1], temploadreg3\n" ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n" ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n" ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n" ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n" ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n" - "ldr d20, [%[b_ptr0], #0x40]\n" ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n" ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n" ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n" - "ins v20.d[1], temploadreg0\n" ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n" ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n" ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n" ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n" ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n" - "ldr d21, [%[b_ptr0], #0x50]\n" ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n" ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n" ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n" - "ins v21.d[1], temploadreg1\n" ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n" ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n" ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n" ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n" ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n" - "ldr d22, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x70\n" "b.ne 8b\n" "7:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" - "ins v22.d[1], 
temploadreg2\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "ldr q20, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q21, [%[b_ptr0], #0x50]\n" ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" + "ldr q22, [%[b_ptr0], #0x60]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "add c_ptr3, c_ptr3, #0x10\n" ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" - "add c_ptr7, c_ptr7, #0x10\n" + "add c_ptr6, c_ptr6, #0x10\n" ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" - ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" + "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x70\n" ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n" ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n" @@ -2539,23 +2685,82 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n" ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n" ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" + ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" + ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" + ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" + ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" + ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" + ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" + ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" + ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n" + ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n" + ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n" + ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n" + ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n" + ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n" + ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n" + ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n" + ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n" + ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n" + ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n" + ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n" + ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n" + ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n" + ".inst 0x4fa0ea78 // sdot 
v24.4s, v19.16b, v0.4b[3]\n" + ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n" + ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n" + ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n" + ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n" + ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n" + ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n" + ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n" + ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n" + ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n" + ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n" + ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n" + ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n" + ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n" + ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n" + ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n" + ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n" + ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n" + ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n" + ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n" + ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n" + ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n" + ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n" + ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n" + ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n" + ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n" + ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n" + ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n" + ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n" + ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n" + ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n" + ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -2643,6 +2848,7 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q2, [a_ptr1], #0x10\n" "ldr q4, [a_ptr2], #0x10\n" @@ -2651,7 +2857,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t "ldr q10, [a_ptr5], #0x10\n" "ldr q12, [a_ptr6], #0x10\n" "ldr q14, [a_ptr7], #0x10\n" - "cbnz %[odds], 2f\n" "ldr q1, [%[a_ptr0]]\n" "ldr q3, [a_ptr1]\n" "ldr q5, [a_ptr2]\n" @@ -2662,15 +2867,24 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t "ldr q15, [a_ptr7]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" + "subs %[odds], %[odds], #0x1\n" + "ldr q2, [a_ptr1], #0x10\n" + "ldr q4, [a_ptr2], #0x10\n" "ldr d1, [%[a_ptr0]], #0x8\n" + "ldr q6, [a_ptr3], #0x10\n" "ldr d3, [a_ptr1], #0x8\n" + "ldr q8, [a_ptr4], #0x10\n" "ldr d5, [a_ptr2], #0x8\n" + "ldr q10, [a_ptr5], #0x10\n" "ldr d7, [a_ptr3], #0x8\n" + "ldr q12, [a_ptr6], #0x10\n" "ldr d9, [a_ptr4], #0x8\n" + "ldr q14, [a_ptr7], #0x10\n" "ldr d11, [a_ptr5], #0x8\n" "ldr d13, [a_ptr6], #0x8\n" - "ldr d15, [a_ptr7], #0x8\n" "ld1 {v1.s}[2], [%[a_ptr0]], #4\n" + 
"ldr d15, [a_ptr7], #0x8\n" "ld1 {v3.s}[2], [a_ptr1], #4\n" "ld1 {v5.s}[2], [a_ptr2], #4\n" "ld1 {v7.s}[2], [a_ptr3], #4\n" @@ -2678,7 +2892,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t "ld1 {v11.s}[2], [a_ptr5], #4\n" "ld1 {v13.s}[2], [a_ptr6], #4\n" "ld1 {v15.s}[2], [a_ptr7], #4\n" - "subs %[odds], %[odds], #0x1\n" "b.ne 4f\n" "ld1 {v1.b}[12], [%[a_ptr0]]\n" "ld1 {v3.b}[12], [a_ptr1]\n" @@ -2711,36 +2924,38 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t "ld1 {v13.b}[14], [a_ptr6]\n" "ld1 {v15.b}[14], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ldr q18, [%[b_ptr0], #0x20]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ldr q19, [%[b_ptr0], #0x30]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ldr q20, [%[b_ptr0], #0x40]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ldr q21, [%[b_ptr0], #0x50]\n" - "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ldr q22, [%[b_ptr0], #0x60]\n" - "movi v31.4s, #0\n" "ldr q23, [%[b_ptr0], #0x70]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" @@ -2798,192 +3013,248 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n" ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n" ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n" - "cbz %[loops], 6f\n" - "ldr d16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "ldr d18, [%[b_ptr0], #0x20]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr d19, [%[b_ptr0], #0x30]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "ldr d20, [%[b_ptr0], #0x40]\n" - "ins v16.d[1], temploadreg0\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "ldr d21, [%[b_ptr0], #0x50]\n" - "ins v17.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "ldr d22, [%[b_ptr0], #0x60]\n" - "ins v18.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "ldr d23, [%[b_ptr0], #0x70]\n" - "ins v19.d[1], temploadreg3\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "ins v20.d[1], temploadreg0\n" - "ins v21.d[1], temploadreg1\n" - "ins v22.d[1], 
temploadreg2\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" - "ins v23.d[1], temploadreg3\n" + "ldr d16, [%[b_ptr0]]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr d17, [%[b_ptr0], #0x10]\n" "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "str q26, [c_ptr2]\n" + "add c_ptr2, c_ptr2, #0x10\n" "movi v26.4s, #0\n" + "ldr d18, [%[b_ptr0], #0x20]\n" "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "str q27, [c_ptr3]\n" - "movi v27.4s, #0\n" "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" + "movi v27.4s, #0\n" + "ldr d19, [%[b_ptr0], #0x30]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "str q28, [c_ptr4]\n" - "movi v28.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" + "movi v28.4s, #0\n" + "ldr d20, [%[b_ptr0], #0x40]\n" + "ins v16.d[1], temploadreg0\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "str q29, [c_ptr5]\n" - "movi v29.4s, #0\n" "add c_ptr5, c_ptr5, #0x10\n" - ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" + "movi v29.4s, #0\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" + "ldr d21, [%[b_ptr0], #0x50]\n" + ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" + "ins v17.d[1], temploadreg1\n" + ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" - "add c_ptr7, c_ptr7, #0x10\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" + "ldr d22, [%[b_ptr0], #0x60]\n" + ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" + "ins v18.d[1], temploadreg2\n" + ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ldr d23, [%[b_ptr0], #0x70]\n" ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" - "ldr d16, [%[b_ptr0]]\n" + "ins v19.d[1], temploadreg3\n" ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" - "ins v16.d[1], temploadreg0\n" + "ins v20.d[1], temploadreg0\n" ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + "ins v21.d[1], temploadreg1\n" ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "ins v22.d[1], temploadreg2\n" ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "ins v23.d[1], temploadreg3\n" ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + "add c_ptr6, c_ptr6, #0x10\n" ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" + "add %[b_ptr0], 
%[b_ptr0], #0x80\n" ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n" - "ins v17.d[1], temploadreg1\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n" "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n" ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n" ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n" ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n" - "ldr d18, [%[b_ptr0], #0x20]\n" ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n" ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n" ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n" - "ins v18.d[1], temploadreg2\n" ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n" ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n" ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n" ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n" - "ldr d19, [%[b_ptr0], #0x30]\n" ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n" ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n" ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n" - "ins v19.d[1], temploadreg3\n" ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n" ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n" ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n" ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n" - "ldr d20, [%[b_ptr0], #0x40]\n" ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n" ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n" ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n" - "ins v20.d[1], temploadreg0\n" ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n" ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n" ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n" ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n" ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n" - "ldr d21, [%[b_ptr0], #0x50]\n" ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n" ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n" ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n" - "ins v21.d[1], temploadreg1\n" ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n" ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n" ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n" ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n" ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n" - "ldr d22, [%[b_ptr0], #0x60]\n" ".inst 0x4fa1eaf8 // sdot v24.4s, v23.16b, v1.4b[3]\n" ".inst 0x4fa3eaf9 // sdot v25.4s, v23.16b, v3.4b[3]\n" ".inst 0x4fa5eafa // sdot v26.4s, v23.16b, v5.4b[3]\n" - "ins v22.d[1], temploadreg2\n" ".inst 0x4fa7eafb // sdot v27.4s, v23.16b, v7.4b[3]\n" ".inst 0x4fa9eafc // sdot v28.4s, v23.16b, v9.4b[3]\n" ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n" ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n" ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n" - "ldr d23, [%[b_ptr0], #0x70]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" "b.ne 8b\n" "7:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" - "ins v23.d[1], 
temploadreg3\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "ldr q20, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q21, [%[b_ptr0], #0x50]\n" ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" + "ldr q22, [%[b_ptr0], #0x60]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "ldr q23, [%[b_ptr0], #0x70]\n" ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" + "add c_ptr6, c_ptr6, #0x10\n" + ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" "add c_ptr7, c_ptr7, #0x10\n" + ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n" + ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n" + ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n" + ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n" + ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n" + ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n" + ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n" + ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n" + ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n" + ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n" + ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n" + ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n" + ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n" + ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n" + ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n" + ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n" + ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n" + ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n" + ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n" + ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n" + ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n" + ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n" + ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n" + ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n" + ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n" + ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n" + ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n" + ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n" + ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n" + ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n" + ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n" + ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n" + ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n" + ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n" + ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n" + ".inst 0x4fabe2bd // sdot 
v29.4s, v21.16b, v11.4b[1]\n" + ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n" + ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n" + ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n" + ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n" + ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n" + ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n" + ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n" + ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n" + ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n" + ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n" + ".inst 0x4fa1eaf8 // sdot v24.4s, v23.16b, v1.4b[3]\n" + ".inst 0x4fa3eaf9 // sdot v25.4s, v23.16b, v3.4b[3]\n" + ".inst 0x4fa5eafa // sdot v26.4s, v23.16b, v5.4b[3]\n" + ".inst 0x4fa7eafb // sdot v27.4s, v23.16b, v7.4b[3]\n" + ".inst 0x4fa9eafc // sdot v28.4s, v23.16b, v9.4b[3]\n" + ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n" + ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n" + ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n" + "b 9f\n" + "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" + ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" + ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" + ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" + ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" - ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n" ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n" @@ -3039,23 +3310,16 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n" ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n" ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n" - "6:\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/generic.cpp similarity index 85% rename from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/generic.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/generic.cpp index c94e975754..7fcf853d2e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -33,7 +33,7 @@ namespace arm_gemm { -void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) { +void a64_smallK_hybrid_s8s32_dot_8x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) { const long loops_count = iceildiv(N, (int)4) - 1; const long ldab = lda * sizeof(int8_t); const long ldcb = ldc * sizeof(int32_t); @@ -153,22 +153,24 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "ld1 {v6.b}[2], [a_ptr6]\n" "ld1 {v7.b}[2], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" - "movi v26.4s, #0\n" "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" - "movi v27.4s, #0\n" "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" - "movi v28.4s, #0\n" "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" - "movi v29.4s, #0\n" "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" - "movi v30.4s, #0\n" "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" - "movi v31.4s, #0\n" "add %[b_ptr0], %[b_ptr0], #0x10\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" @@ -177,20 +179,17 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n" ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" - "cbz %[loops], 6f\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" + "ldr q16, [%[b_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" @@ -216,10 +215,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "movi v31.4s, #0\n" "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" - "ldr q16, [%[b_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" @@ -231,6 +229,8 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" + "ldr q16, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" @@ -260,23 +260,34 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi 
v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" + ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" + ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" + ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n" + ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n" + ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" + ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -407,24 +418,26 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "ld1 {v6.b}[6], [a_ptr6]\n" "ld1 {v7.b}[6], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" "movi v26.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "movi v27.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" @@ -440,68 +453,66 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n" ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n" ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n" - "cbz %[loops], 6f\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" + "ldr q16, [%[b_ptr0]]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - "str q25, [c_ptr1]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v25.4s, #0\n" + "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v25.4s, #0\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "add c_ptr3, c_ptr3, #0x10\n" ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 
0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr6, c_ptr6, #0x10\n" ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" "add c_ptr7, c_ptr7, #0x10\n" + ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" - "ldr q16, [%[b_ptr0]]\n" - ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" - ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" - ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n" + ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n" "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" - ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n" + ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n" "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" - ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n" + ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n" "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" - ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n" + ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n" "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" - ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n" + ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n" "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" + ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n" ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n" "b.ne 8b\n" "7:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" + "ldr q16, [%[b_ptr0]]\n" "ldr q17, [%[b_ptr0], #0x10]\n" "add %[b_ptr0], %[b_ptr0], #0x20\n" "str q25, [c_ptr1]\n" @@ -541,23 +552,42 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n" ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n" ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" + ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" + ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" + ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n" + ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n" + ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" + ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" + ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n" + ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n" + ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n" + ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n" + ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n" + ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n" + ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - 
"add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -688,26 +718,28 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "ld1 {v6.b}[10], [a_ptr6]\n" "ld1 {v7.b}[10], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ldr q18, [%[b_ptr0], #0x20]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x30\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" "movi v27.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x30\n" ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n" @@ -730,49 +762,46 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n" ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n" ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n" - "cbz %[loops], 6f\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr q17, [%[b_ptr0], #0x10]\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "add %[b_ptr0], %[b_ptr0], #0x30\n" - "str q25, [c_ptr1]\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v25.4s, #0\n" + "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "add c_ptr3, c_ptr3, #0x10\n" ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr6, c_ptr6, #0x10\n" ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" "add c_ptr7, c_ptr7, #0x10\n" 
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x30\n" ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" - "ldr q16, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n" @@ -788,7 +817,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n" "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n" ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n" ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n" @@ -802,11 +830,12 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "add %[b_ptr0], %[b_ptr0], #0x30\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" @@ -832,8 +861,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "movi v31.4s, #0\n" "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" - ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" + "add %[b_ptr0], %[b_ptr0], #0x30\n" ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n" ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n" ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n" @@ -849,23 +879,50 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n" ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n" ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" + ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" + ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" + ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n" + ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n" + ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" + ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" + ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n" + ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n" + ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n" + ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n" + ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n" + ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n" + ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n" + ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n" + ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n" + ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n" + ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n" + ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, 
v4.4b[2]\n" + ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n" + ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n" + ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -1004,28 +1061,30 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "ld1 {v6.b}[14], [a_ptr6]\n" "ld1 {v7.b}[14], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ldr q18, [%[b_ptr0], #0x20]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ldr q19, [%[b_ptr0], #0x30]\n" - "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" - "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n" ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n" @@ -1055,50 +1114,47 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n" ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n" ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n" - "cbz %[loops], 6f\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "str q25, [c_ptr1]\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v25.4s, #0\n" + "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "add c_ptr3, c_ptr3, #0x10\n" ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "add 
c_ptr4, c_ptr4, #0x10\n" ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr6, c_ptr6, #0x10\n" ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" "add c_ptr7, c_ptr7, #0x10\n" + ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" - "ldr q16, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n" @@ -1114,7 +1170,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n" "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n" ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n" ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n" @@ -1123,7 +1178,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n" ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n" ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n" ".inst 0x4fa1ea79 // sdot v25.4s, v19.16b, v1.4b[3]\n" ".inst 0x4fa2ea7a // sdot v26.4s, v19.16b, v2.4b[3]\n" @@ -1137,14 +1191,16 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" + "ldr q19, [%[b_ptr0], #0x30]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" "str q27, [c_ptr3]\n" @@ -1167,8 +1223,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "movi v31.4s, #0\n" "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" - ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n" ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n" ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n" @@ -1192,23 +1249,58 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n" ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n" ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 
0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n" + ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n" + ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n" + ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n" + ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n" + ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n" + ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" + ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n" + ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n" + ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n" + ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n" + ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n" + ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n" + ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n" + ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n" + ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n" + ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n" + ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n" + ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n" + ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n" + ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n" + ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n" + ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n" + ".inst 0x4fa1ea79 // sdot v25.4s, v19.16b, v1.4b[3]\n" + ".inst 0x4fa2ea7a // sdot v26.4s, v19.16b, v2.4b[3]\n" + ".inst 0x4fa3ea7b // sdot v27.4s, v19.16b, v3.4b[3]\n" + ".inst 0x4fa4ea7c // sdot v28.4s, v19.16b, v4.4b[3]\n" + ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n" + ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n" + ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -1287,26 +1379,34 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q2, [a_ptr1], #0x10\n" "ldr q4, [a_ptr2], #0x10\n" "ldr q6, [a_ptr3], #0x10\n" - "ldr q8, [a_ptr4], #0x10\n" - "ldr q10, [a_ptr5], #0x10\n" - "ldr q12, [a_ptr6], #0x10\n" - "ldr q14, [a_ptr7], #0x10\n" - "cbnz %[odds], 2f\n" "ldr s1, [%[a_ptr0]]\n" + "ldr q8, [a_ptr4], #0x10\n" "ldr s3, [a_ptr1]\n" + "ldr q10, [a_ptr5], #0x10\n" "ldr s5, [a_ptr2]\n" + "ldr q12, [a_ptr6], #0x10\n" "ldr s7, [a_ptr3]\n" + "ldr q14, [a_ptr7], #0x10\n" "ldr s9, [a_ptr4]\n" "ldr s11, [a_ptr5]\n" "ldr s13, [a_ptr6]\n" "ldr s15, [a_ptr7]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" "subs %[odds], %[odds], #0x1\n" + "ldr q2, [a_ptr1], #0x10\n" + "ldr q4, [a_ptr2], #0x10\n" + "ldr q6, [a_ptr3], #0x10\n" + "ldr q8, [a_ptr4], #0x10\n" + "ldr q10, [a_ptr5], #0x10\n" + "ldr q12, [a_ptr6], #0x10\n" + "ldr q14, [a_ptr7], #0x10\n" "b.ne 4f\n" "ldr b1, [%[a_ptr0]]\n" "ldr b3, [a_ptr1]\n" @@ -1339,30 +1439,32 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "ld1 {v13.b}[2], 
[a_ptr6]\n" "ld1 {v15.b}[2], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ldr q18, [%[b_ptr0], #0x20]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ldr q19, [%[b_ptr0], #0x30]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ldr q20, [%[b_ptr0], #0x40]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x50\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x50\n" ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" @@ -1399,51 +1501,48 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n" ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n" ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n" - "cbz %[loops], 6f\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" - "ldr q20, [%[b_ptr0], #0x40]\n" - "add %[b_ptr0], %[b_ptr0], #0x50\n" - "str q25, [c_ptr1]\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v25.4s, #0\n" + "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "ldr q20, [%[b_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "add c_ptr3, c_ptr3, #0x10\n" ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr6, c_ptr6, #0x10\n" ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" "add c_ptr7, c_ptr7, #0x10\n" + ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x50\n" ".inst 
0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" - "ldr q16, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" @@ -1459,7 +1558,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n" "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n" ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n" ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n" @@ -1468,7 +1566,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n" ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n" ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n" ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n" ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n" @@ -1477,7 +1574,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n" ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n" ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n" ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n" ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n" @@ -1491,14 +1587,17 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" - "ldr q20, [%[b_ptr0], #0x40]\n" - "add %[b_ptr0], %[b_ptr0], #0x50\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "ldr q20, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" "str q27, [c_ptr3]\n" @@ -1521,8 +1620,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "movi v31.4s, #0\n" "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" - ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" + "add %[b_ptr0], %[b_ptr0], #0x50\n" ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n" ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n" @@ -1554,23 +1654,66 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n" ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n" ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" + ".inst 0x4f84e21a // sdot v26.4s, v16.16b, 
v4.4b[0]\n" + ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" + ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" + ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" + ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" + ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" + ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" + ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n" + ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n" + ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n" + ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n" + ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n" + ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n" + ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n" + ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n" + ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n" + ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n" + ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n" + ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n" + ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n" + ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n" + ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n" + ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n" + ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n" + ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n" + ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n" + ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n" + ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n" + ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n" + ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n" + ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n" + ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n" + ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n" + ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n" + ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n" + ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n" + ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -1649,34 +1792,42 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q2, [a_ptr1], #0x10\n" "ldr q4, [a_ptr2], #0x10\n" "ldr q6, [a_ptr3], #0x10\n" + "ldr d1, [%[a_ptr0]]\n" "ldr q8, [a_ptr4], #0x10\n" + "ldr d3, [a_ptr1]\n" "ldr q10, [a_ptr5], #0x10\n" + "ldr d5, [a_ptr2]\n" "ldr q12, [a_ptr6], #0x10\n" + "ldr d7, [a_ptr3]\n" "ldr q14, [a_ptr7], #0x10\n" - "cbnz %[odds], 2f\n" - "ldr d1, [%[a_ptr0]]\n" - "ldr d3, [a_ptr1]\n" - "ldr d5, [a_ptr2]\n" - "ldr d7, [a_ptr3]\n" "ldr d9, [a_ptr4]\n" "ldr d11, [a_ptr5]\n" "ldr d13, [a_ptr6]\n" "ldr d15, [a_ptr7]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" + "subs %[odds], %[odds], #0x1\n" + "ldr q2, [a_ptr1], #0x10\n" + "ldr q4, [a_ptr2], #0x10\n" "ldr s1, [%[a_ptr0]], #0x4\n" + "ldr q6, 
[a_ptr3], #0x10\n" "ldr s3, [a_ptr1], #0x4\n" + "ldr q8, [a_ptr4], #0x10\n" "ldr s5, [a_ptr2], #0x4\n" + "ldr q10, [a_ptr5], #0x10\n" "ldr s7, [a_ptr3], #0x4\n" + "ldr q12, [a_ptr6], #0x10\n" "ldr s9, [a_ptr4], #0x4\n" + "ldr q14, [a_ptr7], #0x10\n" "ldr s11, [a_ptr5], #0x4\n" "ldr s13, [a_ptr6], #0x4\n" "ldr s15, [a_ptr7], #0x4\n" - "subs %[odds], %[odds], #0x1\n" "b.ne 4f\n" "ld1 {v1.b}[4], [%[a_ptr0]]\n" "ld1 {v3.b}[4], [a_ptr1]\n" @@ -1709,32 +1860,34 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "ld1 {v13.b}[6], [a_ptr6]\n" "ld1 {v15.b}[6], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ldr q18, [%[b_ptr0], #0x20]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ldr q19, [%[b_ptr0], #0x30]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ldr q20, [%[b_ptr0], #0x40]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ldr q21, [%[b_ptr0], #0x50]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x60\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x60\n" ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" @@ -1778,68 +1931,64 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n" ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n" ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n" - "cbz %[loops], 6f\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "ldr q20, [%[b_ptr0], #0x40]\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" - "ldr q21, [%[b_ptr0], #0x50]\n" - "add %[b_ptr0], %[b_ptr0], #0x60\n" - "str q25, [c_ptr1]\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v25.4s, #0\n" + "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "ldr q20, [%[b_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "ldr q21, [%[b_ptr0], #0x50]\n" ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 
0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr6, c_ptr6, #0x10\n" ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" "add c_ptr7, c_ptr7, #0x10\n" + ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x60\n" ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" - "ldr q16, [%[b_ptr0]]\n" - ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n" ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n" ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n" @@ -1848,7 +1997,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n" ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n" ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n" ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n" ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n" @@ -1857,7 +2005,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n" ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n" ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n" ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n" ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n" @@ -1866,7 +2013,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n" ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n" ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n" - "ldr q20, [%[b_ptr0], #0x40]\n" ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n" ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n" ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n" @@ -1880,20 +2026,25 @@ void 
a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" - "ldr q21, [%[b_ptr0], #0x50]\n" - "add %[b_ptr0], %[b_ptr0], #0x60\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "ldr q20, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q21, [%[b_ptr0], #0x50]\n" ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" @@ -1910,7 +2061,7 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "movi v31.4s, #0\n" "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" - ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" + "add %[b_ptr0], %[b_ptr0], #0x60\n" ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n" @@ -1951,23 +2102,74 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n" ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n" ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" + ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" + ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" + ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" + ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" + ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" + ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" + ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" + ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n" + ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n" + ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n" + ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n" + ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n" + ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n" + ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n" + ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n" + ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n" + ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n" + ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n" + ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n" + ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n" + ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n" + ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n" + ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n" + ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n" + ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n" + ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, 
v8.4b[3]\n" + ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n" + ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n" + ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n" + ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n" + ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n" + ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n" + ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n" + ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n" + ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n" + ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n" + ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n" + ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n" + ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n" + ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n" + ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n" + ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n" + ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n" + ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n" + ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -2106,34 +2308,36 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "ld1 {v13.b}[10], [a_ptr6]\n" "ld1 {v15.b}[10], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ldr q18, [%[b_ptr0], #0x20]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ldr q19, [%[b_ptr0], #0x30]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ldr q20, [%[b_ptr0], #0x40]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ldr q21, [%[b_ptr0], #0x50]\n" - "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ldr q22, [%[b_ptr0], #0x60]\n" + "add %[b_ptr0], %[b_ptr0], #0x70\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x70\n" ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" @@ -2184,34 +2388,31 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f8beadd // sdot v29.4s, 
v22.16b, v11.4b[2]\n" ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n" ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n" - "cbz %[loops], 6f\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "ldr q20, [%[b_ptr0], #0x40]\n" - "ldr q21, [%[b_ptr0], #0x50]\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" - "ldr q22, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x70\n" - "str q25, [c_ptr1]\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v25.4s, #0\n" + "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "ldr q20, [%[b_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "ldr q21, [%[b_ptr0], #0x50]\n" ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" + "ldr q22, [%[b_ptr0], #0x60]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" "add c_ptr3, c_ptr3, #0x10\n" @@ -2230,24 +2431,23 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" - "ldr q16, [%[b_ptr0]]\n" - ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "add %[b_ptr0], %[b_ptr0], #0x70\n" ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n" ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n" ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n" @@ -2255,7 +2455,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n" ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n" ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n" ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n" ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n" @@ -2264,7 +2463,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const 
int8_t *A, int lda, const int8_t *B, ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n" ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n" ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n" ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n" ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n" @@ -2273,7 +2471,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n" ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n" ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n" - "ldr q20, [%[b_ptr0], #0x40]\n" ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n" ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n" ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n" @@ -2282,7 +2479,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n" ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n" ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n" - "ldr q21, [%[b_ptr0], #0x50]\n" ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n" ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n" ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n" @@ -2296,38 +2492,44 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" - "ldr q22, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x70\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "ldr q20, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q21, [%[b_ptr0], #0x50]\n" ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" + "ldr q22, [%[b_ptr0], #0x60]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "add c_ptr3, c_ptr3, #0x10\n" ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" - "add c_ptr7, c_ptr7, #0x10\n" + "add c_ptr6, c_ptr6, #0x10\n" ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" - ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" + "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x70\n" ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n" ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n" @@ -2375,23 +2577,82 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n" ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n" ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n" + "b 9f\n" "6:\n" 
+ "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" + ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" + ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" + ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" + ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" + ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" + ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" + ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" + ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n" + ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n" + ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n" + ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n" + ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n" + ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n" + ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n" + ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n" + ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n" + ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n" + ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n" + ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n" + ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n" + ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n" + ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n" + ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n" + ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n" + ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n" + ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n" + ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n" + ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n" + ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n" + ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n" + ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n" + ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n" + ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n" + ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n" + ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n" + ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n" + ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n" + ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n" + ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n" + ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n" + ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n" + ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n" + ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n" + ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n" + ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n" + ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n" + ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n" + ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n" + ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n" + ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n" + ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n" + ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n" + ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, 
c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -2471,6 +2732,7 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q2, [a_ptr1], #0x10\n" "ldr q4, [a_ptr2], #0x10\n" @@ -2479,7 +2741,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "ldr q10, [a_ptr5], #0x10\n" "ldr q12, [a_ptr6], #0x10\n" "ldr q14, [a_ptr7], #0x10\n" - "cbnz %[odds], 2f\n" "ldr q1, [%[a_ptr0]]\n" "ldr q3, [a_ptr1]\n" "ldr q5, [a_ptr2]\n" @@ -2490,15 +2751,24 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "ldr q15, [a_ptr7]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" + "subs %[odds], %[odds], #0x1\n" + "ldr q2, [a_ptr1], #0x10\n" + "ldr q4, [a_ptr2], #0x10\n" "ldr d1, [%[a_ptr0]], #0x8\n" + "ldr q6, [a_ptr3], #0x10\n" "ldr d3, [a_ptr1], #0x8\n" + "ldr q8, [a_ptr4], #0x10\n" "ldr d5, [a_ptr2], #0x8\n" + "ldr q10, [a_ptr5], #0x10\n" "ldr d7, [a_ptr3], #0x8\n" + "ldr q12, [a_ptr6], #0x10\n" "ldr d9, [a_ptr4], #0x8\n" + "ldr q14, [a_ptr7], #0x10\n" "ldr d11, [a_ptr5], #0x8\n" "ldr d13, [a_ptr6], #0x8\n" - "ldr d15, [a_ptr7], #0x8\n" "ld1 {v1.s}[2], [%[a_ptr0]], #4\n" + "ldr d15, [a_ptr7], #0x8\n" "ld1 {v3.s}[2], [a_ptr1], #4\n" "ld1 {v5.s}[2], [a_ptr2], #4\n" "ld1 {v7.s}[2], [a_ptr3], #4\n" @@ -2506,7 +2776,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "ld1 {v11.s}[2], [a_ptr5], #4\n" "ld1 {v13.s}[2], [a_ptr6], #4\n" "ld1 {v15.s}[2], [a_ptr7], #4\n" - "subs %[odds], %[odds], #0x1\n" "b.ne 4f\n" "ld1 {v1.b}[12], [%[a_ptr0]]\n" "ld1 {v3.b}[12], [a_ptr1]\n" @@ -2539,36 +2808,38 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "ld1 {v13.b}[14], [a_ptr6]\n" "ld1 {v15.b}[14], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ldr q18, [%[b_ptr0], #0x20]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ldr q19, [%[b_ptr0], #0x30]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ldr q20, [%[b_ptr0], #0x40]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ldr q21, [%[b_ptr0], #0x50]\n" - "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ldr q22, [%[b_ptr0], #0x60]\n" - "movi v31.4s, #0\n" "ldr q23, [%[b_ptr0], #0x70]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" - "prfm 
PLDL1KEEP, [a_ptr7, #0x140]\n" ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" @@ -2626,39 +2897,37 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n" ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n" ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n" - "cbz %[loops], 6f\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "ldr q20, [%[b_ptr0], #0x40]\n" - "ldr q21, [%[b_ptr0], #0x50]\n" - "ldr q22, [%[b_ptr0], #0x60]\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" - "ldr q23, [%[b_ptr0], #0x70]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "str q25, [c_ptr1]\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v25.4s, #0\n" + "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "ldr q20, [%[b_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "ldr q21, [%[b_ptr0], #0x50]\n" ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" + "ldr q22, [%[b_ptr0], #0x60]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q23, [%[b_ptr0], #0x70]\n" ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" @@ -2673,32 +2942,29 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" - "ldr q16, [%[b_ptr0]]\n" - ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" - ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x4f80ea58 // 
sdot v24.4s, v18.16b, v0.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n" ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n" ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n" ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n" ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n" ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n" ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n" ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n" @@ -2707,7 +2973,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n" ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n" ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n" ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n" ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n" @@ -2716,7 +2981,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n" ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n" ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n" - "ldr q20, [%[b_ptr0], #0x40]\n" ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n" ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n" ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n" @@ -2725,7 +2989,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n" ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n" ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n" - "ldr q21, [%[b_ptr0], #0x50]\n" ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n" ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n" ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n" @@ -2734,7 +2997,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n" ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n" ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n" - "ldr q22, [%[b_ptr0], #0x60]\n" ".inst 0x4fa1eaf8 // sdot v24.4s, v23.16b, v1.4b[3]\n" ".inst 0x4fa3eaf9 // sdot v25.4s, v23.16b, v3.4b[3]\n" ".inst 0x4fa5eafa // sdot v26.4s, v23.16b, v5.4b[3]\n" @@ -2748,38 +3010,119 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" - "ldr q23, [%[b_ptr0], #0x70]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "ldr q20, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q21, [%[b_ptr0], #0x50]\n" ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" + "ldr q22, [%[b_ptr0], #0x60]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" - "add 
c_ptr4, c_ptr4, #0x10\n" + "ldr q23, [%[b_ptr0], #0x70]\n" ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" + "add c_ptr6, c_ptr6, #0x10\n" + ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" "add c_ptr7, c_ptr7, #0x10\n" + ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n" + ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n" + ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n" + ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n" + ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n" + ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n" + ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n" + ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n" + ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n" + ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n" + ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n" + ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n" + ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n" + ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n" + ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n" + ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n" + ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n" + ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n" + ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n" + ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n" + ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n" + ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n" + ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n" + ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n" + ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n" + ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n" + ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n" + ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n" + ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n" + ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n" + ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n" + ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n" + ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n" + ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n" + ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n" + ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n" + ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n" + ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n" + ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n" + ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n" + ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n" + ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n" + ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n" + ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n" + ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n" + ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n" + ".inst 0x4fa1eaf8 // sdot v24.4s, v23.16b, v1.4b[3]\n" + ".inst 0x4fa3eaf9 // sdot v25.4s, v23.16b, v3.4b[3]\n" + ".inst 
0x4fa5eafa // sdot v26.4s, v23.16b, v5.4b[3]\n" + ".inst 0x4fa7eafb // sdot v27.4s, v23.16b, v7.4b[3]\n" + ".inst 0x4fa9eafc // sdot v28.4s, v23.16b, v9.4b[3]\n" + ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n" + ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n" + ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n" + "b 9f\n" + "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n" + ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n" + ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n" + ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n" + ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n" ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n" - ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n" + ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n" ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n" ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n" @@ -2835,23 +3178,16 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n" ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n" ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n" - "6:\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp similarity index 87% rename from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp rename to src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp index 76931db4dd..5d48a52d42 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp @@ -31,10 +31,10 @@ namespace arm_gemm { // Actual kernel implementations -void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); -void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); +void a64_smallK_hybrid_u8u32_dot_6x4(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); +void a64_smallK_hybrid_u8u32_dot_6x4_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); -class smallK_hybrid_u8u32_dot_4x6 +class cls_a64_smallK_hybrid_u8u32_dot_6x4 { public: typedef uint8_t operand_type; @@ -76,12 +76,12 @@ class smallK_hybrid_u8u32_dot_4x6 StdTransformsFixed transforms = {}; // Default to the generic kernel - kern_type 
kernel=a64_smallK_hybrid_u8u32_dot_4x6; + kern_type kernel=a64_smallK_hybrid_u8u32_dot_6x4; - smallK_hybrid_u8u32_dot_4x6(const CPUInfo *ci) + cls_a64_smallK_hybrid_u8u32_dot_6x4(const CPUInfo *ci) { if (ci->get_cpu_model() == CPUModel::A55r1) { - kernel = a64_smallK_hybrid_u8u32_dot_4x6_a55; + kernel = a64_smallK_hybrid_u8u32_dot_6x4_a55; } } }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp similarity index 80% rename from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/a55.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp index 02894d8327..dddf4c5aa2 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/a55.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -33,7 +33,7 @@ namespace arm_gemm { -void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) { +void a64_smallK_hybrid_u8u32_dot_6x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) { const long loops_count = iceildiv(N, (int)4) - 1; const long ldab = lda * sizeof(uint8_t); const long ldcb = ldc * sizeof(uint32_t); @@ -97,6 +97,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q3, [a_ptr1], #0x10\n" "ldr q6, [a_ptr2], #0x10\n" @@ -107,18 +108,29 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ "ldr q4, [a_ptr1], #0x10\n" "ldr q7, [a_ptr2], #0x10\n" "ldr q10, [a_ptr3], #0x10\n" - "ldr q13, [a_ptr4], #0x10\n" - "ldr q16, [a_ptr5], #0x10\n" - "cbnz %[odds], 2f\n" "ldr s2, [%[a_ptr0]]\n" + "ldr q13, [a_ptr4], #0x10\n" "ldr s5, [a_ptr1]\n" + "ldr q16, [a_ptr5], #0x10\n" "ldr s8, [a_ptr2]\n" "ldr s11, [a_ptr3]\n" "ldr s14, [a_ptr4]\n" "ldr s17, [a_ptr5]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" "subs %[odds], %[odds], #0x1\n" + "ldr q3, [a_ptr1], #0x10\n" + "ldr q6, [a_ptr2], #0x10\n" + "ldr q9, [a_ptr3], #0x10\n" + "ldr q12, [a_ptr4], #0x10\n" + "ldr q15, [a_ptr5], #0x10\n" + "ldr q1, [%[a_ptr0]], #0x10\n" + "ldr q4, [a_ptr1], #0x10\n" + "ldr q7, [a_ptr2], #0x10\n" + "ldr q10, [a_ptr3], #0x10\n" + "ldr q13, [a_ptr4], #0x10\n" + "ldr q16, [a_ptr5], #0x10\n" "b.ne 4f\n" "ldr b2, [%[a_ptr0]]\n" "ldr b5, [a_ptr1]\n" @@ -145,40 +157,42 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ "ld1 {v14.b}[2], [a_ptr4]\n" "ld1 {v17.b}[2], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q18, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q19, [%[b_ptr0], #0x10]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "ldr q20, [%[b_ptr0], #0x20]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "ldr q21, [%[b_ptr0], #0x30]\n" - "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" "ldr q22, [%[b_ptr0], #0x40]\n" - "movi v31.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" "ldr q23, [%[b_ptr0], #0x50]\n" - ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" + "prfm PLDL1KEEP, [a_ptr5, 
#0x180]\n" "ldr q24, [%[b_ptr0], #0x60]\n" - ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" "ldr q25, [%[b_ptr0], #0x70]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" - "ldr q18, [%[b_ptr0]]\n" ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n" ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n" ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n" @@ -222,173 +236,219 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n" ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n" ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n" - "cbz %[loops], 6f\n" - "ldr d18, [%[b_ptr0]]\n" + "b.eq 7f\n" + "8:\n" + "str q26, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" + "movi v26.4s, #0\n" + "ldr d18, [%[b_ptr0]]\n" "ldr temploadreg2, [%[b_ptr0], #0x8]\n" + "add %[c_ptr0], %[c_ptr0], #0x10\n" + "str q27, [c_ptr1]\n" + "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" "ldr d19, [%[b_ptr0], #0x10]\n" "ldr temploadreg3, [%[b_ptr0], #0x18]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "str q28, [c_ptr2]\n" + "add c_ptr2, c_ptr2, #0x10\n" + "movi v28.4s, #0\n" "ldr d20, [%[b_ptr0], #0x20]\n" "ldr temploadreg0, [%[b_ptr0], #0x28]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "str q29, [c_ptr3]\n" + "add c_ptr3, c_ptr3, #0x10\n" + "movi v29.4s, #0\n" "ldr d21, [%[b_ptr0], #0x30]\n" "ldr temploadreg1, [%[b_ptr0], #0x38]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "str q30, [c_ptr4]\n" + "add c_ptr4, c_ptr4, #0x10\n" + "movi v30.4s, #0\n" "ldr d22, [%[b_ptr0], #0x40]\n" "ins v18.d[1], temploadreg2\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "str q31, [c_ptr5]\n" + "add c_ptr5, c_ptr5, #0x10\n" + "movi v31.4s, #0\n" "ldr temploadreg2, [%[b_ptr0], #0x48]\n" + ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" "ldr d23, [%[b_ptr0], #0x50]\n" + ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" "ins v19.d[1], temploadreg3\n" + ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" "ldr temploadreg3, [%[b_ptr0], #0x58]\n" + ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" "ldr d24, [%[b_ptr0], #0x60]\n" + ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" "ins v20.d[1], temploadreg0\n" + ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" "ldr temploadreg0, [%[b_ptr0], #0x68]\n" + ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" "ldr 
d25, [%[b_ptr0], #0x70]\n" + ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" "ins v21.d[1], temploadreg1\n" + ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" "ldr temploadreg1, [%[b_ptr0], #0x78]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" + ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" "ins v22.d[1], temploadreg2\n" + ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" "ins v23.d[1], temploadreg3\n" + ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n" "ins v24.d[1], temploadreg0\n" + ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n" "ins v25.d[1], temploadreg1\n" - "b.eq 7f\n" - "8:\n" - "str q26, [%[c_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "movi v26.4s, #0\n" + ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n" + "ldr d18, [%[b_ptr0]]\n" + ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n" "ldr temploadreg2, [%[b_ptr0], #0x8]\n" + ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" + ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n" + "ins v18.d[1], temploadreg2\n" + ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n" + ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n" + ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n" + ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n" + ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n" + ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n" + ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n" + ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n" + ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n" + ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n" + ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n" + ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n" + ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n" + ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n" + ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n" + ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n" + ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n" + ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n" + ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n" + ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n" + ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n" + ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n" + ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n" + ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n" + ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n" + ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n" + ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n" + ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n" + ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n" + ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n" + ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n" + "b.ne 8b\n" + "7:\n" + "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" + "movi v26.4s, #0\n" + "ldr q18, [%[b_ptr0]]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi 
v27.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + "ldr q22, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" + "ldr q24, [%[b_ptr0], #0x60]\n" + ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" - "ldr d18, [%[b_ptr0]]\n" - ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" - ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" - "ins v18.d[1], temploadreg2\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x8]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" - "ldr temploadreg3, [%[b_ptr0], #0x18]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x28]\n" ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n" - "ldr d19, [%[b_ptr0], #0x10]\n" ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x38]\n" ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n" - "ins v19.d[1], temploadreg3\n" ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x58]\n" ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n" - "ldr d20, [%[b_ptr0], #0x20]\n" ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n" - "ins v20.d[1], temploadreg0\n" ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #0x68]\n" ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n" ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n" - "ldr d21, [%[b_ptr0], #0x30]\n" ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n" ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n" ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n" - "ins v21.d[1], temploadreg1\n" ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x78]\n" ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n" ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n" - "ldr d22, [%[b_ptr0], #0x40]\n" ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n" ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n" ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n" ".inst 0x6faae2fd // udot v29.4s, 
v23.16b, v10.4b[1]\n" ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n" ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n" - "ldr d23, [%[b_ptr0], #0x50]\n" ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n" ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n" - "ins v23.d[1], temploadreg3\n" ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n" ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n" ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n" - "ldr d24, [%[b_ptr0], #0x60]\n" ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n" ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n" ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n" - "ins v24.d[1], temploadreg0\n" ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n" ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n" ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n" - "ldr d25, [%[b_ptr0], #0x70]\n" ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n" ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n" ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n" - "ins v25.d[1], temploadreg1\n" ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n" ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n" ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n" - "ldr d18, [%[b_ptr0]]\n" - "ins v18.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #0x48]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "ins v22.d[1], temploadreg2\n" - "b.ne 8b\n" - "7:\n" - "str q26, [%[c_ptr0]]\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" + "b 9f\n" + "6:\n" "movi v26.4s, #0\n" - "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" - ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" - "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" - "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" - "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" - "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" + ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" + ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" - ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" "ldr q18, [%[b_ptr0]]\n" - ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" + ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" "add %[b_ptr0], %[b_ptr0], #0x10\n" + ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" @@ -435,19 +495,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n" ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n" ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n" - "6:\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" 
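// The stores in this "9:" epilogue write out the int32 accumulators that the
// udot/sdot chains above build up. As a reference for what each encoded
// ".inst ... // udot vD.4s, vN.16b, vM.4b[i]" contributes, here is a minimal
// scalar model in plain C++ (illustrative names only, not library API): b16
// models the 16 B-panel bytes in vN.16b (four output columns, four K-bytes
// each), a4 models the four A bytes selected by the lane index, and acc
// models the four 32-bit lanes of vD.4s.
#include <cstdint>

static void udot_by_element_ref(uint32_t acc[4], const uint8_t b16[16], const uint8_t a4[4])
{
    for (int col = 0; col < 4; ++col)
        for (int k = 0; k < 4; ++k)
            acc[col] += uint32_t(b16[4 * col + k]) * uint32_t(a4[k]);
}
// The signed kernels (sdot) are identical with int8_t operands and int32
// accumulators. The instructions are emitted as raw ".inst" words, with the
// mnemonic kept in a comment, so the file still assembles on toolchains that
// lack dot-product support.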
"str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -514,6 +569,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q3, [a_ptr1], #0x10\n" "ldr q6, [a_ptr2], #0x10\n" @@ -524,24 +580,35 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ "ldr q4, [a_ptr1], #0x10\n" "ldr q7, [a_ptr2], #0x10\n" "ldr q10, [a_ptr3], #0x10\n" - "ldr q13, [a_ptr4], #0x10\n" - "ldr q16, [a_ptr5], #0x10\n" - "cbnz %[odds], 2f\n" "ldr d2, [%[a_ptr0]]\n" + "ldr q13, [a_ptr4], #0x10\n" "ldr d5, [a_ptr1]\n" + "ldr q16, [a_ptr5], #0x10\n" "ldr d8, [a_ptr2]\n" "ldr d11, [a_ptr3]\n" "ldr d14, [a_ptr4]\n" "ldr d17, [a_ptr5]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" + "subs %[odds], %[odds], #0x1\n" + "ldr q3, [a_ptr1], #0x10\n" + "ldr q6, [a_ptr2], #0x10\n" + "ldr q9, [a_ptr3], #0x10\n" + "ldr q12, [a_ptr4], #0x10\n" + "ldr q15, [a_ptr5], #0x10\n" + "ldr q1, [%[a_ptr0]], #0x10\n" + "ldr q4, [a_ptr1], #0x10\n" + "ldr q7, [a_ptr2], #0x10\n" + "ldr q10, [a_ptr3], #0x10\n" "ldr s2, [%[a_ptr0]], #0x4\n" + "ldr q13, [a_ptr4], #0x10\n" "ldr s5, [a_ptr1], #0x4\n" + "ldr q16, [a_ptr5], #0x10\n" "ldr s8, [a_ptr2], #0x4\n" "ldr s11, [a_ptr3], #0x4\n" "ldr s14, [a_ptr4], #0x4\n" "ldr s17, [a_ptr5], #0x4\n" - "subs %[odds], %[odds], #0x1\n" "b.ne 4f\n" "ld1 {v2.b}[4], [%[a_ptr0]]\n" "ld1 {v5.b}[4], [a_ptr1]\n" @@ -568,38 +635,40 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ "ld1 {v14.b}[6], [a_ptr4]\n" "ld1 {v17.b}[6], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q18, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q19, [%[b_ptr0], #0x10]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "ldr q20, [%[b_ptr0], #0x20]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "ldr q21, [%[b_ptr0], #0x30]\n" - "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" "ldr q22, [%[b_ptr0], #0x40]\n" - "movi v31.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" "ldr q23, [%[b_ptr0], #0x50]\n" - ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" "ldr q24, [%[b_ptr0], #0x60]\n" - ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" "ldr q25, [%[b_ptr0], #0x70]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x6fa9e27d // udot 
v29.4s, v19.16b, v9.4b[1]\n" - "ldr q18, [%[b_ptr0]]\n" ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n" "ldr q19, [%[b_ptr0], #0x10]\n" @@ -652,180 +721,233 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n" ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n" ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n" - "cbz %[loops], 6f\n" - "ldr d18, [%[b_ptr0]]\n" + "b.eq 7f\n" + "8:\n" + "str q26, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" + "movi v26.4s, #0\n" + "ldr d18, [%[b_ptr0]]\n" "ldr temploadreg2, [%[b_ptr0], #0x8]\n" + "add %[c_ptr0], %[c_ptr0], #0x10\n" + "str q27, [c_ptr1]\n" + "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" "ldr d19, [%[b_ptr0], #0x10]\n" "ldr temploadreg3, [%[b_ptr0], #0x18]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "str q28, [c_ptr2]\n" + "add c_ptr2, c_ptr2, #0x10\n" + "movi v28.4s, #0\n" "ldr d20, [%[b_ptr0], #0x20]\n" "ldr temploadreg0, [%[b_ptr0], #0x28]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "str q29, [c_ptr3]\n" + "add c_ptr3, c_ptr3, #0x10\n" + "movi v29.4s, #0\n" "ldr d21, [%[b_ptr0], #0x30]\n" "ldr temploadreg1, [%[b_ptr0], #0x38]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "str q30, [c_ptr4]\n" + "add c_ptr4, c_ptr4, #0x10\n" + "movi v30.4s, #0\n" "ldr d22, [%[b_ptr0], #0x40]\n" "ins v18.d[1], temploadreg2\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "str q31, [c_ptr5]\n" + "add c_ptr5, c_ptr5, #0x10\n" + "movi v31.4s, #0\n" "ldr temploadreg2, [%[b_ptr0], #0x48]\n" + ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" "ldr d23, [%[b_ptr0], #0x50]\n" + ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" "ins v19.d[1], temploadreg3\n" + ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" "ldr temploadreg3, [%[b_ptr0], #0x58]\n" + ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" "ldr d24, [%[b_ptr0], #0x60]\n" + ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" "ins v20.d[1], temploadreg0\n" + ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" "ldr temploadreg0, [%[b_ptr0], #0x68]\n" + ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" "ldr d25, [%[b_ptr0], #0x70]\n" + ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" "ins v21.d[1], temploadreg1\n" + ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" "ldr temploadreg1, [%[b_ptr0], #0x78]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" + ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" "ins v22.d[1], temploadreg2\n" + ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" "ins v23.d[1], temploadreg3\n" + ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n" "ins v24.d[1], temploadreg0\n" + ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n" "ins v25.d[1], temploadreg1\n" - "b.eq 7f\n" - "8:\n" - "str q26, [%[c_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "movi v26.4s, #0\n" + ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n" + "ldr d18, [%[b_ptr0]]\n" + ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n" "ldr temploadreg2, [%[b_ptr0], #0x8]\n" + ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n" + "ldr d19, [%[b_ptr0], #0x10]\n" + ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n" "ldr temploadreg3, [%[b_ptr0], #0x18]\n" + ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n" + "ins v18.d[1], temploadreg2\n" + ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n" + "add 
%[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n" + "ins v19.d[1], temploadreg3\n" + ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n" + ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n" + ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n" + ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n" + ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n" + ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n" + ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n" + ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n" + ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n" + ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n" + ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n" + ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n" + ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n" + ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n" + ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n" + ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n" + ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n" + ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n" + ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n" + ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n" + ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n" + ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n" + ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n" + ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n" + ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n" + ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n" + ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n" + ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n" + ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n" + ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n" + ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n" + ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n" + ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n" + ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n" + "b.ne 8b\n" + "7:\n" + "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" + "movi v26.4s, #0\n" + "ldr q18, [%[b_ptr0]]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + "ldr q22, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" + "ldr q24, [%[b_ptr0], #0x60]\n" + ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" 
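// In this rewritten "7:" tail the patch replaces the paired 64-bit loads
// ("ldr dN" + "ldr temploadregX" + "ins vN.d[1], temploadregX") with plain
// "ldr q" loads: the split form only pays off inside the pipelined "8:" loop,
// where Cortex-A55's 64-bit load path would otherwise stall on back-to-back
// 128-bit loads. A hypothetical intrinsics model of the split idiom (the
// helper name is ours, not the library's):
#include <arm_neon.h>
#include <cstdint>
#include <cstring>

static inline uint8x16_t load_q_split_a55(const uint8_t *p)
{
    uint8x8_t lo = vld1_u8(p);            // "ldr dN, [p]"          — NEON half
    uint64_t  hi;                         // "ldr tempreg, [p, #8]" — GP half
    std::memcpy(&hi, p + 8, sizeof(hi));
    uint64x2_t v = vcombine_u64(vreinterpret_u64_u8(lo), vdup_n_u64(0));
    return vreinterpretq_u8_u64(vsetq_lane_u64(hi, v, 1)); // "ins vN.d[1], tempreg"
}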
- "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" - "ldr d18, [%[b_ptr0]]\n" - ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" - ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" - "ins v18.d[1], temploadreg2\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n" - "ldr d19, [%[b_ptr0], #0x10]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n" "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n" - "ins v19.d[1], temploadreg3\n" ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n" - "ldr temploadreg2, [%[b_ptr0], #0x8]\n" ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x18]\n" ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #0x28]\n" ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n" - "ldr d20, [%[b_ptr0], #0x20]\n" ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #0x38]\n" ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n" ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n" - "ins v20.d[1], temploadreg0\n" ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #0x68]\n" ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n" ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n" - "ldr d21, [%[b_ptr0], #0x30]\n" ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n" ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n" ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n" - "ins v21.d[1], temploadreg1\n" ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x78]\n" ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n" ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n" - "ldr d22, [%[b_ptr0], #0x40]\n" ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n" ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n" ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n" ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n" ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n" ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n" - "ldr d23, [%[b_ptr0], #0x50]\n" ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n" ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n" ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n" ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n" ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n" - "ldr d24, [%[b_ptr0], #0x60]\n" ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n" ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n" ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n" - "ins v24.d[1], temploadreg0\n" ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n" ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n" ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n" - "ldr d25, [%[b_ptr0], #0x70]\n" 
".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n" ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n" ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n" - "ins v25.d[1], temploadreg1\n" ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n" - ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n" - ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n" - "ldr d18, [%[b_ptr0]]\n" - ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n" - ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n" - ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n" - "ins v18.d[1], temploadreg2\n" - ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x48]\n" - ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n" - ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n" - "ldr d19, [%[b_ptr0], #0x10]\n" - "ins v22.d[1], temploadreg2\n" - "ins v19.d[1], temploadreg3\n" - "ldr temploadreg3, [%[b_ptr0], #0x58]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "ins v23.d[1], temploadreg3\n" - "b.ne 8b\n" - "7:\n" - "str q26, [%[c_ptr0]]\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" + ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n" + ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n" + ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n" + ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n" + ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n" + ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n" + ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n" + ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n" + "b 9f\n" + "6:\n" "movi v26.4s, #0\n" - "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" - ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" - "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" - "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" - "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" - "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" + ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" + ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" - ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" "ldr q18, [%[b_ptr0]]\n" + ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" @@ -881,19 +1003,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n" ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n" ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n" - "6:\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ 
-1014,38 +1131,40 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ "ld1 {v14.b}[10], [a_ptr4]\n" "ld1 {v17.b}[10], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q18, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q19, [%[b_ptr0], #0x10]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "ldr q20, [%[b_ptr0], #0x20]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "ldr q21, [%[b_ptr0], #0x30]\n" - "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" "ldr q22, [%[b_ptr0], #0x40]\n" - "movi v31.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" "ldr q23, [%[b_ptr0], #0x50]\n" - ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" "ldr q24, [%[b_ptr0], #0x60]\n" - ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" "ldr q25, [%[b_ptr0], #0x70]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" - "ldr q18, [%[b_ptr0]]\n" ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n" "ldr q19, [%[b_ptr0], #0x10]\n" @@ -1105,189 +1224,249 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n" ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n" ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n" - "cbz %[loops], 6f\n" - "ldr d18, [%[b_ptr0]]\n" + "b.eq 7f\n" + "8:\n" + "str q26, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" + "movi v26.4s, #0\n" + "ldr d18, [%[b_ptr0]]\n" "ldr temploadreg2, [%[b_ptr0], #0x8]\n" + "add %[c_ptr0], %[c_ptr0], #0x10\n" + "str q27, [c_ptr1]\n" + "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" "ldr d19, [%[b_ptr0], #0x10]\n" "ldr temploadreg3, [%[b_ptr0], #0x18]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "str q28, [c_ptr2]\n" + "add c_ptr2, c_ptr2, #0x10\n" + "movi v28.4s, #0\n" "ldr d20, [%[b_ptr0], #0x20]\n" "ldr temploadreg0, [%[b_ptr0], #0x28]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "str q29, [c_ptr3]\n" + "add c_ptr3, c_ptr3, #0x10\n" + "movi v29.4s, #0\n" "ldr d21, [%[b_ptr0], #0x30]\n" "ldr temploadreg1, [%[b_ptr0], #0x38]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "str q30, [c_ptr4]\n" + "add c_ptr4, c_ptr4, #0x10\n" + "movi v30.4s, #0\n" "ldr d22, [%[b_ptr0], #0x40]\n" "ins v18.d[1], temploadreg2\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "str q31, [c_ptr5]\n" + "add c_ptr5, c_ptr5, #0x10\n" + "movi v31.4s, #0\n" "ldr temploadreg2, [%[b_ptr0], #0x48]\n" + 
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" "ldr d23, [%[b_ptr0], #0x50]\n" + ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" "ins v19.d[1], temploadreg3\n" + ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" "ldr temploadreg3, [%[b_ptr0], #0x58]\n" + ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" "ldr d24, [%[b_ptr0], #0x60]\n" + ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" "ins v20.d[1], temploadreg0\n" + ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" "ldr temploadreg0, [%[b_ptr0], #0x68]\n" + ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" "ldr d25, [%[b_ptr0], #0x70]\n" + ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" "ins v21.d[1], temploadreg1\n" + ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" "ldr temploadreg1, [%[b_ptr0], #0x78]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" + ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" "ins v22.d[1], temploadreg2\n" + ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" "ins v23.d[1], temploadreg3\n" + ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n" "ins v24.d[1], temploadreg0\n" + ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n" "ins v25.d[1], temploadreg1\n" - "b.eq 7f\n" - "8:\n" - "str q26, [%[c_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "movi v26.4s, #0\n" + ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n" + "ldr d18, [%[b_ptr0]]\n" + ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n" "ldr temploadreg2, [%[b_ptr0], #0x8]\n" + ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n" + "ldr d19, [%[b_ptr0], #0x10]\n" + ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n" "ldr temploadreg3, [%[b_ptr0], #0x18]\n" + ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n" + "ldr d20, [%[b_ptr0], #0x20]\n" + ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #0x28]\n" + ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n" + "ins v18.d[1], temploadreg2\n" + ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n" + "ins v19.d[1], temploadreg3\n" + ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n" + "ins v20.d[1], temploadreg0\n" + ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n" + "add %[b_ptr0], %[b_ptr0], #0x30\n" + ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n" + ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n" + ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n" + ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n" + ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n" + ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n" + ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n" + ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n" + ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n" + ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n" + ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n" + ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n" + ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n" + ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n" + ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n" + ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa4eb3b // udot v27.4s, 
v25.16b, v4.4b[3]\n" + ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n" + ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n" + ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n" + ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n" + ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n" + ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n" + ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n" + ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n" + ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n" + ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n" + ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n" + ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n" + ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n" + ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n" + ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n" + ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n" + ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n" + ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n" + ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n" + ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n" + ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n" + ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n" + "b.ne 8b\n" + "7:\n" + "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" + "movi v26.4s, #0\n" + "ldr q18, [%[b_ptr0]]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" - "ldr temploadreg0, [%[b_ptr0], #0x28]\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + "ldr q22, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" + "ldr q24, [%[b_ptr0], #0x60]\n" + ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" - "ldr d18, [%[b_ptr0]]\n" - ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" - ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" - "ins v18.d[1], temploadreg2\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n" - "ldr d19, [%[b_ptr0], #0x10]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n" - "ins v19.d[1], temploadreg3\n" ".inst 0x6f86ea9c // udot v28.4s, 
v20.16b, v6.4b[2]\n" ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n" ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n" ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n" - "ldr d20, [%[b_ptr0], #0x20]\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n" "add %[b_ptr0], %[b_ptr0], #0x30\n" ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n" - "ins v20.d[1], temploadreg0\n" ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #0x8]\n" ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n" - "ldr temploadreg3, [%[b_ptr0], #0x18]\n" ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #0x28]\n" ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n" - "ldr d21, [%[b_ptr0], #0x30]\n" ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x38]\n" ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n" ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n" ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n" - "ins v21.d[1], temploadreg1\n" ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x78]\n" ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n" - "ldr d22, [%[b_ptr0], #0x40]\n" ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n" ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n" ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n" ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n" ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n" ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n" - "ldr d23, [%[b_ptr0], #0x50]\n" ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n" ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n" ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n" ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n" ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n" - "ldr d24, [%[b_ptr0], #0x60]\n" ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n" ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n" ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n" ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n" ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n" ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n" - "ldr d25, [%[b_ptr0], #0x70]\n" ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n" ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n" ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n" - "ins v25.d[1], temploadreg1\n" ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n" ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n" ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n" - "ldr d18, [%[b_ptr0]]\n" ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n" ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n" ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n" - "ins v18.d[1], temploadreg2\n" - ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x48]\n" - ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n" - ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n" - "ldr d19, [%[b_ptr0], #0x10]\n" - ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n" - "ins v22.d[1], temploadreg2\n" - ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n" - ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n" - "ins v19.d[1], temploadreg3\n" - ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n" - "ldr 
temploadreg3, [%[b_ptr0], #0x58]\n" - ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n" - ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n" - "ldr d20, [%[b_ptr0], #0x20]\n" - "ins v23.d[1], temploadreg3\n" - "ins v20.d[1], temploadreg0\n" - "ldr temploadreg0, [%[b_ptr0], #0x68]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "ins v24.d[1], temploadreg0\n" - "b.ne 8b\n" - "7:\n" - "str q26, [%[c_ptr0]]\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" + ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n" + ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n" + ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n" + ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n" + ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n" + ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n" + ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n" + ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n" + ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n" + "b 9f\n" + "6:\n" "movi v26.4s, #0\n" - "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" - ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" - "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" - "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" - "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" - "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" + ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" + ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" - ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" "ldr q18, [%[b_ptr0]]\n" + ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" @@ -1350,19 +1529,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n" ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n" ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n" - "6:\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -1429,6 +1603,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q3, [a_ptr1], #0x10\n" "ldr q6, [a_ptr2], #0x10\n" @@ -1441,7 +1616,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ "ldr q10, [a_ptr3], #0x10\n" "ldr q13, [a_ptr4], #0x10\n" "ldr q16, [a_ptr5], #0x10\n" - "cbnz %[odds], 2f\n" "ldr q2, [%[a_ptr0]]\n" "ldr q5, [a_ptr1]\n" "ldr q8, [a_ptr2]\n" @@ -1450,8 +1624,21 @@ void 
a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ "ldr q17, [a_ptr5]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" + "subs %[odds], %[odds], #0x1\n" + "ldr q3, [a_ptr1], #0x10\n" + "ldr q6, [a_ptr2], #0x10\n" + "ldr q9, [a_ptr3], #0x10\n" + "ldr q12, [a_ptr4], #0x10\n" + "ldr q15, [a_ptr5], #0x10\n" + "ldr q1, [%[a_ptr0]], #0x10\n" + "ldr q4, [a_ptr1], #0x10\n" + "ldr q7, [a_ptr2], #0x10\n" + "ldr q10, [a_ptr3], #0x10\n" "ldr d2, [%[a_ptr0]], #0x8\n" + "ldr q13, [a_ptr4], #0x10\n" "ldr d5, [a_ptr1], #0x8\n" + "ldr q16, [a_ptr5], #0x10\n" "ldr d8, [a_ptr2], #0x8\n" "ldr d11, [a_ptr3], #0x8\n" "ldr d14, [a_ptr4], #0x8\n" @@ -1462,7 +1649,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ "ld1 {v11.s}[2], [a_ptr3], #4\n" "ld1 {v14.s}[2], [a_ptr4], #4\n" "ld1 {v17.s}[2], [a_ptr5], #4\n" - "subs %[odds], %[odds], #0x1\n" "b.ne 4f\n" "ld1 {v2.b}[12], [%[a_ptr0]]\n" "ld1 {v5.b}[12], [a_ptr1]\n" @@ -1489,38 +1675,40 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ "ld1 {v14.b}[14], [a_ptr4]\n" "ld1 {v17.b}[14], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q18, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q19, [%[b_ptr0], #0x10]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "ldr q20, [%[b_ptr0], #0x20]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "ldr q21, [%[b_ptr0], #0x30]\n" - "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" "ldr q22, [%[b_ptr0], #0x40]\n" - "movi v31.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" "ldr q23, [%[b_ptr0], #0x50]\n" - ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" "ldr q24, [%[b_ptr0], #0x60]\n" - ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" "ldr q25, [%[b_ptr0], #0x70]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" - "ldr q18, [%[b_ptr0]]\n" ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n" "ldr q19, [%[b_ptr0], #0x10]\n" @@ -1587,198 +1775,265 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n" ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n" ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n" - "cbz %[loops], 6f\n" - "ldr d18, [%[b_ptr0]]\n" + "b.eq 7f\n" + "8:\n" + "str q26, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" + "movi v26.4s, #0\n" + 
"ldr d18, [%[b_ptr0]]\n" "ldr temploadreg2, [%[b_ptr0], #0x8]\n" + "add %[c_ptr0], %[c_ptr0], #0x10\n" + "str q27, [c_ptr1]\n" + "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" "ldr d19, [%[b_ptr0], #0x10]\n" "ldr temploadreg3, [%[b_ptr0], #0x18]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "str q28, [c_ptr2]\n" + "add c_ptr2, c_ptr2, #0x10\n" + "movi v28.4s, #0\n" "ldr d20, [%[b_ptr0], #0x20]\n" "ldr temploadreg0, [%[b_ptr0], #0x28]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "str q29, [c_ptr3]\n" + "add c_ptr3, c_ptr3, #0x10\n" + "movi v29.4s, #0\n" "ldr d21, [%[b_ptr0], #0x30]\n" "ldr temploadreg1, [%[b_ptr0], #0x38]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "str q30, [c_ptr4]\n" + "add c_ptr4, c_ptr4, #0x10\n" + "movi v30.4s, #0\n" "ldr d22, [%[b_ptr0], #0x40]\n" "ins v18.d[1], temploadreg2\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "str q31, [c_ptr5]\n" + "add c_ptr5, c_ptr5, #0x10\n" + "movi v31.4s, #0\n" "ldr temploadreg2, [%[b_ptr0], #0x48]\n" + ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" "ldr d23, [%[b_ptr0], #0x50]\n" + ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" "ins v19.d[1], temploadreg3\n" + ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" "ldr temploadreg3, [%[b_ptr0], #0x58]\n" + ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" "ldr d24, [%[b_ptr0], #0x60]\n" + ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" "ins v20.d[1], temploadreg0\n" + ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" "ldr temploadreg0, [%[b_ptr0], #0x68]\n" + ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" "ldr d25, [%[b_ptr0], #0x70]\n" + ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" "ins v21.d[1], temploadreg1\n" + ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" "ldr temploadreg1, [%[b_ptr0], #0x78]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" + ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" "ins v22.d[1], temploadreg2\n" + ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" "ins v23.d[1], temploadreg3\n" + ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n" "ins v24.d[1], temploadreg0\n" - "b.eq 7f\n" - "8:\n" - "str q26, [%[c_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "movi v26.4s, #0\n" + ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n" "ins v25.d[1], temploadreg1\n" + ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n" + "ldr d18, [%[b_ptr0]]\n" + ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n" "ldr temploadreg2, [%[b_ptr0], #0x8]\n" + ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n" + "ldr d19, [%[b_ptr0], #0x10]\n" + ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n" + "ldr temploadreg3, [%[b_ptr0], #0x18]\n" + ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n" + "ldr d20, [%[b_ptr0], #0x20]\n" + ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n" + "ldr temploadreg0, [%[b_ptr0], #0x28]\n" + ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n" + "ldr temploadreg1, [%[b_ptr0], #0x38]\n" + ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n" + "ins v18.d[1], temploadreg2\n" + ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n" + "ins v19.d[1], temploadreg3\n" + ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n" + "ldr d21, [%[b_ptr0], #0x30]\n" + ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n" + "ins v20.d[1], temploadreg0\n" + ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + ".inst 0x6f87e2dc // udot v28.4s, 
v22.16b, v7.4b[0]\n" + "ins v21.d[1], temploadreg1\n" + ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n" + ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n" + ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n" + ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n" + ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n" + ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n" + ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n" + ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n" + ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n" + ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n" + ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n" + ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n" + ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n" + ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n" + ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n" + ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n" + ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n" + ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n" + ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n" + ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n" + ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n" + ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n" + ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n" + ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n" + ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n" + ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n" + ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n" + ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n" + ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n" + ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n" + ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n" + ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n" + ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n" + ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n" + ".inst 0x6fa2eaba // udot v26.4s, v21.16b, v2.4b[3]\n" + ".inst 0x6fa5eabb // udot v27.4s, v21.16b, v5.4b[3]\n" + ".inst 0x6fa8eabc // udot v28.4s, v21.16b, v8.4b[3]\n" + ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n" + ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n" + ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n" + "b.ne 8b\n" + "7:\n" + "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" + "movi v26.4s, #0\n" + "ldr q18, [%[b_ptr0]]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" - "ldr temploadreg3, [%[b_ptr0], #0x18]\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "ldr temploadreg0, [%[b_ptr0], #0x28]\n" - "ldr temploadreg1, [%[b_ptr0], #0x38]\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + "ldr q22, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" + "ldr q24, 
[%[b_ptr0], #0x60]\n" + ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" - "ldr d18, [%[b_ptr0]]\n" - ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" - ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" - "ins v18.d[1], temploadreg2\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n" - "ldr d19, [%[b_ptr0], #0x10]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n" - "ins v19.d[1], temploadreg3\n" ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n" ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n" ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n" ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n" - "ldr d20, [%[b_ptr0], #0x20]\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n" ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n" ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n" - "ins v20.d[1], temploadreg0\n" ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n" ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n" ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n" - "ldr d21, [%[b_ptr0], #0x30]\n" + "ldr q21, [%[b_ptr0], #0x30]\n" ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n" "add %[b_ptr0], %[b_ptr0], #0x40\n" ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n" - "ins v21.d[1], temploadreg1\n" ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n" - "ldr temploadreg2, [%[b_ptr0], #0x8]\n" ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x18]\n" ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x28]\n" ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x38]\n" ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n" - "ldr d22, [%[b_ptr0], #0x40]\n" ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n" ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n" ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n" ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n" ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n" - "ldr d23, [%[b_ptr0], #0x50]\n" ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n" ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n" ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n" ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n" ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n" - "ldr d24, [%[b_ptr0], #0x60]\n" ".inst 
0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n" ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n" ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n" ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n" ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n" ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n" - "ldr d25, [%[b_ptr0], #0x70]\n" ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n" ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n" ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n" ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n" ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n" ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n" - "ldr d18, [%[b_ptr0]]\n" ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n" ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n" ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n" - "ins v18.d[1], temploadreg2\n" ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n" - "ldr temploadreg2, [%[b_ptr0], #0x48]\n" ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n" ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n" - "ldr d19, [%[b_ptr0], #0x10]\n" ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n" - "ins v22.d[1], temploadreg2\n" ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n" ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n" - "ins v19.d[1], temploadreg3\n" ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n" - "ldr temploadreg3, [%[b_ptr0], #0x58]\n" ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n" ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n" - "ldr d20, [%[b_ptr0], #0x20]\n" ".inst 0x6fa2eaba // udot v26.4s, v21.16b, v2.4b[3]\n" - "ins v23.d[1], temploadreg3\n" ".inst 0x6fa5eabb // udot v27.4s, v21.16b, v5.4b[3]\n" ".inst 0x6fa8eabc // udot v28.4s, v21.16b, v8.4b[3]\n" - "ins v20.d[1], temploadreg0\n" ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #0x68]\n" ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n" ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n" - "ldr d21, [%[b_ptr0], #0x30]\n" - "ins v24.d[1], temploadreg0\n" - "ins v21.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #0x78]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "b.ne 8b\n" - "7:\n" - "str q26, [%[c_ptr0]]\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" + "b 9f\n" + "6:\n" "movi v26.4s, #0\n" - "ins v25.d[1], temploadreg1\n" - "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" - ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" - "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" - "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" - "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" - "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" + ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" + ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" - ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" "ldr q18, [%[b_ptr0]]\n" + ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" ".inst 0x6fa3e27b // udot v27.4s, v19.16b, 
v3.4b[1]\n" ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" @@ -1848,19 +2103,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n" ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n" ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n" - "6:\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -1927,6 +2177,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q4, [a_ptr1], #0x10\n" "ldr q8, [a_ptr2], #0x10\n" @@ -1943,18 +2194,35 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ "ldr q6, [a_ptr1], #0x10\n" "ldr q10, [a_ptr2], #0x10\n" "ldr q14, [a_ptr3], #0x10\n" - "ldr q18, [a_ptr4], #0x10\n" - "ldr q22, [a_ptr5], #0x10\n" - "cbnz %[odds], 2f\n" "ldr s3, [%[a_ptr0]]\n" + "ldr q18, [a_ptr4], #0x10\n" "ldr s7, [a_ptr1]\n" + "ldr q22, [a_ptr5], #0x10\n" "ldr s11, [a_ptr2]\n" "ldr s15, [a_ptr3]\n" "ldr s19, [a_ptr4]\n" "ldr s23, [a_ptr5]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" "subs %[odds], %[odds], #0x1\n" + "ldr q4, [a_ptr1], #0x10\n" + "ldr q8, [a_ptr2], #0x10\n" + "ldr q12, [a_ptr3], #0x10\n" + "ldr q16, [a_ptr4], #0x10\n" + "ldr q20, [a_ptr5], #0x10\n" + "ldr q1, [%[a_ptr0]], #0x10\n" + "ldr q5, [a_ptr1], #0x10\n" + "ldr q9, [a_ptr2], #0x10\n" + "ldr q13, [a_ptr3], #0x10\n" + "ldr q17, [a_ptr4], #0x10\n" + "ldr q21, [a_ptr5], #0x10\n" + "ldr q2, [%[a_ptr0]], #0x10\n" + "ldr q6, [a_ptr1], #0x10\n" + "ldr q10, [a_ptr2], #0x10\n" + "ldr q14, [a_ptr3], #0x10\n" + "ldr q18, [a_ptr4], #0x10\n" + "ldr q22, [a_ptr5], #0x10\n" "b.ne 4f\n" "ldr b3, [%[a_ptr0]]\n" "ldr b7, [a_ptr1]\n" @@ -1981,24 +2249,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ "ld1 {v19.b}[2], [a_ptr4]\n" "ld1 {v23.b}[2], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q24, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q25, [%[b_ptr0], #0x10]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" @@ -2091,57 
+2361,55 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n" ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n" ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n" - "cbz %[loops], 6f\n" - "ldr d24, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "ldr d25, [%[b_ptr0], #0x10]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - "ins v24.d[1], temploadreg0\n" - "ins v25.d[1], temploadreg1\n" "b.eq 7f\n" "8:\n" "str q26, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v26.4s, #0\n" + "ldr d24, [%[b_ptr0]]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" - ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" + "ldr d25, [%[b_ptr0], #0x10]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" "str q28, [c_ptr2]\n" - "movi v28.4s, #0\n" "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" + "movi v28.4s, #0\n" + "ins v24.d[1], temploadreg0\n" + "ins v25.d[1], temploadreg1\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "str q29, [c_ptr3]\n" - "movi v29.4s, #0\n" "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" + "movi v29.4s, #0\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" + ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" "add c_ptr5, c_ptr5, #0x10\n" - ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" + ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" "ldr d24, [%[b_ptr0]]\n" ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n" "ins v24.d[1], temploadreg0\n" ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" - ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n" "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" - ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n" + ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n" "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n" ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n" "ldr d25, [%[b_ptr0], #0x10]\n" ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n" @@ -2235,50 +2503,149 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n" "ins v24.d[1], temploadreg0\n" ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n" ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, 
v22.4b[3]\n" - "ldr d25, [%[b_ptr0], #0x10]\n" ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n" ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n" ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n" - "ins v25.d[1], temploadreg1\n" ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n" ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n" ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n" - "ldr d24, [%[b_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - "ins v24.d[1], temploadreg0\n" "b.ne 8b\n" "7:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v26.4s, #0\n" - "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" + "ldr q24, [%[b_ptr0]]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + "str q27, [c_ptr1]\n" + "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" + ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" + "str q28, [c_ptr2]\n" + "movi v28.4s, #0\n" + "add c_ptr2, c_ptr2, #0x10\n" + ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" + "str q29, [c_ptr3]\n" + "movi v29.4s, #0\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" + "str q30, [c_ptr4]\n" + "movi v30.4s, #0\n" + "add c_ptr4, c_ptr4, #0x10\n" + ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" + "str q31, [c_ptr5]\n" + "movi v31.4s, #0\n" + "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" + ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" + ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n" + ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n" + ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n" + ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n" + ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n" + ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n" + ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n" + ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n" + ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n" + ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n" + ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n" + ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n" + ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n" + ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n" + ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n" + ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n" + ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n" + ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n" + ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n" + ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n" + ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n" + "add 
%[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n" + ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n" + ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n" + ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n" + ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n" + ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n" + ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n" + ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n" + ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n" + ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n" + ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n" + ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n" + ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n" + ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n" + ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n" + ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n" + ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n" + ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n" + ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n" + ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n" + ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n" + ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" + ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n" + ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n" + ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n" + ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n" + ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n" + ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n" + ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n" + ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n" + ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n" + ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n" + ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n" + "b 9f\n" + "6:\n" + "movi v26.4s, #0\n" "movi v27.4s, #0\n" - ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" - "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" - "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" - "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" - "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" + ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" + ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" + ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" - ".inst 
0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n" ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n" ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n" @@ -2366,19 +2733,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n" ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n" ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n" - "6:\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -2445,6 +2807,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q4, [a_ptr1], #0x10\n" "ldr q8, [a_ptr2], #0x10\n" @@ -2461,24 +2824,41 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ "ldr q6, [a_ptr1], #0x10\n" "ldr q10, [a_ptr2], #0x10\n" "ldr q14, [a_ptr3], #0x10\n" - "ldr q18, [a_ptr4], #0x10\n" - "ldr q22, [a_ptr5], #0x10\n" - "cbnz %[odds], 2f\n" "ldr d3, [%[a_ptr0]]\n" + "ldr q18, [a_ptr4], #0x10\n" "ldr d7, [a_ptr1]\n" + "ldr q22, [a_ptr5], #0x10\n" "ldr d11, [a_ptr2]\n" "ldr d15, [a_ptr3]\n" "ldr d19, [a_ptr4]\n" "ldr d23, [a_ptr5]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" + "subs %[odds], %[odds], #0x1\n" + "ldr q4, [a_ptr1], #0x10\n" + "ldr q8, [a_ptr2], #0x10\n" + "ldr q12, [a_ptr3], #0x10\n" + "ldr q16, [a_ptr4], #0x10\n" + "ldr q20, [a_ptr5], #0x10\n" + "ldr q1, [%[a_ptr0]], #0x10\n" + "ldr q5, [a_ptr1], #0x10\n" + "ldr q9, [a_ptr2], #0x10\n" + "ldr q13, [a_ptr3], #0x10\n" + "ldr q17, [a_ptr4], #0x10\n" + "ldr q21, [a_ptr5], #0x10\n" + "ldr q2, [%[a_ptr0]], #0x10\n" + "ldr q6, [a_ptr1], #0x10\n" + "ldr q10, [a_ptr2], #0x10\n" + "ldr q14, [a_ptr3], #0x10\n" "ldr s3, [%[a_ptr0]], #0x4\n" + "ldr q18, [a_ptr4], #0x10\n" "ldr s7, [a_ptr1], #0x4\n" + "ldr q22, [a_ptr5], #0x10\n" "ldr s11, [a_ptr2], #0x4\n" "ldr s15, [a_ptr3], #0x4\n" "ldr s19, [a_ptr4], #0x4\n" "ldr s23, [a_ptr5], #0x4\n" - "subs %[odds], %[odds], #0x1\n" "b.ne 4f\n" "ld1 {v3.b}[4], [%[a_ptr0]]\n" "ld1 {v7.b}[4], [a_ptr1]\n" @@ -2505,24 +2885,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ "ld1 {v19.b}[6], [a_ptr4]\n" "ld1 {v23.b}[6], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q24, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q25, [%[b_ptr0], #0x10]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" ".inst 
0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" @@ -2622,68 +3004,66 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n" ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n" ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n" - "cbz %[loops], 6f\n" - "ldr d24, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "ldr d25, [%[b_ptr0], #0x10]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - "ins v24.d[1], temploadreg0\n" "b.eq 7f\n" "8:\n" "str q26, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v26.4s, #0\n" - "ins v25.d[1], temploadreg1\n" + "ldr d24, [%[b_ptr0]]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" + "ldr d25, [%[b_ptr0], #0x10]\n" "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" "str q28, [c_ptr2]\n" + "add c_ptr2, c_ptr2, #0x10\n" "movi v28.4s, #0\n" + "ins v24.d[1], temploadreg0\n" + "ins v25.d[1], temploadreg1\n" "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" - ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" "str q29, [c_ptr3]\n" + "add c_ptr3, c_ptr3, #0x10\n" "movi v29.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" + "add c_ptr4, c_ptr4, #0x10\n" + ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" "ldr d24, [%[b_ptr0]]\n" ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n" "ins v24.d[1], temploadreg0\n" ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n" "ldr d25, [%[b_ptr0], #0x10]\n" ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x6f84eb1b // udot v27.4s, 
v24.16b, v4.4b[2]\n" "ins v25.d[1], temploadreg1\n" ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n" + ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n" "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n" ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n" "ldr d24, [%[b_ptr0]]\n" ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n" @@ -2727,98 +3107,204 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n" ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n" ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n" - "ins v24.d[1], temploadreg0\n" + "ins v24.d[1], temploadreg0\n" + ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n" + ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n" + ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n" + "ldr d25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n" + "ins v25.d[1], temploadreg1\n" + ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n" + ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n" + "ldr d24, [%[b_ptr0]]\n" + ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n" + ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n" + ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n" + "ins v24.d[1], temploadreg0\n" + ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n" + ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n" + ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n" + "ldr d25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n" + "ins v25.d[1], temploadreg1\n" + ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n" + ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n" + "ldr d24, [%[b_ptr0]]\n" + ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n" + ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n" + ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n" + "ins v24.d[1], temploadreg0\n" + ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n" + ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n" + ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n" + "ldr d25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n" + "ins v25.d[1], temploadreg1\n" + ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n" + ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n" + ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n" + ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n" + ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n" + ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n" 
+ ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n" + ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n" + ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n" + ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n" + "b.ne 8b\n" + "7:\n" + "str q26, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], #0x10\n" + "movi v26.4s, #0\n" + "ldr q24, [%[b_ptr0]]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + "str q27, [c_ptr1]\n" + "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" + ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" + "str q28, [c_ptr2]\n" + "movi v28.4s, #0\n" + "add c_ptr2, c_ptr2, #0x10\n" + ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" + "str q29, [c_ptr3]\n" + "movi v29.4s, #0\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" + "str q30, [c_ptr4]\n" + "movi v30.4s, #0\n" + "add c_ptr4, c_ptr4, #0x10\n" + ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" + "str q31, [c_ptr5]\n" + "movi v31.4s, #0\n" + "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" + ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" + ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n" + ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n" + ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n" + ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n" + ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n" + ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n" + ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n" + ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n" + ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n" + ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n" + ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n" + ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n" + ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n" + ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n" + ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n" + ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n" + ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n" + ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n" + ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n" + ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n" + ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n" + ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n" + ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n" + ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n" + ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n" + "ldr q24, 
[%[b_ptr0]]\n" + ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n" + ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n" ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n" ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n" ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n" - "ldr d25, [%[b_ptr0], #0x10]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n" "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n" - "ins v25.d[1], temploadreg1\n" ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n" ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n" - "ldr d24, [%[b_ptr0]]\n" + "ldr q24, [%[b_ptr0]]\n" ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n" ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n" ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n" - "ins v24.d[1], temploadreg0\n" ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n" ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n" ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n" - "ldr d25, [%[b_ptr0], #0x10]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n" "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n" - "ins v25.d[1], temploadreg1\n" ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n" ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n" - "ldr d24, [%[b_ptr0]]\n" + "ldr q24, [%[b_ptr0]]\n" ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n" ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n" ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n" - "ins v24.d[1], temploadreg0\n" ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n" ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n" ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n" - "ldr d25, [%[b_ptr0], #0x10]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n" "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n" - "ins v25.d[1], temploadreg1\n" ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n" ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n" - "ldr d24, [%[b_ptr0]]\n" ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n" ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n" ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n" - "ins v24.d[1], temploadreg0\n" ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n" ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n" ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n" - "ldr d25, [%[b_ptr0], #0x10]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - "b.ne 8b\n" - "7:\n" - "str q26, [%[c_ptr0]]\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" + "b 9f\n" + "6:\n" "movi v26.4s, #0\n" - "ins v25.d[1], temploadreg1\n" - "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "movi 
v27.4s, #0\n" - ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" - "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" - "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" - "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" - "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" + ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" + ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" + ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" - ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n" ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n" ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n" @@ -2913,19 +3399,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n" ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n" ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n" - "6:\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -3052,24 +3533,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ "ld1 {v19.b}[10], [a_ptr4]\n" "ld1 {v23.b}[10], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q24, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q25, [%[b_ptr0], #0x10]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" @@ -3177,57 +3660,55 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n" ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n" ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n" - "cbz %[loops], 6f\n" - "ldr d24, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" 
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "ldr d25, [%[b_ptr0], #0x10]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - "ins v24.d[1], temploadreg0\n" - "ins v25.d[1], temploadreg1\n" "b.eq 7f\n" "8:\n" "str q26, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v26.4s, #0\n" + "ldr d24, [%[b_ptr0]]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" - ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" + "ldr d25, [%[b_ptr0], #0x10]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" "str q28, [c_ptr2]\n" - "movi v28.4s, #0\n" "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" + "movi v28.4s, #0\n" + "ins v24.d[1], temploadreg0\n" + "ins v25.d[1], temploadreg1\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "str q29, [c_ptr3]\n" - "movi v29.4s, #0\n" "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" + "movi v29.4s, #0\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" + ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" "add c_ptr5, c_ptr5, #0x10\n" - ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" + ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" "ldr d24, [%[b_ptr0]]\n" ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n" "ins v24.d[1], temploadreg0\n" ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" - ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n" "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" - ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n" + ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n" "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n" ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n" "ldr d25, [%[b_ptr0], #0x10]\n" ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n" @@ -3340,50 +3821,164 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n" "ins v24.d[1], temploadreg0\n" ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n" ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n" - "ldr d25, [%[b_ptr0], #0x10]\n" ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n" ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n" ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n" - "ins v25.d[1], temploadreg1\n" ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n" ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n" 
".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n" - "ldr d24, [%[b_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - "ins v24.d[1], temploadreg0\n" "b.ne 8b\n" "7:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v26.4s, #0\n" - "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" + "ldr q24, [%[b_ptr0]]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + "str q27, [c_ptr1]\n" + "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" + ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" + "str q28, [c_ptr2]\n" + "movi v28.4s, #0\n" + "add c_ptr2, c_ptr2, #0x10\n" + ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" + "str q29, [c_ptr3]\n" + "movi v29.4s, #0\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" + "str q30, [c_ptr4]\n" + "movi v30.4s, #0\n" + "add c_ptr4, c_ptr4, #0x10\n" + ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" + "str q31, [c_ptr5]\n" + "movi v31.4s, #0\n" + "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" + ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" + ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n" + ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n" + ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n" + ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n" + ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n" + ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n" + ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n" + ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n" + ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n" + ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n" + ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n" + ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n" + ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n" + ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n" + ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n" + ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n" + ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n" + ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n" + ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n" + ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n" + ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n" + ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n" + ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n" + ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n" + ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 
0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n" + ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n" + ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n" + ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n" + ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n" + ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n" + ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n" + ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n" + ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n" + ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n" + ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n" + ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n" + ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n" + ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n" + ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n" + ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n" + ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n" + ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n" + ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n" + ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n" + ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n" + ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n" + ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n" + ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n" + ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n" + ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n" + ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" + ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n" + ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n" + ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n" + ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n" + ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n" + ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n" + ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n" + ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n" + ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n" + ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n" + ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n" + "b 9f\n" + "6:\n" + "movi v26.4s, #0\n" "movi v27.4s, #0\n" - ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" - "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" - "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" - "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" 
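// The new "6:" label and the "b 9f" branch just above are the core of this
// restructuring: the zero-iteration case is now tested up front ("cbz
// %[loops], 6f" at label 3) and every path funnels into one shared store
// block at "9:", which is why the trailing "add c_ptrN, c_ptrN, #0x10"
// increments after the final stores can be deleted. A compilable C++
// sketch of the new shape (hypothetical helper names; labels mirror the
// asm, and the B-panel reloads are folded into dot_block):

#include <cstdint>

struct Acc { uint32_t v[6][4]; };                  // models v26..v31
static inline Acc  dot_block()  { return Acc{}; }  // stand-in: one K panel of udots
static inline void store_block(const Acc &) {}     // stand-in: str q26..q31

static inline void kernel_shape(long loops)        // loops == iceildiv(N, 4) - 1
{
    Acc acc{};
    if (loops == 0) {           // "cbz %[loops], 6f": lone block, label 6:,
        acc = dot_block();      //   no store/compute overlap needed
    } else {
        acc = dot_block();      // first block; "subs %[loops], ..., #0x1"
        while (loops-- > 0) {   // label 8: ("b.ne 8b"), incl. the drain at 7:
            store_block(acc);   //   store the finished block while the
            acc = dot_block();  //   next one accumulates
        }
    }
    store_block(acc);           // label 9: single shared epilogue store
}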
- ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" - "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" + ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" + ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" + ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" - ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n" ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n" ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n" @@ -3486,19 +4081,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n" ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n" ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n" - "6:\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -3566,6 +4156,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q4, [a_ptr1], #0x10\n" "ldr q8, [a_ptr2], #0x10\n" @@ -3584,7 +4175,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ "ldr q14, [a_ptr3], #0x10\n" "ldr q18, [a_ptr4], #0x10\n" "ldr q22, [a_ptr5], #0x10\n" - "cbnz %[odds], 2f\n" "ldr q3, [%[a_ptr0]]\n" "ldr q7, [a_ptr1]\n" "ldr q11, [a_ptr2]\n" @@ -3593,8 +4183,27 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ "ldr q23, [a_ptr5]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" + "subs %[odds], %[odds], #0x1\n" + "ldr q4, [a_ptr1], #0x10\n" + "ldr q8, [a_ptr2], #0x10\n" + "ldr q12, [a_ptr3], #0x10\n" + "ldr q16, [a_ptr4], #0x10\n" + "ldr q20, [a_ptr5], #0x10\n" + "ldr q1, [%[a_ptr0]], #0x10\n" + "ldr q5, [a_ptr1], #0x10\n" + "ldr q9, [a_ptr2], #0x10\n" + "ldr q13, [a_ptr3], #0x10\n" + "ldr q17, [a_ptr4], #0x10\n" + "ldr q21, [a_ptr5], #0x10\n" + "ldr q2, [%[a_ptr0]], #0x10\n" + "ldr q6, [a_ptr1], #0x10\n" + "ldr q10, [a_ptr2], #0x10\n" + "ldr q14, [a_ptr3], #0x10\n" "ldr d3, [%[a_ptr0]], #0x8\n" + "ldr q18, [a_ptr4], #0x10\n" "ldr d7, [a_ptr1], #0x8\n" + "ldr q22, [a_ptr5], #0x10\n" "ldr d11, [a_ptr2], #0x8\n" "ldr d15, [a_ptr3], #0x8\n" "ldr d19, [a_ptr4], #0x8\n" @@ -3605,7 +4214,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ "ld1 {v15.s}[2], [a_ptr3], #4\n" "ld1 {v19.s}[2], [a_ptr4], #4\n" "ld1 {v23.s}[2], [a_ptr5], #4\n" - "subs %[odds], %[odds], #0x1\n" "b.ne 4f\n" "ld1 {v3.b}[12], [%[a_ptr0]]\n" "ld1 {v7.b}[12], [a_ptr1]\n" @@ -3632,24 +4240,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ "ld1 {v19.b}[14], [a_ptr4]\n" "ld1 {v23.b}[14], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q24, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q25, [%[b_ptr0], #0x10]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" 
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" @@ -3764,68 +4374,66 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n" ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n" ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n" - "cbz %[loops], 6f\n" - "ldr d24, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "ldr d25, [%[b_ptr0], #0x10]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - "ins v24.d[1], temploadreg0\n" "b.eq 7f\n" "8:\n" "str q26, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v26.4s, #0\n" - "ins v25.d[1], temploadreg1\n" + "ldr d24, [%[b_ptr0]]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" + "ldr d25, [%[b_ptr0], #0x10]\n" "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" "str q28, [c_ptr2]\n" + "add c_ptr2, c_ptr2, #0x10\n" "movi v28.4s, #0\n" + "ins v24.d[1], temploadreg0\n" + "ins v25.d[1], temploadreg1\n" "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" - ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" "str q29, [c_ptr3]\n" + "add c_ptr3, c_ptr3, #0x10\n" "movi v29.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" + "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" + "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" + "add c_ptr4, c_ptr4, #0x10\n" + ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" "ldr d24, [%[b_ptr0]]\n" ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n" "ins v24.d[1], temploadreg0\n" ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n" - "prfm PSTL1KEEP, 
[c_ptr2, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n" "ldr d25, [%[b_ptr0], #0x10]\n" ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" "ins v25.d[1], temploadreg1\n" ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n" + ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n" "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n" ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n" "ldr d24, [%[b_ptr0]]\n" ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n" @@ -3936,27 +4544,23 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n" "ins v25.d[1], temploadreg1\n" ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n" ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n" - "ldr d24, [%[b_ptr0]]\n" ".inst 0x6fa3eb3a // udot v26.4s, v25.16b, v3.4b[3]\n" ".inst 0x6fa7eb3b // udot v27.4s, v25.16b, v7.4b[3]\n" ".inst 0x6fabeb3c // udot v28.4s, v25.16b, v11.4b[3]\n" - "ins v24.d[1], temploadreg0\n" ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n" ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n" ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n" - "ldr d25, [%[b_ptr0], #0x10]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" "b.ne 8b\n" "7:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v26.4s, #0\n" - "ins v25.d[1], temploadreg1\n" + "ldr q24, [%[b_ptr0]]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" @@ -4089,19 +4693,139 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n" ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n" ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n" + "b 9f\n" "6:\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" + ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" + ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" + ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" + ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" + ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" + ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n" + ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n" + ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n" + ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n" + ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n" + "ldr q25, 
[%[b_ptr0], #0x10]\n" + ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n" + ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n" + ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n" + ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n" + ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n" + ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n" + ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n" + ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n" + ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n" + ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n" + ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n" + ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n" + ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n" + ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n" + ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n" + ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n" + ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n" + ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n" + ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n" + ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n" + ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n" + ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n" + ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n" + ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n" + ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n" + ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n" + ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n" + ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n" + ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n" + ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n" + ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n" + ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n" + ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n" + ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n" + ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n" + ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n" + ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n" + ".inst 0x6f96eb1f // udot v31.4s, v24.16b, 
v22.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n" + ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n" + ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n" + ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n" + ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n" + ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n" + ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n" + ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n" + ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n" + ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n" + ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n" + ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n" + ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n" + ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n" + ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n" + ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n" + ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n" + ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n" + ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n" + ".inst 0x6fa3eb3a // udot v26.4s, v25.16b, v3.4b[3]\n" + ".inst 0x6fa7eb3b // udot v27.4s, v25.16b, v7.4b[3]\n" + ".inst 0x6fabeb3c // udot v28.4s, v25.16b, v11.4b[3]\n" + ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n" + ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n" + ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp similarity index 80% rename from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/generic.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp index fe69f744e2..10bd16aa59 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -33,7 +33,7 @@ namespace arm_gemm { -void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) { +void a64_smallK_hybrid_u8u32_dot_6x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) { const long loops_count = iceildiv(N, (int)4) - 1; const long ldab = lda * sizeof(uint8_t); const long ldcb = ldc * sizeof(uint32_t); @@ -93,6 +93,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q3, [a_ptr1], #0x10\n" "ldr q6, [a_ptr2], #0x10\n" @@ -103,18 +104,29 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "ldr q4, [a_ptr1], #0x10\n" "ldr q7, [a_ptr2], #0x10\n" "ldr q10, [a_ptr3], #0x10\n" - "ldr q13, [a_ptr4], #0x10\n" - "ldr q16, [a_ptr5], #0x10\n" - "cbnz %[odds], 2f\n" "ldr s2, [%[a_ptr0]]\n" + "ldr q13, [a_ptr4], #0x10\n" "ldr s5, [a_ptr1]\n" + "ldr q16, [a_ptr5], #0x10\n" "ldr s8, [a_ptr2]\n" "ldr s11, [a_ptr3]\n" "ldr s14, [a_ptr4]\n" "ldr s17, [a_ptr5]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" "subs %[odds], %[odds], #0x1\n" + "ldr q3, [a_ptr1], #0x10\n" + "ldr q6, [a_ptr2], #0x10\n" + "ldr q9, [a_ptr3], #0x10\n" + "ldr q12, [a_ptr4], #0x10\n" + "ldr q15, [a_ptr5], #0x10\n" + "ldr q1, [%[a_ptr0]], #0x10\n" + "ldr q4, [a_ptr1], #0x10\n" + "ldr q7, [a_ptr2], #0x10\n" + "ldr q10, [a_ptr3], #0x10\n" + "ldr q13, [a_ptr4], #0x10\n" + "ldr q16, [a_ptr5], #0x10\n" "b.ne 4f\n" "ldr b2, [%[a_ptr0]]\n" "ldr b5, [a_ptr1]\n" @@ -141,40 +153,42 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "ld1 {v14.b}[2], [a_ptr4]\n" "ld1 {v17.b}[2], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q18, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q19, [%[b_ptr0], #0x10]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "ldr q20, [%[b_ptr0], #0x20]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "ldr q21, [%[b_ptr0], #0x30]\n" - "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" "ldr q22, [%[b_ptr0], #0x40]\n" - "movi v31.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" "ldr q23, [%[b_ptr0], #0x50]\n" - ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" "ldr q24, [%[b_ptr0], #0x60]\n" - ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" "ldr q25, [%[b_ptr0], #0x70]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" ".inst 0x6fa3e27b // udot 
v27.4s, v19.16b, v3.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" - "ldr q18, [%[b_ptr0]]\n" ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n" ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n" ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n" @@ -218,139 +232,201 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n" ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n" ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n" - "cbz %[loops], 6f\n" - "ldr q18, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr q19, [%[b_ptr0], #0x10]\n" - "ldr q20, [%[b_ptr0], #0x20]\n" - "ldr q21, [%[b_ptr0], #0x30]\n" - "ldr q22, [%[b_ptr0], #0x40]\n" - "ldr q23, [%[b_ptr0], #0x50]\n" - "ldr q24, [%[b_ptr0], #0x60]\n" - "ldr q25, [%[b_ptr0], #0x70]\n" "b.eq 7f\n" "8:\n" "str q26, [%[c_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "movi v26.4s, #0\n" "subs %[loops], %[loops], #0x1\n" - "str q27, [c_ptr1]\n" + "movi v26.4s, #0\n" + "ldr q18, [%[b_ptr0]]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v27.4s, #0\n" + "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + "ldr q22, [%[b_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" + "ldr q24, [%[b_ptr0], #0x60]\n" + ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" - "ldr q18, [%[b_ptr0]]\n" - ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" - ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n" - "ldr q19, [%[b_ptr0], #0x10]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x6f86ea9c // udot v28.4s, 
v20.16b, v6.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n" ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n" - "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n" ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n" ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n" ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n" ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n" ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n" - "ldr q21, [%[b_ptr0], #0x30]\n" ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n" ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n" ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n" ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n" ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n" ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n" - "ldr q22, [%[b_ptr0], #0x40]\n" ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n" ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n" ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n" ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n" ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n" ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n" - "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n" ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n" ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n" ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n" ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n" - "ldr q24, [%[b_ptr0], #0x60]\n" ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n" ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n" ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n" ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n" ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n" ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n" - "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n" ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n" ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n" ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n" ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n" ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n" - "ldr q18, [%[b_ptr0]]\n" "b.ne 8b\n" "7:\n" "str q26, [%[c_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "movi v26.4s, #0\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" + "movi v26.4s, #0\n" + "ldr q18, [%[b_ptr0]]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + "ldr q22, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" + "ldr q24, [%[b_ptr0], #0x60]\n" + ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x6fa3e27b // udot 
v27.4s, v19.16b, v3.4b[1]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" + "add c_ptr4, c_ptr4, #0x10\n" + ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" + "ldr q18, [%[b_ptr0]]\n" + ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" + ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" + ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n" + ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n" + ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n" + ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n" + ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n" + ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n" + ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n" + ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n" + ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n" + ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n" + ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n" + ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n" + ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n" + ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n" + ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n" + ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n" + ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n" + ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n" + ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n" + ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n" + ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n" + ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n" + ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n" + ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n" + ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n" + ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n" + ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n" + ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n" + ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n" + ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n" + ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n" + ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n" + ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n" + ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n" + ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n" + ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n" + ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n" + ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n" + ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n" + ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n" + "b 9f\n" + "6:\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" + ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" + ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" - ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" "ldr q18, 
[%[b_ptr0]]\n" - ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" + ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" "add %[b_ptr0], %[b_ptr0], #0x10\n" + ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" @@ -397,19 +473,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n" ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n" ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n" - "6:\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -468,6 +539,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q3, [a_ptr1], #0x10\n" "ldr q6, [a_ptr2], #0x10\n" @@ -478,24 +550,35 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "ldr q4, [a_ptr1], #0x10\n" "ldr q7, [a_ptr2], #0x10\n" "ldr q10, [a_ptr3], #0x10\n" - "ldr q13, [a_ptr4], #0x10\n" - "ldr q16, [a_ptr5], #0x10\n" - "cbnz %[odds], 2f\n" "ldr d2, [%[a_ptr0]]\n" + "ldr q13, [a_ptr4], #0x10\n" "ldr d5, [a_ptr1]\n" + "ldr q16, [a_ptr5], #0x10\n" "ldr d8, [a_ptr2]\n" "ldr d11, [a_ptr3]\n" "ldr d14, [a_ptr4]\n" "ldr d17, [a_ptr5]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" + "subs %[odds], %[odds], #0x1\n" + "ldr q3, [a_ptr1], #0x10\n" + "ldr q6, [a_ptr2], #0x10\n" + "ldr q9, [a_ptr3], #0x10\n" + "ldr q12, [a_ptr4], #0x10\n" + "ldr q15, [a_ptr5], #0x10\n" + "ldr q1, [%[a_ptr0]], #0x10\n" + "ldr q4, [a_ptr1], #0x10\n" + "ldr q7, [a_ptr2], #0x10\n" + "ldr q10, [a_ptr3], #0x10\n" "ldr s2, [%[a_ptr0]], #0x4\n" + "ldr q13, [a_ptr4], #0x10\n" "ldr s5, [a_ptr1], #0x4\n" + "ldr q16, [a_ptr5], #0x10\n" "ldr s8, [a_ptr2], #0x4\n" "ldr s11, [a_ptr3], #0x4\n" "ldr s14, [a_ptr4], #0x4\n" "ldr s17, [a_ptr5], #0x4\n" - "subs %[odds], %[odds], #0x1\n" "b.ne 4f\n" "ld1 {v2.b}[4], [%[a_ptr0]]\n" "ld1 {v5.b}[4], [a_ptr1]\n" @@ -522,38 +605,40 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "ld1 {v14.b}[6], [a_ptr4]\n" "ld1 {v17.b}[6], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q18, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q19, [%[b_ptr0], #0x10]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "ldr q20, [%[b_ptr0], #0x20]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "ldr q21, [%[b_ptr0], #0x30]\n" - "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" "ldr q22, [%[b_ptr0], #0x40]\n" - "movi v31.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" "ldr q23, [%[b_ptr0], #0x50]\n" - ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" "ldr q24, [%[b_ptr0], #0x60]\n" - ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" "ldr q25, [%[b_ptr0], #0x70]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" + "movi 
v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" - "ldr q18, [%[b_ptr0]]\n" ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n" "ldr q19, [%[b_ptr0], #0x10]\n" @@ -606,144 +691,213 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n" ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n" ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n" - "cbz %[loops], 6f\n" - "ldr q18, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr q19, [%[b_ptr0], #0x10]\n" - "ldr q20, [%[b_ptr0], #0x20]\n" - "ldr q21, [%[b_ptr0], #0x30]\n" - "ldr q22, [%[b_ptr0], #0x40]\n" - "ldr q23, [%[b_ptr0], #0x50]\n" - "ldr q24, [%[b_ptr0], #0x60]\n" - "ldr q25, [%[b_ptr0], #0x70]\n" "b.eq 7f\n" "8:\n" "str q26, [%[c_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "movi v26.4s, #0\n" "subs %[loops], %[loops], #0x1\n" - "str q27, [c_ptr1]\n" + "movi v26.4s, #0\n" + "ldr q18, [%[b_ptr0]]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v27.4s, #0\n" + "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + "ldr q22, [%[b_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" + "ldr q24, [%[b_ptr0], #0x60]\n" + ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" - "ldr q18, [%[b_ptr0]]\n" - ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" - ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x6fa9e27d // udot v29.4s, v19.16b, 
v9.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n" "ldr q19, [%[b_ptr0], #0x10]\n" ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n" "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n" - "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n" ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n" ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n" ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n" ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n" ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n" - "ldr q21, [%[b_ptr0], #0x30]\n" ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n" ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n" ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n" ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n" ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n" ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n" - "ldr q22, [%[b_ptr0], #0x40]\n" ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n" ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n" ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n" ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n" ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n" ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n" - "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n" ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n" ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n" ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n" ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n" - "ldr q24, [%[b_ptr0], #0x60]\n" ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n" ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n" ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n" ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n" ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n" ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n" - "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n" ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n" ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n" ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n" ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n" ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n" - "ldr q18, [%[b_ptr0]]\n" ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n" ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n" ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n" ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n" ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n" ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n" - "ldr q19, [%[b_ptr0], #0x10]\n" "b.ne 8b\n" "7:\n" "str q26, [%[c_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "movi v26.4s, #0\n" "add %[c_ptr0], 
%[c_ptr0], #0x10\n" + "movi v26.4s, #0\n" + "ldr q18, [%[b_ptr0]]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + "ldr q22, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" + "ldr q24, [%[b_ptr0], #0x60]\n" + ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" + "add c_ptr4, c_ptr4, #0x10\n" + ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" + "ldr q18, [%[b_ptr0]]\n" + ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" + ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" + ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" + ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n" + ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n" + ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n" + ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n" + ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n" + ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n" + ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n" + ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n" + ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n" + ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n" + ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n" + ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n" + ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n" + ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n" + ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n" + ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n" + ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n" + ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n" + ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n" + ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n" + ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n" + ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n" + ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n" + ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n" + ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n" + ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n" + ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n" + ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n" + ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n" + ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n" + ".inst 0x6fadeb3e // udot v30.4s, v25.16b, 
v13.4b[3]\n" + ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n" + ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n" + ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n" + ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n" + ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n" + ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n" + ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n" + ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n" + ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n" + ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n" + ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n" + ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n" + ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n" + "b 9f\n" + "6:\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" + ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" + ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" - ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" "ldr q18, [%[b_ptr0]]\n" + ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" @@ -799,19 +953,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n" ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n" ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n" - "6:\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -924,38 +1073,40 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "ld1 {v14.b}[10], [a_ptr4]\n" "ld1 {v17.b}[10], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q18, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q19, [%[b_ptr0], #0x10]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "ldr q20, [%[b_ptr0], #0x20]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "ldr q21, [%[b_ptr0], #0x30]\n" - "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" "ldr q22, [%[b_ptr0], #0x40]\n" - "movi v31.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" "ldr q23, [%[b_ptr0], #0x50]\n" - ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" "ldr q24, [%[b_ptr0], #0x60]\n" - ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" "ldr q25, [%[b_ptr0], #0x70]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, 
#0x40]\n" ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" - "ldr q18, [%[b_ptr0]]\n" ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n" "ldr q19, [%[b_ptr0], #0x10]\n" @@ -1015,62 +1166,60 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n" ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n" ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n" - "cbz %[loops], 6f\n" - "ldr q18, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr q19, [%[b_ptr0], #0x10]\n" - "ldr q20, [%[b_ptr0], #0x20]\n" - "ldr q21, [%[b_ptr0], #0x30]\n" - "ldr q22, [%[b_ptr0], #0x40]\n" - "ldr q23, [%[b_ptr0], #0x50]\n" - "ldr q24, [%[b_ptr0], #0x60]\n" - "ldr q25, [%[b_ptr0], #0x70]\n" "b.eq 7f\n" "8:\n" "str q26, [%[c_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "movi v26.4s, #0\n" "subs %[loops], %[loops], #0x1\n" - "str q27, [c_ptr1]\n" + "movi v26.4s, #0\n" + "ldr q18, [%[b_ptr0]]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v27.4s, #0\n" + "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + "ldr q22, [%[b_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" + "ldr q24, [%[b_ptr0], #0x60]\n" + ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" - "ldr q18, [%[b_ptr0]]\n" - ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" - ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n" "ldr 
q19, [%[b_ptr0], #0x10]\n" ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n" ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n" "ldr q20, [%[b_ptr0], #0x20]\n" @@ -1081,85 +1230,163 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n" ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n" ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n" - "ldr q21, [%[b_ptr0], #0x30]\n" ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n" ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n" ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n" ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n" ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n" ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n" - "ldr q22, [%[b_ptr0], #0x40]\n" ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n" ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n" ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n" ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n" ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n" ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n" - "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n" ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n" ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n" ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n" ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n" - "ldr q24, [%[b_ptr0], #0x60]\n" ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n" ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n" ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n" ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n" ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n" ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n" - "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n" ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n" ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n" ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n" ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n" ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n" - "ldr q18, [%[b_ptr0]]\n" ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n" ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n" ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n" ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n" ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n" ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n" - "ldr q19, [%[b_ptr0], #0x10]\n" ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n" ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n" ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n" ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n" ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n" ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n" - "ldr q20, [%[b_ptr0], #0x20]\n" "b.ne 8b\n" "7:\n" "str q26, [%[c_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "movi v26.4s, #0\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" + 
"movi v26.4s, #0\n" + "ldr q18, [%[b_ptr0]]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + "ldr q22, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" + "ldr q24, [%[b_ptr0], #0x60]\n" + ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" + ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" + "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" + "ldr q18, [%[b_ptr0]]\n" + ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" + ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" + ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" + ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n" + ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n" + ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n" + ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n" + ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n" + ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n" + "ldr q20, [%[b_ptr0], #0x20]\n" + ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n" + "add %[b_ptr0], %[b_ptr0], #0x30\n" + ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n" + ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n" + ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n" + ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n" + ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n" + ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n" + ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n" + ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n" + ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n" + ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n" + ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n" + ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n" + ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n" + ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n" + ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n" + ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n" + ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n" + ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n" + ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n" + ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n" + ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n" + ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n" + ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n" + ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n" + 
".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n" + ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n" + ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n" + ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n" + ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n" + ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n" + ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n" + ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n" + ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n" + ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n" + ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n" + ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n" + ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n" + ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n" + ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n" + ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n" + ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n" + ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n" + ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n" + ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n" + "b 9f\n" + "6:\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" + ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" + ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" - ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" "ldr q18, [%[b_ptr0]]\n" + ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" @@ -1222,19 +1449,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n" ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n" ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n" - "6:\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -1293,6 +1515,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q3, [a_ptr1], #0x10\n" "ldr q6, [a_ptr2], #0x10\n" @@ -1305,7 +1528,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "ldr q10, [a_ptr3], #0x10\n" "ldr q13, [a_ptr4], #0x10\n" "ldr q16, [a_ptr5], #0x10\n" - "cbnz %[odds], 2f\n" "ldr q2, [%[a_ptr0]]\n" "ldr q5, [a_ptr1]\n" "ldr q8, [a_ptr2]\n" @@ -1314,8 +1536,21 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "ldr q17, [a_ptr5]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" + "subs %[odds], %[odds], #0x1\n" + "ldr q3, [a_ptr1], #0x10\n" + "ldr q6, [a_ptr2], #0x10\n" + "ldr q9, [a_ptr3], #0x10\n" + "ldr q12, [a_ptr4], #0x10\n" + "ldr q15, [a_ptr5], #0x10\n" + "ldr 
q1, [%[a_ptr0]], #0x10\n" + "ldr q4, [a_ptr1], #0x10\n" + "ldr q7, [a_ptr2], #0x10\n" + "ldr q10, [a_ptr3], #0x10\n" "ldr d2, [%[a_ptr0]], #0x8\n" + "ldr q13, [a_ptr4], #0x10\n" "ldr d5, [a_ptr1], #0x8\n" + "ldr q16, [a_ptr5], #0x10\n" "ldr d8, [a_ptr2], #0x8\n" "ldr d11, [a_ptr3], #0x8\n" "ldr d14, [a_ptr4], #0x8\n" @@ -1326,7 +1561,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "ld1 {v11.s}[2], [a_ptr3], #4\n" "ld1 {v14.s}[2], [a_ptr4], #4\n" "ld1 {v17.s}[2], [a_ptr5], #4\n" - "subs %[odds], %[odds], #0x1\n" "b.ne 4f\n" "ld1 {v2.b}[12], [%[a_ptr0]]\n" "ld1 {v5.b}[12], [a_ptr1]\n" @@ -1353,38 +1587,40 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "ld1 {v14.b}[14], [a_ptr4]\n" "ld1 {v17.b}[14], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q18, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q19, [%[b_ptr0], #0x10]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "ldr q20, [%[b_ptr0], #0x20]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "ldr q21, [%[b_ptr0], #0x30]\n" - "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" "ldr q22, [%[b_ptr0], #0x40]\n" - "movi v31.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" "ldr q23, [%[b_ptr0], #0x50]\n" - ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" "ldr q24, [%[b_ptr0], #0x60]\n" - ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" "ldr q25, [%[b_ptr0], #0x70]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" - "ldr q18, [%[b_ptr0]]\n" ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n" "ldr q19, [%[b_ptr0], #0x10]\n" @@ -1451,62 +1687,60 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n" ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n" ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n" - "cbz %[loops], 6f\n" - "ldr q18, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr q19, [%[b_ptr0], #0x10]\n" - "ldr q20, [%[b_ptr0], #0x20]\n" - "ldr q21, [%[b_ptr0], #0x30]\n" - "ldr q22, [%[b_ptr0], #0x40]\n" - "ldr q23, [%[b_ptr0], #0x50]\n" - "ldr q24, [%[b_ptr0], #0x60]\n" - "ldr q25, [%[b_ptr0], #0x70]\n" "b.eq 7f\n" "8:\n" "str q26, [%[c_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "movi v26.4s, #0\n" "subs %[loops], %[loops], #0x1\n" - "str q27, [c_ptr1]\n" + "movi v26.4s, 
#0\n" + "ldr q18, [%[b_ptr0]]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v27.4s, #0\n" + "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + "ldr q22, [%[b_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" + "ldr q24, [%[b_ptr0], #0x60]\n" + ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" - "ldr q18, [%[b_ptr0]]\n" - ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" - ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n" "ldr q19, [%[b_ptr0], #0x10]\n" ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n" ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n" "ldr q20, [%[b_ptr0], #0x20]\n" @@ -1524,87 +1758,87 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n" ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n" ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n" - "ldr q22, [%[b_ptr0], #0x40]\n" ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n" ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n" ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n" ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n" ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n" ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n" - "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n" ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n" ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n" ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n" ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n" - "ldr q24, [%[b_ptr0], #0x60]\n" ".inst 0x6fa1eb3a // 
udot v26.4s, v25.16b, v1.4b[3]\n" ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n" ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n" ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n" ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n" ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n" - "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n" ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n" ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n" ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n" ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n" ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n" - "ldr q18, [%[b_ptr0]]\n" ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n" ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n" ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n" ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n" ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n" ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n" - "ldr q19, [%[b_ptr0], #0x10]\n" ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n" ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n" ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n" ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n" ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n" ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n" - "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x6fa2eaba // udot v26.4s, v21.16b, v2.4b[3]\n" ".inst 0x6fa5eabb // udot v27.4s, v21.16b, v5.4b[3]\n" ".inst 0x6fa8eabc // udot v28.4s, v21.16b, v8.4b[3]\n" ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n" ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n" ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n" - "ldr q21, [%[b_ptr0], #0x30]\n" "b.ne 8b\n" "7:\n" "str q26, [%[c_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "movi v26.4s, #0\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" + "movi v26.4s, #0\n" + "ldr q18, [%[b_ptr0]]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" + "ldr q20, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + "ldr q22, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q23, [%[b_ptr0], #0x50]\n" ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" + "ldr q24, [%[b_ptr0], #0x60]\n" + ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "ldr q25, [%[b_ptr0], #0x70]\n" ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" - ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" - "ldr q18, [%[b_ptr0]]\n" - ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" + "ldr q18, [%[b_ptr0]]\n" ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" ".inst 0x6fafe27f // udot v31.4s, 
v19.16b, v15.4b[1]\n" @@ -1672,47 +1906,127 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n" ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n" ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n" + "b 9f\n" "6:\n" - "str q26, [%[c_ptr0]]\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" - "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" - "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" - "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" - "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" - "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [odds] "+r" (odds) - : [lda] "r" (ldab), [ldc] "r" (ldcb) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" - ); - break; - case 13: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "c_ptr1 .req X5\n" - "c_ptr2 .req X6\n" - "c_ptr3 .req X7\n" - "c_ptr4 .req X8\n" - "c_ptr5 .req X9\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n" + ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n" + ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n" + ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n" + ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n" + ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n" + "ldr q18, [%[b_ptr0]]\n" + ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n" + ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n" + ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n" + ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n" + ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n" + ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n" + "ldr q19, [%[b_ptr0], #0x10]\n" + ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n" + ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n" + ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n" + ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n" + ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n" + ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n" + "ldr q20, [%[b_ptr0], #0x20]\n" + ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n" + ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n" + ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n" + ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n" + ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n" + ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n" + "ldr q21, [%[b_ptr0], #0x30]\n" + ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n" + ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n" + ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n" + ".inst 0x6f8de2de // 
udot v30.4s, v22.16b, v13.4b[0]\n" + ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n" + ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n" + ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n" + ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n" + ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n" + ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n" + ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n" + ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n" + ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n" + ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n" + ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n" + ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n" + ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n" + ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n" + ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n" + ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n" + ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n" + ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n" + ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n" + ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n" + ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n" + ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n" + ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n" + ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n" + ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n" + ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n" + ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n" + ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n" + ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n" + ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n" + ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n" + ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n" + ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n" + ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n" + ".inst 0x6fa2eaba // udot v26.4s, v21.16b, v2.4b[3]\n" + ".inst 0x6fa5eabb // udot v27.4s, v21.16b, v5.4b[3]\n" + ".inst 0x6fa8eabc // udot v28.4s, v21.16b, v8.4b[3]\n" + ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n" + ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n" + ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n" + "9:\n" + "str q26, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], #0x10\n" + "str q27, [c_ptr1]\n" + "str q28, [c_ptr2]\n" + "str q29, [c_ptr3]\n" + "str q30, [c_ptr4]\n" + "str q31, [c_ptr5]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [odds] "+r" (odds) + : [lda] "r" (ldab), [ldc] "r" (ldcb) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" + ); + break; + case 13: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "c_ptr1 .req X5\n" + 
"c_ptr2 .req X6\n" + "c_ptr3 .req X7\n" + "c_ptr4 .req X8\n" + "c_ptr5 .req X9\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" "add c_ptr1, %[c_ptr0], %[ldc]\n" "add a_ptr2, a_ptr1, %[lda]\n" "add c_ptr2, c_ptr1, %[ldc]\n" @@ -1743,6 +2057,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q4, [a_ptr1], #0x10\n" "ldr q8, [a_ptr2], #0x10\n" @@ -1759,18 +2074,35 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "ldr q6, [a_ptr1], #0x10\n" "ldr q10, [a_ptr2], #0x10\n" "ldr q14, [a_ptr3], #0x10\n" - "ldr q18, [a_ptr4], #0x10\n" - "ldr q22, [a_ptr5], #0x10\n" - "cbnz %[odds], 2f\n" "ldr s3, [%[a_ptr0]]\n" + "ldr q18, [a_ptr4], #0x10\n" "ldr s7, [a_ptr1]\n" + "ldr q22, [a_ptr5], #0x10\n" "ldr s11, [a_ptr2]\n" "ldr s15, [a_ptr3]\n" "ldr s19, [a_ptr4]\n" "ldr s23, [a_ptr5]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" "subs %[odds], %[odds], #0x1\n" + "ldr q4, [a_ptr1], #0x10\n" + "ldr q8, [a_ptr2], #0x10\n" + "ldr q12, [a_ptr3], #0x10\n" + "ldr q16, [a_ptr4], #0x10\n" + "ldr q20, [a_ptr5], #0x10\n" + "ldr q1, [%[a_ptr0]], #0x10\n" + "ldr q5, [a_ptr1], #0x10\n" + "ldr q9, [a_ptr2], #0x10\n" + "ldr q13, [a_ptr3], #0x10\n" + "ldr q17, [a_ptr4], #0x10\n" + "ldr q21, [a_ptr5], #0x10\n" + "ldr q2, [%[a_ptr0]], #0x10\n" + "ldr q6, [a_ptr1], #0x10\n" + "ldr q10, [a_ptr2], #0x10\n" + "ldr q14, [a_ptr3], #0x10\n" + "ldr q18, [a_ptr4], #0x10\n" + "ldr q22, [a_ptr5], #0x10\n" "b.ne 4f\n" "ldr b3, [%[a_ptr0]]\n" "ldr b7, [a_ptr1]\n" @@ -1797,24 +2129,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "ld1 {v19.b}[2], [a_ptr4]\n" "ld1 {v23.b}[2], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q24, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q25, [%[b_ptr0], #0x10]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" @@ -1907,38 +2241,36 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n" ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n" ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n" - "cbz %[loops], 6f\n" - "ldr q24, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr q25, [%[b_ptr0], #0x10]\n" "b.eq 7f\n" "8:\n" "str q26, [%[c_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - "movi v26.4s, #0\n" "subs %[loops], %[loops], #0x1\n" - "str q27, [c_ptr1]\n" + "movi v26.4s, #0\n" + "ldr q24, 
[%[b_ptr0]]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v27.4s, #0\n" + "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "add c_ptr3, c_ptr3, #0x10\n" ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" "ldr q24, [%[b_ptr0]]\n" ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" @@ -2028,20 +2360,20 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n" ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n" ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n" - "ldr q25, [%[b_ptr0], #0x10]\n" ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n" ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n" ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n" ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n" ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n" ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n" - "ldr q24, [%[b_ptr0]]\n" "b.ne 8b\n" "7:\n" "str q26, [%[c_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - "movi v26.4s, #0\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" + "movi v26.4s, #0\n" + "ldr q24, [%[b_ptr0]]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" @@ -2152,19 +2484,117 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n" ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n" ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n" + "b 9f\n" "6:\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" + ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" + ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" + ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" + ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" + ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" + ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n" + ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n" + ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n" + ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n" + ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n" + 
".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n" + ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n" + ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n" + ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n" + ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n" + ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n" + ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n" + ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n" + ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n" + ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n" + ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n" + ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n" + ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n" + ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n" + ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n" + ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n" + ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n" + ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n" + ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n" + ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n" + ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n" + ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n" + ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n" + ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n" + ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n" + ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n" + ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n" + ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n" + ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n" + ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n" + ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n" + ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n" + ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n" + ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n" + ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n" + ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n" + ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" + ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n" + ".inst 0x6faaeb3c // udot v28.4s, v25.16b, 
v10.4b[3]\n" + ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n" + ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n" + ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n" + ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n" + ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n" + ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n" + ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n" + ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n" + ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -2223,6 +2653,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q4, [a_ptr1], #0x10\n" "ldr q8, [a_ptr2], #0x10\n" @@ -2239,24 +2670,41 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "ldr q6, [a_ptr1], #0x10\n" "ldr q10, [a_ptr2], #0x10\n" "ldr q14, [a_ptr3], #0x10\n" - "ldr q18, [a_ptr4], #0x10\n" - "ldr q22, [a_ptr5], #0x10\n" - "cbnz %[odds], 2f\n" "ldr d3, [%[a_ptr0]]\n" + "ldr q18, [a_ptr4], #0x10\n" "ldr d7, [a_ptr1]\n" + "ldr q22, [a_ptr5], #0x10\n" "ldr d11, [a_ptr2]\n" "ldr d15, [a_ptr3]\n" "ldr d19, [a_ptr4]\n" "ldr d23, [a_ptr5]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" + "subs %[odds], %[odds], #0x1\n" + "ldr q4, [a_ptr1], #0x10\n" + "ldr q8, [a_ptr2], #0x10\n" + "ldr q12, [a_ptr3], #0x10\n" + "ldr q16, [a_ptr4], #0x10\n" + "ldr q20, [a_ptr5], #0x10\n" + "ldr q1, [%[a_ptr0]], #0x10\n" + "ldr q5, [a_ptr1], #0x10\n" + "ldr q9, [a_ptr2], #0x10\n" + "ldr q13, [a_ptr3], #0x10\n" + "ldr q17, [a_ptr4], #0x10\n" + "ldr q21, [a_ptr5], #0x10\n" + "ldr q2, [%[a_ptr0]], #0x10\n" + "ldr q6, [a_ptr1], #0x10\n" + "ldr q10, [a_ptr2], #0x10\n" + "ldr q14, [a_ptr3], #0x10\n" "ldr s3, [%[a_ptr0]], #0x4\n" + "ldr q18, [a_ptr4], #0x10\n" "ldr s7, [a_ptr1], #0x4\n" + "ldr q22, [a_ptr5], #0x10\n" "ldr s11, [a_ptr2], #0x4\n" "ldr s15, [a_ptr3], #0x4\n" "ldr s19, [a_ptr4], #0x4\n" "ldr s23, [a_ptr5], #0x4\n" - "subs %[odds], %[odds], #0x1\n" "b.ne 4f\n" "ld1 {v3.b}[4], [%[a_ptr0]]\n" "ld1 {v7.b}[4], [a_ptr1]\n" @@ -2283,33 +2731,167 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "ld1 {v19.b}[6], [a_ptr4]\n" "ld1 {v23.b}[6], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q24, [%[b_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" + ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" + ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" + ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" + ".inst 0x6f90e31e // udot v30.4s, v24.16b, 
v16.4b[0]\n" + ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" + ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n" + ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n" + ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n" + ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n" + ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n" + ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n" + ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n" + ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n" + ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n" + ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n" + ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n" + ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n" + ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n" + ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n" + ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n" + ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n" + ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n" + ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n" + ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n" + ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n" + ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n" + ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n" + ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n" + ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n" + ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n" + ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n" + ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n" + ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n" + ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n" + ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n" + ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n" + ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n" + ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n" + ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n" + ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n" + ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n" + ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n" + ".inst 
0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n" + ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n" + ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n" + ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n" + ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n" + ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n" + ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n" + ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n" + ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n" + ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n" + ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n" + ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n" + ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n" + ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n" + ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n" + ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n" + ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n" + ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n" + ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n" + ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n" + "b.eq 7f\n" + "8:\n" + "str q26, [%[c_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "movi v26.4s, #0\n" + "ldr q24, [%[b_ptr0]]\n" "ldr q25, [%[b_ptr0], #0x10]\n" - "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" - "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" - "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" - "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "add %[c_ptr0], %[c_ptr0], #0x10\n" + "str q27, [c_ptr1]\n" + "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" + "str q28, [c_ptr2]\n" + "movi v28.4s, #0\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" + "str q29, [c_ptr3]\n" + "movi v29.4s, #0\n" + "add c_ptr3, c_ptr3, #0x10\n" ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" + "str q30, [c_ptr4]\n" + "movi v30.4s, #0\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" + "str q31, [c_ptr5]\n" + "movi v31.4s, #0\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" "ldr q24, [%[b_ptr0]]\n" ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n" "ldr q25, [%[b_ptr0], #0x10]\n" ".inst 
0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n" @@ -2400,50 +2982,41 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n" ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n" ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n" - "cbz %[loops], 6f\n" - "ldr q24, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "b.eq 7f\n" - "8:\n" + "b.ne 8b\n" + "7:\n" "str q26, [%[c_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" + "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v26.4s, #0\n" + "ldr q24, [%[b_ptr0]]\n" "ldr q25, [%[b_ptr0], #0x10]\n" "add %[b_ptr0], %[b_ptr0], #0x20\n" "str q27, [c_ptr1]\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v27.4s, #0\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "add c_ptr3, c_ptr3, #0x10\n" ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" + ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" "ldr q24, [%[b_ptr0]]\n" - ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n" "ldr q25, [%[b_ptr0], #0x10]\n" ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n" @@ -2528,43 +3101,28 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n" ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n" ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n" - "ldr q24, [%[b_ptr0]]\n" ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n" ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n" ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n" ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n" ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n" ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n" - "b.ne 8b\n" - "7:\n" - "str q26, [%[c_ptr0]]\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" + "b 9f\n" + "6:\n" "movi v26.4s, #0\n" - "ldr q25, [%[b_ptr0], #0x10]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" - ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" - "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" - "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" - 
"str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" - "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" + ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" + ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" + ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" - ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n" ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n" ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n" @@ -2659,19 +3217,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n" ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n" ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n" - "6:\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -2790,33 +3343,175 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "ld1 {v19.b}[10], [a_ptr4]\n" "ld1 {v23.b}[10], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q24, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q25, [%[b_ptr0], #0x10]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" + ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" + ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" + ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" + ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n" + ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n" + ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n" + ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n" + ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n" "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n" + ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n" + ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n" 
+ ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n" + ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n" + ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n" + ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n" + ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n" + ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n" + ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n" + ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n" + ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n" + ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n" + ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n" + ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n" + ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n" + ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n" + ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n" + ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n" + ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n" + ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n" + ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n" + ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n" + ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n" + ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n" + ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n" + ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n" + ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n" + ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n" + ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n" + ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n" + ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n" + ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n" + ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n" + ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n" + ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n" + ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n" + ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n" + ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n" + ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n" + ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n" + ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n" + ".inst 0x6fb6eb3f // 
udot v31.4s, v25.16b, v22.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n" + ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n" + ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n" + ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n" + ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" + ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n" + ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n" + ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n" + ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n" + ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n" + ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n" + ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n" + ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n" + ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n" + ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n" + ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n" + "b.eq 7f\n" + "8:\n" + "str q26, [%[c_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "movi v26.4s, #0\n" + "ldr q24, [%[b_ptr0]]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + "add %[c_ptr0], %[c_ptr0], #0x10\n" + "str q27, [c_ptr1]\n" + "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" + "str q28, [c_ptr2]\n" + "movi v28.4s, #0\n" + "add c_ptr2, c_ptr2, #0x10\n" + ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" + "str q29, [c_ptr3]\n" + "movi v29.4s, #0\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" + "str q30, [c_ptr4]\n" + "movi v30.4s, #0\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" + "str q31, [c_ptr5]\n" + "movi v31.4s, #0\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" "ldr q24, [%[b_ptr0]]\n" ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n" "ldr q25, [%[b_ptr0], #0x10]\n" ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n" @@ -2915,50 +3610,41 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n" ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n" ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n" - "cbz %[loops], 6f\n" + "b.ne 8b\n" + "7:\n" + "str q26, [%[c_ptr0]]\n" + "add %[c_ptr0], %[c_ptr0], #0x10\n" + "movi v26.4s, #0\n" "ldr q24, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" "ldr q25, [%[b_ptr0], #0x10]\n" - "b.eq 7f\n" - "8:\n" - "str q26, [%[c_ptr0]]\n" "add %[b_ptr0], %[b_ptr0], #0x20\n" - "movi v26.4s, #0\n" - "subs %[loops], %[loops], #0x1\n" "str q27, [c_ptr1]\n" - "add %[c_ptr0], %[c_ptr0], 
#0x10\n" - "movi v27.4s, #0\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "add c_ptr3, c_ptr3, #0x10\n" ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" + ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" "ldr q24, [%[b_ptr0]]\n" - ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n" "ldr q25, [%[b_ptr0], #0x10]\n" ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n" @@ -3051,43 +3737,28 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n" ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n" ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n" - "ldr q25, [%[b_ptr0], #0x10]\n" ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n" ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n" ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n" ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n" ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n" ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n" - "ldr q24, [%[b_ptr0]]\n" - "b.ne 8b\n" - "7:\n" - "str q26, [%[c_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" + "b 9f\n" + "6:\n" "movi v26.4s, #0\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" - "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "movi v27.4s, #0\n" - ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" - "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" - "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" - "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" - "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" + ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" + ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" + ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" - ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" ".inst 0x6fa4e33b // udot 
v27.4s, v25.16b, v4.4b[1]\n" ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n" ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n" @@ -3190,19 +3861,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n" ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n" ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n" - "6:\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -3262,6 +3928,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q4, [a_ptr1], #0x10\n" "ldr q8, [a_ptr2], #0x10\n" @@ -3280,7 +3947,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "ldr q14, [a_ptr3], #0x10\n" "ldr q18, [a_ptr4], #0x10\n" "ldr q22, [a_ptr5], #0x10\n" - "cbnz %[odds], 2f\n" "ldr q3, [%[a_ptr0]]\n" "ldr q7, [a_ptr1]\n" "ldr q11, [a_ptr2]\n" @@ -3289,8 +3955,27 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "ldr q23, [a_ptr5]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" + "subs %[odds], %[odds], #0x1\n" + "ldr q4, [a_ptr1], #0x10\n" + "ldr q8, [a_ptr2], #0x10\n" + "ldr q12, [a_ptr3], #0x10\n" + "ldr q16, [a_ptr4], #0x10\n" + "ldr q20, [a_ptr5], #0x10\n" + "ldr q1, [%[a_ptr0]], #0x10\n" + "ldr q5, [a_ptr1], #0x10\n" + "ldr q9, [a_ptr2], #0x10\n" + "ldr q13, [a_ptr3], #0x10\n" + "ldr q17, [a_ptr4], #0x10\n" + "ldr q21, [a_ptr5], #0x10\n" + "ldr q2, [%[a_ptr0]], #0x10\n" + "ldr q6, [a_ptr1], #0x10\n" + "ldr q10, [a_ptr2], #0x10\n" + "ldr q14, [a_ptr3], #0x10\n" "ldr d3, [%[a_ptr0]], #0x8\n" + "ldr q18, [a_ptr4], #0x10\n" "ldr d7, [a_ptr1], #0x8\n" + "ldr q22, [a_ptr5], #0x10\n" "ldr d11, [a_ptr2], #0x8\n" "ldr d15, [a_ptr3], #0x8\n" "ldr d19, [a_ptr4], #0x8\n" @@ -3301,7 +3986,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "ld1 {v15.s}[2], [a_ptr3], #4\n" "ld1 {v19.s}[2], [a_ptr4], #4\n" "ld1 {v23.s}[2], [a_ptr5], #4\n" - "subs %[odds], %[odds], #0x1\n" "b.ne 4f\n" "ld1 {v3.b}[12], [%[a_ptr0]]\n" "ld1 {v7.b}[12], [a_ptr1]\n" @@ -3328,24 +4012,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "ld1 {v19.b}[14], [a_ptr4]\n" "ld1 {v23.b}[14], [a_ptr5]\n" "3:\n" - "movi v26.4s, #0\n" "ldr q24, [%[b_ptr0]]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "ldr q25, [%[b_ptr0], #0x10]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" + "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + "cbz %[loops], 6f\n" + "movi v26.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v27.4s, #0\n" "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x40]\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x80]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr5, #0x100]\n" ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x140]\n" ".inst 0x6f84e31b 
// udot v27.4s, v24.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr5, #0x180]\n" ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" @@ -3460,38 +4146,36 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n" ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n" ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n" - "cbz %[loops], 6f\n" - "ldr q24, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" "b.eq 7f\n" "8:\n" "str q26, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v26.4s, #0\n" + "ldr q24, [%[b_ptr0]]\n" "ldr q25, [%[b_ptr0], #0x10]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - "str q27, [c_ptr1]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v27.4s, #0\n" + "str q27, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v27.4s, #0\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" "str q28, [c_ptr2]\n" "movi v28.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" "str q29, [c_ptr3]\n" "movi v29.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "add c_ptr3, c_ptr3, #0x10\n" ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" "str q30, [c_ptr4]\n" "movi v30.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" "str q31, [c_ptr5]\n" "movi v31.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" "add c_ptr5, c_ptr5, #0x10\n" + ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n" "ldr q24, [%[b_ptr0]]\n" ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" @@ -3603,7 +4287,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n" ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n" ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n" - "ldr q24, [%[b_ptr0]]\n" ".inst 0x6fa3eb3a // udot v26.4s, v25.16b, v3.4b[3]\n" ".inst 0x6fa7eb3b // udot v27.4s, v25.16b, v7.4b[3]\n" ".inst 0x6fabeb3c // udot v28.4s, v25.16b, v11.4b[3]\n" @@ -3615,6 +4298,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v26.4s, #0\n" + "ldr q24, [%[b_ptr0]]\n" "ldr q25, [%[b_ptr0], #0x10]\n" "add %[b_ptr0], %[b_ptr0], #0x20\n" "str q27, [c_ptr1]\n" @@ -3749,19 +4433,139 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n" ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n" ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n" + "b 9f\n" "6:\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n" + ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n" + ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n" + ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n" + ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n" + ".inst 0x6f94e31f // udot v31.4s, v24.16b, 
v20.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n" + ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n" + ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n" + ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n" + ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n" + ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n" + ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n" + ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n" + ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n" + ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n" + ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n" + ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n" + ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n" + ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n" + ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n" + ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n" + ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n" + ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n" + ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n" + ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n" + ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n" + ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n" + ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n" + ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n" + ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n" + ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n" + ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n" + ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n" + ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n" + ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n" + ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n" + ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n" + ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n" + ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n" + ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n" + ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n" + ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n" + ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n" + ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n" + ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n" + ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n" + ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n" + "ldr q25, 
[%[b_ptr0], #0x10]\n" + ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n" + ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n" + ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n" + ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n" + ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n" + ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n" + ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n" + ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n" + ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n" + ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n" + ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n" + ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n" + ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n" + ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n" + "ldr q24, [%[b_ptr0]]\n" + ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n" + ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n" + ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n" + ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n" + ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n" + ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n" + "ldr q25, [%[b_ptr0], #0x10]\n" + ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n" + ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n" + ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n" + ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n" + ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n" + ".inst 0x6fa3eb3a // udot v26.4s, v25.16b, v3.4b[3]\n" + ".inst 0x6fa7eb3b // udot v27.4s, v25.16b, v7.4b[3]\n" + ".inst 0x6fabeb3c // udot v28.4s, v25.16b, v11.4b[3]\n" + ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n" + ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n" + ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n" + "9:\n" "str q26, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q27, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q28, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q29, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q30, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q31, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp similarity index 87% rename from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp rename to src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp index d91416c3be..942f94b0bf 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp @@ -31,10 +31,10 @@ namespace arm_gemm { // Actual kernel implementations -void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); -void a64_smallK_hybrid_u8u32_dot_4x8_a55(const 
uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); +void a64_smallK_hybrid_u8u32_dot_8x4(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); +void a64_smallK_hybrid_u8u32_dot_8x4_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); -class smallK_hybrid_u8u32_dot_4x8 +class cls_a64_smallK_hybrid_u8u32_dot_8x4 { public: typedef uint8_t operand_type; @@ -76,12 +76,12 @@ class smallK_hybrid_u8u32_dot_4x8 StdTransformsFixed transforms = {}; // Default to the generic kernel - kern_type kernel=a64_smallK_hybrid_u8u32_dot_4x8; + kern_type kernel=a64_smallK_hybrid_u8u32_dot_8x4; - smallK_hybrid_u8u32_dot_4x8(const CPUInfo *ci) + cls_a64_smallK_hybrid_u8u32_dot_8x4(const CPUInfo *ci) { if (ci->get_cpu_model() == CPUModel::A55r1) { - kernel = a64_smallK_hybrid_u8u32_dot_4x8_a55; + kernel = a64_smallK_hybrid_u8u32_dot_8x4_a55; } } }; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp similarity index 85% rename from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp index e70fb6955e..fcb546f51e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -33,7 +33,7 @@ namespace arm_gemm { -void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) { +void a64_smallK_hybrid_u8u32_dot_8x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) { const long loops_count = iceildiv(N, (int)4) - 1; const long ldab = lda * sizeof(uint8_t); const long ldcb = ldc * sizeof(uint32_t); @@ -157,22 +157,24 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ "ld1 {v6.b}[2], [a_ptr6]\n" "ld1 {v7.b}[2], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" - "movi v26.4s, #0\n" "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" - "movi v27.4s, #0\n" "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" - "movi v28.4s, #0\n" "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" - "movi v29.4s, #0\n" "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" - "movi v30.4s, #0\n" "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" - "movi v31.4s, #0\n" "add %[b_ptr0], %[b_ptr0], #0x10\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" @@ -181,55 +183,49 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n" ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" - "cbz %[loops], 6f\n" - "ldr d16, 
[%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" - "ins v16.d[1], temploadreg0\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" + "ldr d16, [%[b_ptr0]]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" - ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" + "ins v16.d[1], temploadreg0\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" "str q26, [c_ptr2]\n" - "movi v26.4s, #0\n" "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" + "movi v26.4s, #0\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" + ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n" + ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" "add c_ptr5, c_ptr5, #0x10\n" - ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n" + ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n" + ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" "add c_ptr7, c_ptr7, #0x10\n" - ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" + ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n" "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" - ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" - "ldr d16, [%[b_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" + ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" - "ins v16.d[1], temploadreg0\n" "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" @@ -239,6 +235,8 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" + "ldr q16, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" @@ -268,23 +266,34 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" + ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" + ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n" + ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n" + ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n" + ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" + ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" 
"str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -423,24 +432,26 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ "ld1 {v6.b}[6], [a_ptr6]\n" "ld1 {v7.b}[6], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" "movi v26.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "movi v27.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n" @@ -456,78 +467,72 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n" ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n" ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n" - "cbz %[loops], 6f\n" - "ldr d16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - "ins v16.d[1], temploadreg0\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" - "ins v17.d[1], temploadreg1\n" + "ldr d16, [%[b_ptr0]]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr d17, [%[b_ptr0], #0x10]\n" "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" "str q26, [c_ptr2]\n" + "add c_ptr2, c_ptr2, #0x10\n" "movi v26.4s, #0\n" + "ins v16.d[1], temploadreg0\n" + "ins v17.d[1], temploadreg1\n" "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" - ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" "str q27, [c_ptr3]\n" + "add c_ptr3, c_ptr3, #0x10\n" "movi v27.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n" + "add c_ptr4, c_ptr4, #0x10\n" + ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n" + "add 
c_ptr5, c_ptr5, #0x10\n" + ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" - ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n" + "add c_ptr6, c_ptr6, #0x10\n" + ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" "add c_ptr7, c_ptr7, #0x10\n" + ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" - "ldr d16, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n" - "ins v16.d[1], temploadreg0\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" "b.ne 8b\n" "7:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" - "ins v17.d[1], temploadreg1\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" @@ -565,23 +570,42 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n" ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n" ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" + ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" + ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n" + ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n" + ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n" + ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" + ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" + ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n" + ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n" + ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n" + ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n" + ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n" + ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n" + ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, 
#0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -720,26 +744,28 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ "ld1 {v6.b}[10], [a_ptr6]\n" "ld1 {v7.b}[10], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ldr q18, [%[b_ptr0], #0x20]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x30\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" "movi v27.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x30\n" ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n" ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n" @@ -762,95 +788,86 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n" ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n" ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n" - "cbz %[loops], 6f\n" - "ldr d16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "ldr d18, [%[b_ptr0], #0x20]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "add %[b_ptr0], %[b_ptr0], #0x30\n" - "ins v16.d[1], temploadreg0\n" - "ins v17.d[1], temploadreg1\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" - "ins v18.d[1], temploadreg2\n" + "ldr d16, [%[b_ptr0]]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr d17, [%[b_ptr0], #0x10]\n" "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "str q26, [c_ptr2]\n" + "add c_ptr2, c_ptr2, #0x10\n" "movi v26.4s, #0\n" + "ldr d18, [%[b_ptr0], #0x20]\n" "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x30\n" "str q27, [c_ptr3]\n" - "movi v27.4s, #0\n" "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" + "movi v27.4s, #0\n" + "ins v16.d[1], temploadreg0\n" + "ins v17.d[1], temploadreg1\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "str q28, [c_ptr4]\n" - "movi v28.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n" + "movi 
v28.4s, #0\n" + "ins v18.d[1], temploadreg2\n" + ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" "add c_ptr5, c_ptr5, #0x10\n" - ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n" + ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n" + ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" "add c_ptr7, c_ptr7, #0x10\n" + ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" - "ldr d16, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n" - "ins v16.d[1], temploadreg0\n" ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n" - "ins v17.d[1], temploadreg1\n" ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n" ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n" ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n" ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n" ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n" ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n" - "ldr d18, [%[b_ptr0], #0x20]\n" - "add %[b_ptr0], %[b_ptr0], #0x30\n" "b.ne 8b\n" "7:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" - "ins v18.d[1], temploadreg2\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" @@ -876,8 +893,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ "movi v31.4s, #0\n" "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" - ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" + "add %[b_ptr0], %[b_ptr0], #0x30\n" ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n" ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n" ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n" @@ -893,23 +911,50 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6f85ea5d // udot v29.4s, 
v18.16b, v5.4b[2]\n" ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n" ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" + ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" + ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n" + ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n" + ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n" + ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" + ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" + ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n" + ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n" + ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n" + ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n" + ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n" + ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n" + ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n" + ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" + ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n" + ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n" + ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n" + ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n" + ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n" + ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n" + ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -1056,28 +1101,30 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ "ld1 {v6.b}[14], [a_ptr6]\n" "ld1 {v7.b}[14], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ldr q18, [%[b_ptr0], #0x20]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ldr q19, [%[b_ptr0], #0x30]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" ".inst 0x6f83e21b // udot v27.4s, v16.16b, 
v3.4b[0]\n" ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n" ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n" @@ -1107,112 +1154,101 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n" ".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n" ".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n" - "cbz %[loops], 6f\n" - "ldr d16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "ldr d18, [%[b_ptr0], #0x20]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr d19, [%[b_ptr0], #0x30]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "ins v16.d[1], temploadreg0\n" - "ins v17.d[1], temploadreg1\n" - "ins v18.d[1], temploadreg2\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" - "ins v19.d[1], temploadreg3\n" + "ldr d16, [%[b_ptr0]]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr d17, [%[b_ptr0], #0x10]\n" "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "str q26, [c_ptr2]\n" + "add c_ptr2, c_ptr2, #0x10\n" "movi v26.4s, #0\n" + "ldr d18, [%[b_ptr0], #0x20]\n" "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "str q27, [c_ptr3]\n" - "movi v27.4s, #0\n" "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" + "movi v27.4s, #0\n" + "ldr d19, [%[b_ptr0], #0x30]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" "str q28, [c_ptr4]\n" - "movi v28.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n" + "movi v28.4s, #0\n" + "ins v16.d[1], temploadreg0\n" + "ins v17.d[1], temploadreg1\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "str q29, [c_ptr5]\n" - "movi v29.4s, #0\n" "add c_ptr5, c_ptr5, #0x10\n" - ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n" + "movi v29.4s, #0\n" + "ins v18.d[1], temploadreg2\n" + ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" + "ins v19.d[1], temploadreg3\n" + ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n" + ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" "add c_ptr7, c_ptr7, #0x10\n" + ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" - "ldr d16, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n" - "ins v16.d[1], temploadreg0\n" ".inst 0x6fa2e23a // udot 
v26.4s, v17.16b, v2.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n" - "ins v17.d[1], temploadreg1\n" ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n" ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n" ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n" ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n" ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n" ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n" - "ldr d18, [%[b_ptr0], #0x20]\n" ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n" ".inst 0x6fa1ea79 // udot v25.4s, v19.16b, v1.4b[3]\n" ".inst 0x6fa2ea7a // udot v26.4s, v19.16b, v2.4b[3]\n" - "ins v18.d[1], temploadreg2\n" ".inst 0x6fa3ea7b // udot v27.4s, v19.16b, v3.4b[3]\n" ".inst 0x6fa4ea7c // udot v28.4s, v19.16b, v4.4b[3]\n" ".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n" ".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n" ".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n" - "ldr d19, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" "b.ne 8b\n" "7:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" - "ins v19.d[1], temploadreg3\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" + "ldr q19, [%[b_ptr0], #0x30]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" "str q27, [c_ptr3]\n" @@ -1235,8 +1271,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ "movi v31.4s, #0\n" "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" - ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n" ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n" ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n" @@ -1260,23 +1297,58 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n" ".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n" ".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" + ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" + ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n" + ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n" + ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n" + ".inst 
0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" + ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" + ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n" + ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n" + ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n" + ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n" + ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n" + ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n" + ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n" + ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" + ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n" + ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n" + ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n" + ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n" + ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n" + ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n" + ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n" + ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n" + ".inst 0x6fa1ea79 // udot v25.4s, v19.16b, v1.4b[3]\n" + ".inst 0x6fa2ea7a // udot v26.4s, v19.16b, v2.4b[3]\n" + ".inst 0x6fa3ea7b // udot v27.4s, v19.16b, v3.4b[3]\n" + ".inst 0x6fa4ea7c // udot v28.4s, v19.16b, v4.4b[3]\n" + ".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n" + ".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n" + ".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -1363,26 +1435,34 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q2, [a_ptr1], #0x10\n" "ldr q4, [a_ptr2], #0x10\n" "ldr q6, [a_ptr3], #0x10\n" - "ldr q8, [a_ptr4], #0x10\n" - "ldr q10, [a_ptr5], #0x10\n" - "ldr q12, [a_ptr6], #0x10\n" - "ldr q14, [a_ptr7], #0x10\n" - "cbnz %[odds], 2f\n" "ldr s1, [%[a_ptr0]]\n" + "ldr q8, [a_ptr4], #0x10\n" "ldr s3, [a_ptr1]\n" + "ldr q10, [a_ptr5], #0x10\n" "ldr s5, [a_ptr2]\n" + "ldr q12, [a_ptr6], #0x10\n" "ldr s7, [a_ptr3]\n" + "ldr q14, [a_ptr7], #0x10\n" "ldr s9, [a_ptr4]\n" "ldr s11, [a_ptr5]\n" "ldr s13, [a_ptr6]\n" "ldr s15, [a_ptr7]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" "subs %[odds], %[odds], #0x1\n" + "ldr q2, [a_ptr1], #0x10\n" + "ldr q4, [a_ptr2], #0x10\n" + "ldr q6, [a_ptr3], #0x10\n" + "ldr q8, [a_ptr4], #0x10\n" + "ldr q10, [a_ptr5], #0x10\n" + "ldr q12, [a_ptr6], #0x10\n" + "ldr q14, [a_ptr7], #0x10\n" "b.ne 4f\n" "ldr b1, [%[a_ptr0]]\n" "ldr b3, [a_ptr1]\n" @@ -1415,30 +1495,32 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ "ld1 {v13.b}[2], [a_ptr6]\n" "ld1 {v15.b}[2], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ldr q18, [%[b_ptr0], #0x20]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ldr q19, 
[%[b_ptr0], #0x30]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ldr q20, [%[b_ptr0], #0x40]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x50\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x50\n" ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" @@ -1475,126 +1557,113 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n" ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n" ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n" - "cbz %[loops], 6f\n" - "ldr d16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "ldr d18, [%[b_ptr0], #0x20]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr d19, [%[b_ptr0], #0x30]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "ldr d20, [%[b_ptr0], #0x40]\n" - "ins v16.d[1], temploadreg0\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "add %[b_ptr0], %[b_ptr0], #0x50\n" - "ins v17.d[1], temploadreg1\n" - "ins v18.d[1], temploadreg2\n" - "ins v19.d[1], temploadreg3\n" - "b.eq 7f\n" - "8:\n" - "str q24, [%[c_ptr0]]\n" + "b.eq 7f\n" + "8:\n" + "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" - "ins v20.d[1], temploadreg0\n" + "ldr d16, [%[b_ptr0]]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr d17, [%[b_ptr0], #0x10]\n" "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "str q26, [c_ptr2]\n" + "add c_ptr2, c_ptr2, #0x10\n" "movi v26.4s, #0\n" + "ldr d18, [%[b_ptr0], #0x20]\n" "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "str q27, [c_ptr3]\n" - "movi v27.4s, #0\n" "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" + "movi v27.4s, #0\n" + "ldr d19, [%[b_ptr0], #0x30]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "str q28, [c_ptr4]\n" - "movi v28.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" + "movi v28.4s, #0\n" + "ldr d20, [%[b_ptr0], #0x40]\n" + "ins v16.d[1], temploadreg0\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "str q29, [c_ptr5]\n" - "movi v29.4s, #0\n" "add c_ptr5, c_ptr5, #0x10\n" - ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" + "movi v29.4s, #0\n" + "ldr temploadreg0, 
[%[b_ptr0], #0x48]\n" + ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" + "ins v17.d[1], temploadreg1\n" + ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" + "ins v18.d[1], temploadreg2\n" + ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" + "ins v19.d[1], temploadreg3\n" + ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" + "ins v20.d[1], temploadreg0\n" + ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" + "add c_ptr6, c_ptr6, #0x10\n" + ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "add %[b_ptr0], %[b_ptr0], #0x50\n" ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" - "ldr d16, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" - "ins v16.d[1], temploadreg0\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n" - "ins v17.d[1], temploadreg1\n" ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n" ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n" ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n" ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n" ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n" - "ldr d18, [%[b_ptr0], #0x20]\n" ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n" ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n" ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n" - "ins v18.d[1], temploadreg2\n" ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n" ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n" ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n" ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n" ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n" - "ldr d19, [%[b_ptr0], #0x30]\n" ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n" ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n" ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n" - "ins v19.d[1], temploadreg3\n" ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n" ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n" ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n" ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n" ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n" - "ldr d20, [%[b_ptr0], #0x40]\n" - "add %[b_ptr0], %[b_ptr0], #0x50\n" "b.ne 8b\n" "7:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], 
%[c_ptr0], #0x10\n" "movi v24.4s, #0\n" - "ins v20.d[1], temploadreg0\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "ldr q20, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" "str q27, [c_ptr3]\n" @@ -1617,8 +1686,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ "movi v31.4s, #0\n" "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" - ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" + "add %[b_ptr0], %[b_ptr0], #0x50\n" ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n" ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n" @@ -1650,23 +1720,66 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n" ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n" ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" + ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" + ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" + ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" + ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" + ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" + ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" + ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" + ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n" + ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n" + ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n" + ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n" + ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n" + ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n" + ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" + ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n" + ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n" + ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n" + ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n" + ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n" + ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n" + ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n" + ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n" + ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n" + ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n" + ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n" + ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n" + ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n" + ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n" + ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n" + ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n" + ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n" + ".inst 0x6f85e29a // udot v26.4s, 
v20.16b, v5.4b[0]\n" + ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n" + ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n" + ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n" + ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n" + ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -1753,34 +1866,42 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q2, [a_ptr1], #0x10\n" "ldr q4, [a_ptr2], #0x10\n" "ldr q6, [a_ptr3], #0x10\n" - "ldr q8, [a_ptr4], #0x10\n" - "ldr q10, [a_ptr5], #0x10\n" - "ldr q12, [a_ptr6], #0x10\n" - "ldr q14, [a_ptr7], #0x10\n" - "cbnz %[odds], 2f\n" "ldr d1, [%[a_ptr0]]\n" + "ldr q8, [a_ptr4], #0x10\n" "ldr d3, [a_ptr1]\n" + "ldr q10, [a_ptr5], #0x10\n" "ldr d5, [a_ptr2]\n" + "ldr q12, [a_ptr6], #0x10\n" "ldr d7, [a_ptr3]\n" + "ldr q14, [a_ptr7], #0x10\n" "ldr d9, [a_ptr4]\n" "ldr d11, [a_ptr5]\n" "ldr d13, [a_ptr6]\n" "ldr d15, [a_ptr7]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" + "subs %[odds], %[odds], #0x1\n" + "ldr q2, [a_ptr1], #0x10\n" + "ldr q4, [a_ptr2], #0x10\n" "ldr s1, [%[a_ptr0]], #0x4\n" + "ldr q6, [a_ptr3], #0x10\n" "ldr s3, [a_ptr1], #0x4\n" + "ldr q8, [a_ptr4], #0x10\n" "ldr s5, [a_ptr2], #0x4\n" + "ldr q10, [a_ptr5], #0x10\n" "ldr s7, [a_ptr3], #0x4\n" + "ldr q12, [a_ptr6], #0x10\n" "ldr s9, [a_ptr4], #0x4\n" + "ldr q14, [a_ptr7], #0x10\n" "ldr s11, [a_ptr5], #0x4\n" "ldr s13, [a_ptr6], #0x4\n" "ldr s15, [a_ptr7], #0x4\n" - "subs %[odds], %[odds], #0x1\n" "b.ne 4f\n" "ld1 {v1.b}[4], [%[a_ptr0]]\n" "ld1 {v3.b}[4], [a_ptr1]\n" @@ -1813,32 +1934,34 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ "ld1 {v13.b}[6], [a_ptr6]\n" "ld1 {v15.b}[6], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ldr q18, [%[b_ptr0], #0x20]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ldr q19, [%[b_ptr0], #0x30]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ldr q20, [%[b_ptr0], #0x40]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ldr q21, [%[b_ptr0], #0x50]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x60\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" ".inst 0x6f86e21b 
// udot v27.4s, v16.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x60\n" ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" @@ -1882,49 +2005,132 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n" ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n" ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n" - "cbz %[loops], 6f\n" - "ldr d16, [%[b_ptr0]]\n" + "b.eq 7f\n" + "8:\n" + "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" + "movi v24.4s, #0\n" + "ldr d16, [%[b_ptr0]]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + "add %[c_ptr0], %[c_ptr0], #0x10\n" + "str q25, [c_ptr1]\n" + "add c_ptr1, c_ptr1, #0x10\n" + "movi v25.4s, #0\n" "ldr d17, [%[b_ptr0], #0x10]\n" "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "str q26, [c_ptr2]\n" + "add c_ptr2, c_ptr2, #0x10\n" + "movi v26.4s, #0\n" "ldr d18, [%[b_ptr0], #0x20]\n" "ldr temploadreg2, [%[b_ptr0], #0x28]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "str q27, [c_ptr3]\n" + "add c_ptr3, c_ptr3, #0x10\n" + "movi v27.4s, #0\n" "ldr d19, [%[b_ptr0], #0x30]\n" "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "str q28, [c_ptr4]\n" + "add c_ptr4, c_ptr4, #0x10\n" + "movi v28.4s, #0\n" "ldr d20, [%[b_ptr0], #0x40]\n" "ins v16.d[1], temploadreg0\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "str q29, [c_ptr5]\n" + "add c_ptr5, c_ptr5, #0x10\n" + "movi v29.4s, #0\n" "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" + "str q30, [c_ptr6]\n" + "movi v30.4s, #0\n" "ldr d21, [%[b_ptr0], #0x50]\n" + ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" "ins v17.d[1], temploadreg1\n" + ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" + "str q31, [c_ptr7]\n" + "movi v31.4s, #0\n" "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "add %[b_ptr0], %[b_ptr0], #0x60\n" + ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" "ins v18.d[1], temploadreg2\n" + ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" "ins v19.d[1], temploadreg3\n" + ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" "ins v20.d[1], temploadreg0\n" - "b.eq 7f\n" - "8:\n" - "str q24, [%[c_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "movi v24.4s, #0\n" + ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" "ins v21.d[1], temploadreg1\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" + ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" + "add c_ptr6, c_ptr6, #0x10\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" + "add c_ptr7, c_ptr7, #0x10\n" + ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" + "add %[b_ptr0], %[b_ptr0], #0x60\n" + ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" + ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" + ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n" + ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n" + ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" + ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n" + ".inst 0x6f84ea5a 
// udot v26.4s, v18.16b, v4.4b[2]\n" + ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n" + ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n" + ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n" + ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n" + ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n" + ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n" + ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n" + ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n" + ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n" + ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n" + ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n" + ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n" + ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n" + ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n" + ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n" + ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n" + ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n" + ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n" + ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n" + ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n" + ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n" + ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n" + ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n" + ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n" + ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n" + ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n" + ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n" + ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n" + ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n" + "b.ne 8b\n" + "7:\n" + "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" + "movi v24.4s, #0\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "ldr q20, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q21, [%[b_ptr0], #0x50]\n" ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" @@ -1941,105 +2147,66 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ "movi v31.4s, #0\n" "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "add %[b_ptr0], %[b_ptr0], #0x60\n" ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" - "ldr d16, [%[b_ptr0]]\n" - ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" - "ins v16.d[1], temploadreg0\n" ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x6faae23d // udot v29.4s, 
v17.16b, v10.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n" - "ins v17.d[1], temploadreg1\n" ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n" ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n" ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n" ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n" - "ldr d18, [%[b_ptr0], #0x20]\n" ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n" ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n" ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n" - "ins v18.d[1], temploadreg2\n" ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n" ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n" ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n" ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n" ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n" - "ldr d19, [%[b_ptr0], #0x30]\n" ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n" ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n" ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n" - "ins v19.d[1], temploadreg3\n" ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n" ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n" ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n" ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n" ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n" - "ldr d20, [%[b_ptr0], #0x40]\n" ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n" ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n" ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n" - "ins v20.d[1], temploadreg0\n" ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n" ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n" ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n" ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n" ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n" - "ldr d21, [%[b_ptr0], #0x50]\n" - "add %[b_ptr0], %[b_ptr0], #0x60\n" - "b.ne 8b\n" - "7:\n" - "str q24, [%[c_ptr0]]\n" - "add %[c_ptr0], %[c_ptr0], #0x10\n" + "b 9f\n" + "6:\n" "movi v24.4s, #0\n" - "ins v21.d[1], temploadreg1\n" - "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" - ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" - "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" - "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" - "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" - "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" - ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" - "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" - "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" - "add c_ptr7, c_ptr7, #0x10\n" + ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x6f82e219 
// udot v25.4s, v16.16b, v2.4b[0]\n" + ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" + ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" + ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" + ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" - ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n" ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n" @@ -2079,23 +2246,16 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n" ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n" ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n" - "6:\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -2242,34 +2402,36 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ "ld1 {v13.b}[10], [a_ptr6]\n" "ld1 {v15.b}[10], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ldr q18, [%[b_ptr0], #0x20]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ldr q19, [%[b_ptr0], #0x30]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ldr q20, [%[b_ptr0], #0x40]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ldr q21, [%[b_ptr0], #0x50]\n" - "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ldr q22, [%[b_ptr0], #0x60]\n" + "add %[b_ptr0], %[b_ptr0], #0x70\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x70\n" ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" @@ -2320,178 +2482,162 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n" ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n" ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n" - "cbz %[loops], 6f\n" - 
"ldr d16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "ldr d18, [%[b_ptr0], #0x20]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr d19, [%[b_ptr0], #0x30]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "ldr d20, [%[b_ptr0], #0x40]\n" - "ins v16.d[1], temploadreg0\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "ldr d21, [%[b_ptr0], #0x50]\n" - "ins v17.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "ldr d22, [%[b_ptr0], #0x60]\n" - "ins v18.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "add %[b_ptr0], %[b_ptr0], #0x70\n" - "ins v19.d[1], temploadreg3\n" - "ins v20.d[1], temploadreg0\n" - "ins v21.d[1], temploadreg1\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" - "ins v22.d[1], temploadreg2\n" + "ldr d16, [%[b_ptr0]]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr d17, [%[b_ptr0], #0x10]\n" "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "str q26, [c_ptr2]\n" + "add c_ptr2, c_ptr2, #0x10\n" "movi v26.4s, #0\n" + "ldr d18, [%[b_ptr0], #0x20]\n" "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "str q27, [c_ptr3]\n" - "movi v27.4s, #0\n" "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" + "movi v27.4s, #0\n" + "ldr d19, [%[b_ptr0], #0x30]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "str q28, [c_ptr4]\n" - "movi v28.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" + "movi v28.4s, #0\n" + "ldr d20, [%[b_ptr0], #0x40]\n" + "ins v16.d[1], temploadreg0\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "str q29, [c_ptr5]\n" - "movi v29.4s, #0\n" "add c_ptr5, c_ptr5, #0x10\n" - ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" + "movi v29.4s, #0\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" + "ldr d21, [%[b_ptr0], #0x50]\n" + ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" + "ins v17.d[1], temploadreg1\n" + ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" - "add c_ptr7, c_ptr7, #0x10\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" + "ldr d22, [%[b_ptr0], #0x60]\n" + ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" + "ins v18.d[1], temploadreg2\n" + ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ins v19.d[1], temploadreg3\n" ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" - "ldr d16, [%[b_ptr0]]\n" + "ins v20.d[1], temploadreg0\n" ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "ins v21.d[1], temploadreg1\n" ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" - "ins v16.d[1], temploadreg0\n" + 
"ins v22.d[1], temploadreg2\n" ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + "add c_ptr6, c_ptr6, #0x10\n" ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "add %[b_ptr0], %[b_ptr0], #0x70\n" ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n" "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n" "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" - ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" + ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n" - "ins v17.d[1], temploadreg1\n" ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n" ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n" ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n" ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n" - "ldr d18, [%[b_ptr0], #0x20]\n" ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n" ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n" ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n" - "ins v18.d[1], temploadreg2\n" ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n" ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n" ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n" ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n" - "ldr d19, [%[b_ptr0], #0x30]\n" ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n" ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n" ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n" - "ins v19.d[1], temploadreg3\n" ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n" ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n" ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n" ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n" ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n" - "ldr d20, [%[b_ptr0], #0x40]\n" ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n" ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n" ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n" - "ins v20.d[1], temploadreg0\n" ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n" ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n" ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n" ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n" ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n" - "ldr d21, [%[b_ptr0], #0x50]\n" ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n" ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n" ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n" - "ins v21.d[1], temploadreg1\n" ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n" ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n" ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n" ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n" ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n" - "ldr d22, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x70\n" "b.ne 8b\n" "7:\n" 
"str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" - "ins v22.d[1], temploadreg2\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "ldr q20, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q21, [%[b_ptr0], #0x50]\n" ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" + "ldr q22, [%[b_ptr0], #0x60]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "add c_ptr3, c_ptr3, #0x10\n" ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" - "add c_ptr7, c_ptr7, #0x10\n" + "add c_ptr6, c_ptr6, #0x10\n" ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" - ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" + "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x70\n" ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n" ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n" @@ -2539,23 +2685,82 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n" ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n" ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" + ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" + ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" + ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" + ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" + ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" + ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" + ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" + ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n" + ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n" + ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n" + ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n" + ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n" + ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n" + ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" + ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n" + ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n" + ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n" + ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n" + ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n" + ".inst 0x6f8cea5e // udot v30.4s, v18.16b, 
v12.4b[2]\n" + ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n" + ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n" + ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n" + ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n" + ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n" + ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n" + ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n" + ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n" + ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n" + ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n" + ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n" + ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n" + ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n" + ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n" + ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n" + ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n" + ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n" + ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n" + ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n" + ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n" + ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n" + ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n" + ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n" + ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n" + ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n" + ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n" + ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n" + ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n" + ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n" + ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n" + ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n" + ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n" + ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -2643,6 +2848,7 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q2, [a_ptr1], #0x10\n" "ldr q4, [a_ptr2], #0x10\n" @@ -2651,7 +2857,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ "ldr q10, [a_ptr5], #0x10\n" "ldr q12, [a_ptr6], #0x10\n" "ldr q14, [a_ptr7], #0x10\n" - "cbnz %[odds], 2f\n" "ldr q1, [%[a_ptr0]]\n" "ldr q3, [a_ptr1]\n" "ldr q5, [a_ptr2]\n" @@ -2662,15 +2867,24 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ "ldr q15, [a_ptr7]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" + "subs %[odds], %[odds], #0x1\n" + "ldr q2, [a_ptr1], #0x10\n" + "ldr q4, [a_ptr2], #0x10\n" "ldr d1, [%[a_ptr0]], #0x8\n" + "ldr q6, [a_ptr3], #0x10\n" "ldr d3, [a_ptr1], #0x8\n" + "ldr q8, [a_ptr4], #0x10\n" "ldr d5, [a_ptr2], #0x8\n" + "ldr q10, [a_ptr5], #0x10\n" "ldr d7, [a_ptr3], #0x8\n" + "ldr q12, [a_ptr6], #0x10\n" "ldr d9, [a_ptr4], #0x8\n" + "ldr q14, [a_ptr7], #0x10\n" "ldr d11, [a_ptr5], 
#0x8\n" "ldr d13, [a_ptr6], #0x8\n" - "ldr d15, [a_ptr7], #0x8\n" "ld1 {v1.s}[2], [%[a_ptr0]], #4\n" + "ldr d15, [a_ptr7], #0x8\n" "ld1 {v3.s}[2], [a_ptr1], #4\n" "ld1 {v5.s}[2], [a_ptr2], #4\n" "ld1 {v7.s}[2], [a_ptr3], #4\n" @@ -2678,7 +2892,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ "ld1 {v11.s}[2], [a_ptr5], #4\n" "ld1 {v13.s}[2], [a_ptr6], #4\n" "ld1 {v15.s}[2], [a_ptr7], #4\n" - "subs %[odds], %[odds], #0x1\n" "b.ne 4f\n" "ld1 {v1.b}[12], [%[a_ptr0]]\n" "ld1 {v3.b}[12], [a_ptr1]\n" @@ -2711,36 +2924,38 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ "ld1 {v13.b}[14], [a_ptr6]\n" "ld1 {v15.b}[14], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ldr q18, [%[b_ptr0], #0x20]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ldr q19, [%[b_ptr0], #0x30]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ldr q20, [%[b_ptr0], #0x40]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ldr q21, [%[b_ptr0], #0x50]\n" - "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ldr q22, [%[b_ptr0], #0x60]\n" - "movi v31.4s, #0\n" "ldr q23, [%[b_ptr0], #0x70]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" @@ -2798,192 +3013,248 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n" ".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n" ".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n" - "cbz %[loops], 6f\n" - "ldr d16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr temploadreg0, [%[b_ptr0], #0x8]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" - "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - "ldr d18, [%[b_ptr0], #0x20]\n" - "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr d19, [%[b_ptr0], #0x30]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "ldr d20, [%[b_ptr0], #0x40]\n" - "ins v16.d[1], temploadreg0\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" - "ldr d21, [%[b_ptr0], #0x50]\n" - "ins v17.d[1], temploadreg1\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" - "ldr d22, [%[b_ptr0], #0x60]\n" - "ins v18.d[1], temploadreg2\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" - "ldr d23, [%[b_ptr0], #0x70]\n" - "ins v19.d[1], temploadreg3\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" - "add %[b_ptr0], 
%[b_ptr0], #0x80\n" - "ins v20.d[1], temploadreg0\n" - "ins v21.d[1], temploadreg1\n" - "ins v22.d[1], temploadreg2\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" - "ins v23.d[1], temploadreg3\n" + "ldr d16, [%[b_ptr0]]\n" "ldr temploadreg0, [%[b_ptr0], #0x8]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr d17, [%[b_ptr0], #0x10]\n" "ldr temploadreg1, [%[b_ptr0], #0x18]\n" - ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "str q26, [c_ptr2]\n" + "add c_ptr2, c_ptr2, #0x10\n" "movi v26.4s, #0\n" + "ldr d18, [%[b_ptr0], #0x20]\n" "ldr temploadreg2, [%[b_ptr0], #0x28]\n" - "ldr temploadreg3, [%[b_ptr0], #0x38]\n" - "add c_ptr2, c_ptr2, #0x10\n" - ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "str q27, [c_ptr3]\n" - "movi v27.4s, #0\n" "add c_ptr3, c_ptr3, #0x10\n" - ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" + "movi v27.4s, #0\n" + "ldr d19, [%[b_ptr0], #0x30]\n" + "ldr temploadreg3, [%[b_ptr0], #0x38]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "str q28, [c_ptr4]\n" - "movi v28.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" - ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" + "movi v28.4s, #0\n" + "ldr d20, [%[b_ptr0], #0x40]\n" + "ins v16.d[1], temploadreg0\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "str q29, [c_ptr5]\n" - "movi v29.4s, #0\n" "add c_ptr5, c_ptr5, #0x10\n" - ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" + "movi v29.4s, #0\n" + "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" + "ldr d21, [%[b_ptr0], #0x50]\n" + ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" + "ins v17.d[1], temploadreg1\n" + ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" - "add c_ptr7, c_ptr7, #0x10\n" + "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" + "ldr d22, [%[b_ptr0], #0x60]\n" + ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" + "ins v18.d[1], temploadreg2\n" + ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" + "ldr temploadreg2, [%[b_ptr0], #0x68]\n" ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ldr d23, [%[b_ptr0], #0x70]\n" ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" - "ldr d16, [%[b_ptr0]]\n" + "ins v19.d[1], temploadreg3\n" ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "ldr temploadreg3, [%[b_ptr0], #0x78]\n" ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" - "ins v16.d[1], temploadreg0\n" + "ins v20.d[1], temploadreg0\n" ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n" - "ldr temploadreg0, [%[b_ptr0], #0x48]\n" + "ins v21.d[1], temploadreg1\n" ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "ins v22.d[1], temploadreg2\n" ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "ins v23.d[1], temploadreg3\n" ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + "add c_ptr6, c_ptr6, #0x10\n" ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + "add c_ptr7, c_ptr7, #0x10\n" ".inst 
0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n" - "ldr d17, [%[b_ptr0], #0x10]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n" - "ins v17.d[1], temploadreg1\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n" - "ldr temploadreg1, [%[b_ptr0], #0x58]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n" "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n" ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n" ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n" ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n" - "ldr d18, [%[b_ptr0], #0x20]\n" ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n" ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n" ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n" - "ins v18.d[1], temploadreg2\n" ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n" - "ldr temploadreg2, [%[b_ptr0], #0x68]\n" ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n" ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n" ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n" ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n" - "ldr d19, [%[b_ptr0], #0x30]\n" ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n" ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n" ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n" - "ins v19.d[1], temploadreg3\n" ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n" - "ldr temploadreg3, [%[b_ptr0], #0x78]\n" ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n" ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n" ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n" ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n" - "ldr d20, [%[b_ptr0], #0x40]\n" ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n" ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n" ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n" - "ins v20.d[1], temploadreg0\n" ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n" ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n" ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n" ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n" ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n" - "ldr d21, [%[b_ptr0], #0x50]\n" ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n" ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n" ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n" - "ins v21.d[1], temploadreg1\n" ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n" ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n" ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n" ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n" ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n" - "ldr d22, [%[b_ptr0], #0x60]\n" ".inst 0x6fa1eaf8 // udot v24.4s, v23.16b, v1.4b[3]\n" ".inst 0x6fa3eaf9 // udot v25.4s, v23.16b, v3.4b[3]\n" ".inst 0x6fa5eafa // udot v26.4s, v23.16b, v5.4b[3]\n" - "ins v22.d[1], temploadreg2\n" ".inst 0x6fa7eafb // udot v27.4s, v23.16b, v7.4b[3]\n" ".inst 0x6fa9eafc // udot v28.4s, v23.16b, v9.4b[3]\n" ".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n" ".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n" ".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n" - "ldr d23, [%[b_ptr0], #0x70]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" "b.ne 8b\n" "7:\n" 
"str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" - "ins v23.d[1], temploadreg3\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "ldr q20, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q21, [%[b_ptr0], #0x50]\n" ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" + "ldr q22, [%[b_ptr0], #0x60]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "ldr q23, [%[b_ptr0], #0x70]\n" ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" + "add c_ptr6, c_ptr6, #0x10\n" + ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" "add c_ptr7, c_ptr7, #0x10\n" + ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n" + ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n" + ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n" + ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n" + ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n" + ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n" + ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" + ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n" + ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n" + ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n" + ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n" + ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n" + ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n" + ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n" + ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n" + ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n" + ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n" + ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n" + ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n" + ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n" + ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n" + ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n" + ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n" + ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n" + ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n" + ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n" + ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n" + ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n" + ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n" + ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n" + ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n" + ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n" + ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n" + ".inst 0x6fa7e2bb // udot v27.4s, 
v21.16b, v7.4b[1]\n" + ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n" + ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n" + ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n" + ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n" + ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n" + ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n" + ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n" + ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n" + ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n" + ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n" + ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n" + ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n" + ".inst 0x6fa1eaf8 // udot v24.4s, v23.16b, v1.4b[3]\n" + ".inst 0x6fa3eaf9 // udot v25.4s, v23.16b, v3.4b[3]\n" + ".inst 0x6fa5eafa // udot v26.4s, v23.16b, v5.4b[3]\n" + ".inst 0x6fa7eafb // udot v27.4s, v23.16b, v7.4b[3]\n" + ".inst 0x6fa9eafc // udot v28.4s, v23.16b, v9.4b[3]\n" + ".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n" + ".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n" + ".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n" + "b 9f\n" + "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" + ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" + ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" + ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" + ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" - ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n" ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n" @@ -3039,23 +3310,16 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_ ".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n" ".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n" ".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n" - "6:\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp similarity index 85% rename from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/generic.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp index 2a7dd3d88d..aeea051662 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. 
+ * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -33,7 +33,7 @@ namespace arm_gemm { -void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) { +void a64_smallK_hybrid_u8u32_dot_8x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) { const long loops_count = iceildiv(N, (int)4) - 1; const long ldab = lda * sizeof(uint8_t); const long ldcb = ldc * sizeof(uint32_t); @@ -153,22 +153,24 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "ld1 {v6.b}[2], [a_ptr6]\n" "ld1 {v7.b}[2], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" - "movi v26.4s, #0\n" "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" - "movi v27.4s, #0\n" "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" - "movi v28.4s, #0\n" "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" - "movi v29.4s, #0\n" "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" - "movi v30.4s, #0\n" "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" - "movi v31.4s, #0\n" "add %[b_ptr0], %[b_ptr0], #0x10\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" @@ -177,20 +179,17 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n" ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" - "cbz %[loops], 6f\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" + "ldr q16, [%[b_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" @@ -216,10 +215,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "movi v31.4s, #0\n" "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" - "ldr q16, [%[b_ptr0]]\n" - "add %[b_ptr0], %[b_ptr0], #0x10\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" @@ -231,6 +229,8 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" + "ldr q16, [%[b_ptr0]]\n" + "add %[b_ptr0], %[b_ptr0], #0x10\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" @@ -260,23 +260,34 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + 
"movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" + ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" + ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n" + ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n" + ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n" + ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" + ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -407,24 +418,26 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "ld1 {v6.b}[6], [a_ptr6]\n" "ld1 {v7.b}[6], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" "movi v26.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "movi v27.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n" @@ -440,68 +453,66 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n" ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n" ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n" - "cbz %[loops], 6f\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" + "ldr q16, [%[b_ptr0]]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "add %[b_ptr0], %[b_ptr0], #0x20\n" - "str q25, [c_ptr1]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v25.4s, #0\n" + "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v25.4s, #0\n" + "add %[b_ptr0], %[b_ptr0], #0x20\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "add c_ptr3, c_ptr3, #0x10\n" ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" - "add c_ptr3, c_ptr3, 
#0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr6, c_ptr6, #0x10\n" ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" "add c_ptr7, c_ptr7, #0x10\n" + ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" - "ldr q16, [%[b_ptr0]]\n" - ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" - ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" - ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n" + ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n" "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" - ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n" + ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n" "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" - ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n" + ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n" "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" - ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n" + ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n" "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" - ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n" + ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n" "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" + ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n" ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n" "b.ne 8b\n" "7:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" + "ldr q16, [%[b_ptr0]]\n" "ldr q17, [%[b_ptr0], #0x10]\n" "add %[b_ptr0], %[b_ptr0], #0x20\n" "str q25, [c_ptr1]\n" @@ -541,23 +552,42 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n" ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n" ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" + ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" + ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n" + ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n" + ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n" + ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" + ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" + ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n" + ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n" + ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n" + ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n" + ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n" + ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n" + ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add 
c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -688,26 +718,28 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "ld1 {v6.b}[10], [a_ptr6]\n" "ld1 {v7.b}[10], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ldr q18, [%[b_ptr0], #0x20]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x30\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" "movi v27.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x30\n" ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n" ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n" @@ -730,49 +762,46 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n" ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n" ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n" - "cbz %[loops], 6f\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr q17, [%[b_ptr0], #0x10]\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "add %[b_ptr0], %[b_ptr0], #0x30\n" - "str q25, [c_ptr1]\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v25.4s, #0\n" + "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "add c_ptr3, c_ptr3, #0x10\n" ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr6, c_ptr6, #0x10\n" ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x6f86e21e // udot v30.4s, 
v16.16b, v6.4b[0]\n" "add c_ptr7, c_ptr7, #0x10\n" + ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x30\n" ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" - "ldr q16, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n" @@ -788,7 +817,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n" "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n" ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n" @@ -802,11 +830,12 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "add %[b_ptr0], %[b_ptr0], #0x30\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" @@ -832,8 +861,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "movi v31.4s, #0\n" "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" - ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" + "add %[b_ptr0], %[b_ptr0], #0x30\n" ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n" ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n" ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n" @@ -849,23 +879,50 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n" ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n" ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" + ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" + ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n" + ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n" + ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n" + ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" + ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" + ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n" + ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n" + ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n" + ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n" + ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n" + ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n" + ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n" + ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" + ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n" + ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n" + ".inst 0x6f83ea5b // udot v27.4s, v18.16b, 
v3.4b[2]\n" + ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n" + ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n" + ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n" + ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -1004,28 +1061,30 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "ld1 {v6.b}[14], [a_ptr6]\n" "ld1 {v7.b}[14], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ldr q18, [%[b_ptr0], #0x20]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ldr q19, [%[b_ptr0], #0x30]\n" - "movi v28.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" - "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n" ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n" ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n" @@ -1055,50 +1114,47 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n" ".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n" ".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n" - "cbz %[loops], 6f\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" - "str q25, [c_ptr1]\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v25.4s, #0\n" + "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "add c_ptr3, c_ptr3, #0x10\n" ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" "str q28, [c_ptr4]\n" 
"movi v28.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr6, c_ptr6, #0x10\n" ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" "add c_ptr7, c_ptr7, #0x10\n" + ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" - "ldr q16, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n" @@ -1114,7 +1170,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n" "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n" ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n" @@ -1123,7 +1178,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n" ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n" ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n" ".inst 0x6fa1ea79 // udot v25.4s, v19.16b, v1.4b[3]\n" ".inst 0x6fa2ea7a // udot v26.4s, v19.16b, v2.4b[3]\n" @@ -1137,14 +1191,16 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "add %[b_ptr0], %[b_ptr0], #0x40\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" + "ldr q19, [%[b_ptr0], #0x30]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" "str q27, [c_ptr3]\n" @@ -1167,8 +1223,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "movi v31.4s, #0\n" "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" - ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" + "add %[b_ptr0], %[b_ptr0], #0x40\n" ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n" ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n" ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n" @@ -1192,23 +1249,58 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n" ".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n" ".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi 
v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n" + ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n" + ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n" + ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n" + ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n" + ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n" + ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" + ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n" + ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n" + ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n" + ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n" + ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n" + ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n" + ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n" + ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" + ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n" + ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n" + ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n" + ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n" + ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n" + ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n" + ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n" + ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n" + ".inst 0x6fa1ea79 // udot v25.4s, v19.16b, v1.4b[3]\n" + ".inst 0x6fa2ea7a // udot v26.4s, v19.16b, v2.4b[3]\n" + ".inst 0x6fa3ea7b // udot v27.4s, v19.16b, v3.4b[3]\n" + ".inst 0x6fa4ea7c // udot v28.4s, v19.16b, v4.4b[3]\n" + ".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n" + ".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n" + ".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -1287,26 +1379,34 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q2, [a_ptr1], #0x10\n" "ldr q4, [a_ptr2], #0x10\n" "ldr q6, [a_ptr3], #0x10\n" - "ldr q8, [a_ptr4], #0x10\n" - "ldr q10, [a_ptr5], #0x10\n" - "ldr q12, [a_ptr6], #0x10\n" - "ldr q14, [a_ptr7], #0x10\n" - "cbnz %[odds], 2f\n" "ldr s1, [%[a_ptr0]]\n" + "ldr q8, [a_ptr4], #0x10\n" "ldr s3, [a_ptr1]\n" + "ldr q10, [a_ptr5], #0x10\n" "ldr s5, [a_ptr2]\n" + "ldr q12, [a_ptr6], #0x10\n" "ldr s7, [a_ptr3]\n" + "ldr q14, [a_ptr7], #0x10\n" "ldr s9, [a_ptr4]\n" "ldr s11, [a_ptr5]\n" "ldr s13, [a_ptr6]\n" "ldr s15, [a_ptr7]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" "subs %[odds], %[odds], #0x1\n" + "ldr q2, [a_ptr1], #0x10\n" + "ldr q4, [a_ptr2], #0x10\n" + "ldr q6, [a_ptr3], #0x10\n" + "ldr q8, [a_ptr4], #0x10\n" + "ldr q10, [a_ptr5], #0x10\n" + "ldr q12, [a_ptr6], #0x10\n" + "ldr q14, [a_ptr7], #0x10\n" "b.ne 4f\n" "ldr b1, [%[a_ptr0]]\n" "ldr b3, [a_ptr1]\n" @@ -1339,30 +1439,32 @@ void 
a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "ld1 {v13.b}[2], [a_ptr6]\n" "ld1 {v15.b}[2], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ldr q18, [%[b_ptr0], #0x20]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ldr q19, [%[b_ptr0], #0x30]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ldr q20, [%[b_ptr0], #0x40]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x50\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" "movi v29.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x50\n" ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" @@ -1399,51 +1501,48 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n" ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n" ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n" - "cbz %[loops], 6f\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" - "ldr q20, [%[b_ptr0], #0x40]\n" - "add %[b_ptr0], %[b_ptr0], #0x50\n" - "str q25, [c_ptr1]\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v25.4s, #0\n" + "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "ldr q20, [%[b_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "add c_ptr3, c_ptr3, #0x10\n" ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr6, c_ptr6, #0x10\n" ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" "add c_ptr7, c_ptr7, #0x10\n" + ".inst 
0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x50\n" ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" - "ldr q16, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" @@ -1459,7 +1558,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n" "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n" ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n" @@ -1468,7 +1566,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n" ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n" ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n" ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n" ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n" @@ -1477,7 +1574,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n" ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n" ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n" ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n" ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n" @@ -1491,14 +1587,17 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" - "ldr q20, [%[b_ptr0], #0x40]\n" - "add %[b_ptr0], %[b_ptr0], #0x50\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "ldr q20, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" "str q27, [c_ptr3]\n" @@ -1521,8 +1620,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "movi v31.4s, #0\n" "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" - ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" + "add %[b_ptr0], %[b_ptr0], #0x50\n" ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n" ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n" @@ -1554,23 +1654,66 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n" ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n" ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" + 
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" + ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" + ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" + ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" + ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" + ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" + ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" + ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" + ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n" + ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n" + ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n" + ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n" + ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n" + ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n" + ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" + ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n" + ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n" + ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n" + ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n" + ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n" + ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n" + ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n" + ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n" + ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n" + ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n" + ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n" + ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n" + ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n" + ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n" + ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n" + ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n" + ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n" + ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n" + ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n" + ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n" + ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n" + ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n" + ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -1649,34 +1792,42 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q2, [a_ptr1], #0x10\n" "ldr q4, [a_ptr2], #0x10\n" "ldr q6, [a_ptr3], #0x10\n" + "ldr d1, [%[a_ptr0]]\n" "ldr q8, [a_ptr4], #0x10\n" + "ldr d3, [a_ptr1]\n" "ldr q10, [a_ptr5], #0x10\n" + "ldr d5, [a_ptr2]\n" "ldr q12, [a_ptr6], #0x10\n" + "ldr d7, [a_ptr3]\n" "ldr q14, [a_ptr7], #0x10\n" - "cbnz %[odds], 2f\n" - "ldr d1, [%[a_ptr0]]\n" - "ldr d3, [a_ptr1]\n" - "ldr d5, [a_ptr2]\n" - "ldr d7, [a_ptr3]\n" "ldr d9, [a_ptr4]\n" "ldr d11, [a_ptr5]\n" "ldr d13, [a_ptr6]\n" "ldr d15, [a_ptr7]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" + "subs %[odds], %[odds], #0x1\n" + 
"ldr q2, [a_ptr1], #0x10\n" + "ldr q4, [a_ptr2], #0x10\n" "ldr s1, [%[a_ptr0]], #0x4\n" + "ldr q6, [a_ptr3], #0x10\n" "ldr s3, [a_ptr1], #0x4\n" + "ldr q8, [a_ptr4], #0x10\n" "ldr s5, [a_ptr2], #0x4\n" + "ldr q10, [a_ptr5], #0x10\n" "ldr s7, [a_ptr3], #0x4\n" + "ldr q12, [a_ptr6], #0x10\n" "ldr s9, [a_ptr4], #0x4\n" + "ldr q14, [a_ptr7], #0x10\n" "ldr s11, [a_ptr5], #0x4\n" "ldr s13, [a_ptr6], #0x4\n" "ldr s15, [a_ptr7], #0x4\n" - "subs %[odds], %[odds], #0x1\n" "b.ne 4f\n" "ld1 {v1.b}[4], [%[a_ptr0]]\n" "ld1 {v3.b}[4], [a_ptr1]\n" @@ -1709,32 +1860,34 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "ld1 {v13.b}[6], [a_ptr6]\n" "ld1 {v15.b}[6], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ldr q18, [%[b_ptr0], #0x20]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ldr q19, [%[b_ptr0], #0x30]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ldr q20, [%[b_ptr0], #0x40]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ldr q21, [%[b_ptr0], #0x50]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "add %[b_ptr0], %[b_ptr0], #0x60\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" "movi v30.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x60\n" ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" @@ -1778,68 +1931,64 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n" ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n" ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n" - "cbz %[loops], 6f\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "ldr q20, [%[b_ptr0], #0x40]\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" - "ldr q21, [%[b_ptr0], #0x50]\n" - "add %[b_ptr0], %[b_ptr0], #0x60\n" - "str q25, [c_ptr1]\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v25.4s, #0\n" + "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "ldr q20, [%[b_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "ldr q21, [%[b_ptr0], 
#0x50]\n" ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr6, c_ptr6, #0x10\n" ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" - ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" "add c_ptr7, c_ptr7, #0x10\n" + ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x60\n" ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" - "ldr q16, [%[b_ptr0]]\n" - ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n" ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n" @@ -1848,7 +1997,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n" ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n" ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n" ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n" ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n" @@ -1857,7 +2005,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n" ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n" ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n" ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n" ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n" @@ -1866,7 +2013,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n" ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n" ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n" - "ldr q20, [%[b_ptr0], #0x40]\n" ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n" ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n" 
".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n" @@ -1880,20 +2026,25 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" - "ldr q21, [%[b_ptr0], #0x50]\n" - "add %[b_ptr0], %[b_ptr0], #0x60\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "ldr q20, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q21, [%[b_ptr0], #0x50]\n" ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" @@ -1910,7 +2061,7 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "movi v31.4s, #0\n" "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" - ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" + "add %[b_ptr0], %[b_ptr0], #0x60\n" ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n" @@ -1951,23 +2102,74 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n" ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n" ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" + ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" + ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" + ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" + ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" + ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" + ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" + ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" + ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n" + ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n" + ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n" + ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n" + ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n" + ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n" + ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" + ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n" + ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n" + ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n" + ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n" + ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n" + ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n" + ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n" + ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n" + ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n" + ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n" + ".inst 
0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n" + ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n" + ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n" + ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n" + ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n" + ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n" + ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n" + ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n" + ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n" + ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n" + ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n" + ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n" + ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n" + ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n" + ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n" + ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n" + ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n" + ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n" + ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n" + ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n" + ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -2106,34 +2308,36 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "ld1 {v13.b}[10], [a_ptr6]\n" "ld1 {v15.b}[10], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ldr q18, [%[b_ptr0], #0x20]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ldr q19, [%[b_ptr0], #0x30]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ldr q20, [%[b_ptr0], #0x40]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ldr q21, [%[b_ptr0], #0x50]\n" - "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ldr q22, [%[b_ptr0], #0x60]\n" + "add %[b_ptr0], %[b_ptr0], #0x70\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" "movi v31.4s, #0\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x70\n" ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" @@ -2184,34 +2388,31 @@ void 
a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n" ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n" ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n" - "cbz %[loops], 6f\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "ldr q20, [%[b_ptr0], #0x40]\n" - "ldr q21, [%[b_ptr0], #0x50]\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" - "ldr q22, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x70\n" - "str q25, [c_ptr1]\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v25.4s, #0\n" + "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "ldr q20, [%[b_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "ldr q21, [%[b_ptr0], #0x50]\n" ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" + "ldr q22, [%[b_ptr0], #0x60]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" "add c_ptr3, c_ptr3, #0x10\n" @@ -2230,24 +2431,23 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" - "ldr q16, [%[b_ptr0]]\n" - ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "add %[b_ptr0], %[b_ptr0], #0x70\n" ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n" ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n" ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n" @@ -2255,7 +2455,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n" ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n" ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n" ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n" ".inst 
0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n" @@ -2264,7 +2463,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n" ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n" ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n" ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n" ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n" @@ -2273,7 +2471,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n" ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n" ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n" - "ldr q20, [%[b_ptr0], #0x40]\n" ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n" ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n" ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n" @@ -2282,7 +2479,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n" ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n" ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n" - "ldr q21, [%[b_ptr0], #0x50]\n" ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n" ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n" ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n" @@ -2296,38 +2492,44 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" - "ldr q22, [%[b_ptr0], #0x60]\n" - "add %[b_ptr0], %[b_ptr0], #0x70\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "ldr q20, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q21, [%[b_ptr0], #0x50]\n" ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" + "ldr q22, [%[b_ptr0], #0x60]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "add c_ptr3, c_ptr3, #0x10\n" ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" - "add c_ptr7, c_ptr7, #0x10\n" + "add c_ptr6, c_ptr6, #0x10\n" ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" - ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" + "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x70\n" ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n" ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n" @@ -2375,23 +2577,82 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n" ".inst 
0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n" ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n" + "b 9f\n" "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" + ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" + ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" + ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" + ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" + ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" + ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" + ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" + ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n" + ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n" + ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n" + ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n" + ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n" + ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n" + ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" + ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n" + ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n" + ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n" + ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n" + ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n" + ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n" + ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n" + ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n" + ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n" + ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n" + ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n" + ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n" + ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n" + ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n" + ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n" + ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n" + ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n" + ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n" + ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n" + ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n" + ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n" + ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n" + ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n" + ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n" + ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n" + ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n" + ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n" + ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n" + ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n" + ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n" + ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n" + ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n" + ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n" + ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n" + ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n" + ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n" + ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n" + ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n" + ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add 
%[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -2471,6 +2732,7 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "cbnz %[odds], 2f\n" "ldr q0, [%[a_ptr0]], #0x10\n" "ldr q2, [a_ptr1], #0x10\n" "ldr q4, [a_ptr2], #0x10\n" @@ -2479,7 +2741,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "ldr q10, [a_ptr5], #0x10\n" "ldr q12, [a_ptr6], #0x10\n" "ldr q14, [a_ptr7], #0x10\n" - "cbnz %[odds], 2f\n" "ldr q1, [%[a_ptr0]]\n" "ldr q3, [a_ptr1]\n" "ldr q5, [a_ptr2]\n" @@ -2490,15 +2751,24 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "ldr q15, [a_ptr7]\n" "b 3f\n" "2:\n" + "ldr q0, [%[a_ptr0]], #0x10\n" + "subs %[odds], %[odds], #0x1\n" + "ldr q2, [a_ptr1], #0x10\n" + "ldr q4, [a_ptr2], #0x10\n" "ldr d1, [%[a_ptr0]], #0x8\n" + "ldr q6, [a_ptr3], #0x10\n" "ldr d3, [a_ptr1], #0x8\n" + "ldr q8, [a_ptr4], #0x10\n" "ldr d5, [a_ptr2], #0x8\n" + "ldr q10, [a_ptr5], #0x10\n" "ldr d7, [a_ptr3], #0x8\n" + "ldr q12, [a_ptr6], #0x10\n" "ldr d9, [a_ptr4], #0x8\n" + "ldr q14, [a_ptr7], #0x10\n" "ldr d11, [a_ptr5], #0x8\n" "ldr d13, [a_ptr6], #0x8\n" - "ldr d15, [a_ptr7], #0x8\n" "ld1 {v1.s}[2], [%[a_ptr0]], #4\n" + "ldr d15, [a_ptr7], #0x8\n" "ld1 {v3.s}[2], [a_ptr1], #4\n" "ld1 {v5.s}[2], [a_ptr2], #4\n" "ld1 {v7.s}[2], [a_ptr3], #4\n" @@ -2506,7 +2776,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "ld1 {v11.s}[2], [a_ptr5], #4\n" "ld1 {v13.s}[2], [a_ptr6], #4\n" "ld1 {v15.s}[2], [a_ptr7], #4\n" - "subs %[odds], %[odds], #0x1\n" "b.ne 4f\n" "ld1 {v1.b}[12], [%[a_ptr0]]\n" "ld1 {v3.b}[12], [a_ptr1]\n" @@ -2539,36 +2808,38 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "ld1 {v13.b}[14], [a_ptr6]\n" "ld1 {v15.b}[14], [a_ptr7]\n" "3:\n" - "movi v24.4s, #0\n" "ldr q16, [%[b_ptr0]]\n" - "movi v25.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ldr q17, [%[b_ptr0], #0x10]\n" - "movi v26.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ldr q18, [%[b_ptr0], #0x20]\n" - "movi v27.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ldr q19, [%[b_ptr0], #0x30]\n" - "movi v28.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ldr q20, [%[b_ptr0], #0x40]\n" - "movi v29.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ldr q21, [%[b_ptr0], #0x50]\n" - "movi v30.4s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ldr q22, [%[b_ptr0], #0x60]\n" - "movi v31.4s, #0\n" "ldr q23, [%[b_ptr0], #0x70]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + "cbz %[loops], 6f\n" + "movi v24.4s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" ".inst 0x6f86e21b // udot 
v27.4s, v16.16b, v6.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" @@ -2626,39 +2897,37 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n" ".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n" ".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n" - "cbz %[loops], 6f\n" - "ldr q16, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ldr q17, [%[b_ptr0], #0x10]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" - "ldr q20, [%[b_ptr0], #0x40]\n" - "ldr q21, [%[b_ptr0], #0x50]\n" - "ldr q22, [%[b_ptr0], #0x60]\n" "b.eq 7f\n" "8:\n" "str q24, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "movi v24.4s, #0\n" - "ldr q23, [%[b_ptr0], #0x70]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" - "str q25, [c_ptr1]\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" - "movi v25.4s, #0\n" + "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" + "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "ldr q20, [%[b_ptr0], #0x40]\n" + "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr2, c_ptr2, #0x10\n" + "ldr q21, [%[b_ptr0], #0x50]\n" ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" + "ldr q22, [%[b_ptr0], #0x60]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q23, [%[b_ptr0], #0x70]\n" ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" "add c_ptr4, c_ptr4, #0x10\n" @@ -2673,32 +2942,29 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" "add c_ptr7, c_ptr7, #0x10\n" ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" - "ldr q16, [%[b_ptr0]]\n" - ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" - ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" ".inst 0x6faee23f 
// udot v31.4s, v17.16b, v14.4b[1]\n" - "ldr q17, [%[b_ptr0], #0x10]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n" ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n" ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n" ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n" ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n" ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n" - "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n" ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n" ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n" @@ -2707,7 +2973,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n" ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n" ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n" - "ldr q19, [%[b_ptr0], #0x30]\n" ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n" ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n" ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n" @@ -2716,7 +2981,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n" ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n" ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n" - "ldr q20, [%[b_ptr0], #0x40]\n" ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n" ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n" ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n" @@ -2725,7 +2989,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n" ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n" ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n" - "ldr q21, [%[b_ptr0], #0x50]\n" ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n" ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n" ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n" @@ -2734,7 +2997,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n" ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n" ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n" - "ldr q22, [%[b_ptr0], #0x60]\n" ".inst 0x6fa1eaf8 // udot v24.4s, v23.16b, v1.4b[3]\n" ".inst 0x6fa3eaf9 // udot v25.4s, v23.16b, v3.4b[3]\n" ".inst 0x6fa5eafa // udot v26.4s, v23.16b, v5.4b[3]\n" @@ -2748,38 +3010,119 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "movi v24.4s, #0\n" - "ldr q23, [%[b_ptr0], #0x70]\n" - "add %[b_ptr0], %[b_ptr0], #0x80\n" + "ldr q16, [%[b_ptr0]]\n" + "ldr q17, [%[b_ptr0], #0x10]\n" "str q25, [c_ptr1]\n" "add c_ptr1, c_ptr1, #0x10\n" "movi v25.4s, #0\n" + "ldr q18, [%[b_ptr0], #0x20]\n" ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" "str q26, [c_ptr2]\n" "movi v26.4s, #0\n" + "ldr q19, [%[b_ptr0], #0x30]\n" + "ldr q20, [%[b_ptr0], #0x40]\n" "add c_ptr2, c_ptr2, #0x10\n" ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" "str q27, [c_ptr3]\n" "movi v27.4s, #0\n" - "add c_ptr3, c_ptr3, #0x10\n" + "ldr q21, [%[b_ptr0], #0x50]\n" ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" + "ldr 
q22, [%[b_ptr0], #0x60]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" "str q28, [c_ptr4]\n" "movi v28.4s, #0\n" - "add c_ptr4, c_ptr4, #0x10\n" + "ldr q23, [%[b_ptr0], #0x70]\n" ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" + "add c_ptr3, c_ptr3, #0x10\n" + ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" "str q29, [c_ptr5]\n" "movi v29.4s, #0\n" - "add c_ptr5, c_ptr5, #0x10\n" + "add c_ptr4, c_ptr4, #0x10\n" ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" "str q30, [c_ptr6]\n" "movi v30.4s, #0\n" - "add c_ptr6, c_ptr6, #0x10\n" + "add c_ptr5, c_ptr5, #0x10\n" ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" "str q31, [c_ptr7]\n" "movi v31.4s, #0\n" + "add c_ptr6, c_ptr6, #0x10\n" + ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" "add c_ptr7, c_ptr7, #0x10\n" + ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" + "add %[b_ptr0], %[b_ptr0], #0x80\n" + ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n" + ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n" + ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n" + ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n" + ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n" + ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n" + ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n" + ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n" + ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n" + ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n" + ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n" + ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n" + ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n" + ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n" + ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n" + ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n" + ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n" + ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n" + ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n" + ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n" + ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n" + ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n" + ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n" + ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n" + ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n" + ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n" + ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n" + ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n" + ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n" + ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n" + ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n" + ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n" + ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n" + ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n" + ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n" + ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n" + ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n" + ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n" + ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n" + ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n" + ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n" + ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n" + ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n" + ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n" + ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n" + ".inst 0x6f8feadf // udot v31.4s, v22.16b, 
v15.4b[2]\n" + ".inst 0x6fa1eaf8 // udot v24.4s, v23.16b, v1.4b[3]\n" + ".inst 0x6fa3eaf9 // udot v25.4s, v23.16b, v3.4b[3]\n" + ".inst 0x6fa5eafa // udot v26.4s, v23.16b, v5.4b[3]\n" + ".inst 0x6fa7eafb // udot v27.4s, v23.16b, v7.4b[3]\n" + ".inst 0x6fa9eafc // udot v28.4s, v23.16b, v9.4b[3]\n" + ".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n" + ".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n" + ".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n" + "b 9f\n" + "6:\n" + "movi v24.4s, #0\n" + "movi v25.4s, #0\n" + "movi v26.4s, #0\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + "movi v29.4s, #0\n" + "movi v30.4s, #0\n" + "movi v31.4s, #0\n" + ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n" + ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n" + ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n" + ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n" + ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n" + ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n" ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n" - ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n" + ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n" ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n" ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n" ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n" @@ -2835,23 +3178,16 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B ".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n" ".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n" ".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n" - "6:\n" + "9:\n" "str q24, [%[c_ptr0]]\n" "add %[c_ptr0], %[c_ptr0], #0x10\n" "str q25, [c_ptr1]\n" - "add c_ptr1, c_ptr1, #0x10\n" "str q26, [c_ptr2]\n" - "add c_ptr2, c_ptr2, #0x10\n" "str q27, [c_ptr3]\n" - "add c_ptr3, c_ptr3, #0x10\n" "str q28, [c_ptr4]\n" - "add c_ptr4, c_ptr4, #0x10\n" "str q29, [c_ptr5]\n" - "add c_ptr5, c_ptr5, #0x10\n" "str q30, [c_ptr6]\n" - "add c_ptr6, c_ptr6, #0x10\n" "str q31, [c_ptr7]\n" - "add c_ptr7, c_ptr7, #0x10\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp similarity index 72% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp index 1bc8021e76..57fd9c909e 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp @@ -23,34 +23,28 @@ */ #pragma once -#ifdef __ARM_FEATURE_SVE - +#ifdef __aarch64__ +#include "../performance_parameters.hpp" #include "../std_transforms_sve.hpp" namespace arm_gemm { // Actual kernel implementations -void sve_hybrid_fp32_mla_4VLx4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); +void sve_gemv_fp32_mla_8VL(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool); -class hybrid_fp32_mla_4VLx4 +class cls_sve_gemv_fp32_mla_8VL { public: typedef float operand_type; typedef float result_type; - typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); - - /* Kernel blocking parameters */ - static constexpr unsigned int out_height() - { - return 4; - } + typedef void (*kern_type)(const float 
*, const float *, float *, size_t, size_t, const float *, Activation, bool); static unsigned int out_width() { - return get_vector_length<float>() * 4; + return 8 * get_vector_length<float>(); } static constexpr unsigned int k_unroll() @@ -60,7 +54,7 @@ class hybrid_fp32_mla_4VLx4 static constexpr bool supports_accumulate() { - return true; + return false; } static constexpr bool supports_bias() @@ -73,17 +67,16 @@ class hybrid_fp32_mla_4VLx4 return true; } - StdTransformsSVE transforms = {}; + StdTransformsSVE transforms = {}; // Default to the generic kernel - kern_type kernel=sve_hybrid_fp32_mla_4VLx4; + kern_type kernel=sve_gemv_fp32_mla_8VL; - hybrid_fp32_mla_4VLx4(const CPUInfo *) + cls_sve_gemv_fp32_mla_8VL(const CPUInfo *) { - } }; } // namespace arm_gemm -#endif // __ARM_FEATURE_SVE +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp new file mode 100644 index 0000000000..c62e31936c --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp @@ -0,0 +1,1372 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE.
+ */ +#ifdef __ARM_FEATURE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <limits> + +namespace arm_gemm { + +void sve_gemv_fp32_mla_8VL ( + const float *A_ptr, const float *B_ptr, float *output_ptr, + size_t N, size_t K, + const float *bias, Activation act, bool +) +{ + struct KernelArgs { + float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); + float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); + const float *B_ptr = {}; + size_t output_offset = {}; + unsigned int input_initial_col = {}; + } ka; + + unsigned long flags=0; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast<float>(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + "ptrue p2.b\n" + "cntw x24\n" + "add x23, %x[N], x24\n" + "sub x23, x23, #0x1\n" + "udiv x23, x23, x24\n" + "mov x22, %x[bias]\n" + "1:" // Column loop + "cmp x23, #0x8\n" + "bge 50f\n" + "cmp x23, #0x6\n" + "bgt 43f\n" + "beq 36f\n" + "cmp x23, #0x4\n" + "bgt 29f\n" + "beq 22f\n" + "cmp x23, #0x2\n" + "bgt 15f\n" + "beq 8f\n" + "mov x21, %x[K]\n" + "mov x20, %x[A_ptr]\n" + "whilelt p1.s, XZR, %x[N]\n" + "cbz x22, 2f\n" + "ld1w { z24.s }, p2/Z, [x22]\n" + "addvl x22, x22, #1\n" + "b 3f\n" + "2:" // Width 1: no bias + "mov z24.b, #0x0\n" + "3:" // Width 1: setup done + "cmp x21, #0x4\n" + "ble 5f\n" + "4:" // Width 1: Multiply loop: Main loop head + "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z1.s, z0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "add x20, x20, #0x10\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "sub x21, x21, #0x4\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z2.s, z0.s[1]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "cmp x21, #0x4\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z3.s, z0.s[2]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z4.s, z0.s[3]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "bgt 4b\n" + "5:" // Width 1: Multiply loop: Single iteration only + "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z5.s, z0.s[0]\n" + "add x20, x20, #0x10\n" + "subs x21, x21, #0x1\n" + "ble 6f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z6.s, z0.s[1]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "ble 6f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z7.s, z0.s[2]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "ble 6f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z8.s, z0.s[3]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "6:" // Width 1: Multiply loop: multiply skip + "prfm
pldl1keep, [x20, #0x80]\n" + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 7f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "7:" // Width 1: No activation + "st1w { z24.s }, p1, [%x[output_ptr]]\n" + "addvl %x[output_ptr], %x[output_ptr], #1\n" + "b 57f\n" + "8:" // Width 2 + "mov x21, %x[K]\n" + "mov x20, %x[A_ptr]\n" + "sub x19, %x[N], x24\n" + "whilelt p1.s, XZR, x19\n" + "cbz x22, 9f\n" + "ld1w { z24.s }, p2/Z, [x22]\n" + "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n" + "addvl x22, x22, #2\n" + "b 10f\n" + "9:" // Width 2: no bias + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "10:" // Width 2: setup done + "cmp x21, #0x4\n" + "ble 12f\n" + "11:" // Width 2: Multiply loop: Main loop head + "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z1.s, z0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z2.s, z0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "sub x21, x21, #0x4\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z3.s, z0.s[1]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z25.s, z4.s, z0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "cmp x21, #0x4\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z5.s, z0.s[2]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z25.s, z6.s, z0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z7.s, z0.s[3]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z25.s, z8.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "bgt 11b\n" + "12:" // Width 2: Multiply loop: Single iteration only + "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z9.s, z0.s[0]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z10.s, z0.s[0]\n" + "subs x21, x21, #0x1\n" + "ble 13f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z11.s, z0.s[1]\n" + "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z25.s, z12.s, z0.s[1]\n" + "ble 13f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z13.s, z0.s[2]\n" + "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z25.s, z14.s, z0.s[2]\n" + "ble 13f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z15.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z15.s, z0.s[3]\n" + "ld1w { z16.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z25.s, z16.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, 
[%x[B_ptr], #0x440]\n" + "13:" // Width 2: Multiply loop: multiply skip + "prfm pldl1keep, [x20, #0x80]\n" + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 14f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "14:" // Width 2: No activation + "st1w { z24.s }, p2, [%x[output_ptr]]\n" + "st1w { z25.s }, p1, [%x[output_ptr], #1, MUL VL]\n" + "addvl %x[output_ptr], %x[output_ptr], #2\n" + "b 57f\n" + "15:" // Width 3 + "mov x21, %x[K]\n" + "mov x20, %x[A_ptr]\n" + "mov x19, #0x2\n" + "msub x19, x24, x19, %x[N]\n" + "whilelt p1.s, XZR, x19\n" + "cbz x22, 16f\n" + "ld1w { z24.s }, p2/Z, [x22]\n" + "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n" + "addvl x22, x22, #3\n" + "b 17f\n" + "16:" // Width 3: no bias + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "17:" // Width 3: setup done + "cmp x21, #0x4\n" + "ble 19f\n" + "18:" // Width 3: Multiply loop: Main loop head + "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "sub x21, x21, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z1.s, z0.s[0]\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z2.s, z0.s[0]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z26.s, z3.s, z0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "cmp x21, #0x4\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z4.s, z0.s[1]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z5.s, z0.s[1]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z26.s, z6.s, z0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z7.s, z0.s[2]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z8.s, z0.s[2]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z26.s, z9.s, z0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z10.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z10.s, z0.s[3]\n" + "ld1w { z11.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z12.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z11.s, z0.s[3]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z26.s, z12.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "bgt 18b\n" + "19:" // Width 3: Multiply loop: Single iteration only + "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "subs x21, x21, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z13.s, z0.s[0]\n" + "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z14.s, z0.s[0]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z26.s, z15.s, z0.s[0]\n" + "ble 20f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z16.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z16.s, z0.s[1]\n" + "ld1w { z17.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z18.s 
}, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z17.s, z0.s[1]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z26.s, z18.s, z0.s[1]\n" + "ble 20f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z19.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z19.s, z0.s[2]\n" + "ld1w { z20.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z21.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z20.s, z0.s[2]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z26.s, z21.s, z0.s[2]\n" + "ble 20f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z22.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z22.s, z0.s[3]\n" + "ld1w { z23.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z1.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z23.s, z0.s[3]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z26.s, z1.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "20:" // Width 3: Multiply loop: multiply skip + "prfm pldl1keep, [x20, #0x80]\n" + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 21f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmin z26.s, p2/M, z26.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "fmax z26.s, p2/M, z26.s, z17.s\n" + "21:" // Width 3: No activation + "st1w { z24.s }, p2, [%x[output_ptr]]\n" + "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n" + "st1w { z26.s }, p1, [%x[output_ptr], #2, MUL VL]\n" + "addvl %x[output_ptr], %x[output_ptr], #3\n" + "b 57f\n" + "22:" // Width 4 + "mov x21, %x[K]\n" + "mov x20, %x[A_ptr]\n" + "mov x19, #0x3\n" + "msub x19, x24, x19, %x[N]\n" + "whilelt p1.s, XZR, x19\n" + "cbz x22, 23f\n" + "ld1w { z24.s }, p2/Z, [x22]\n" + "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n" + "addvl x22, x22, #4\n" + "b 24f\n" + "23:" // Width 4: no bias + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "24:" // Width 4: setup done + "cmp x21, #0x4\n" + "ble 26f\n" + "25:" // Width 4: Multiply loop: Main loop head + "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "sub x21, x21, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z1.s, z0.s[0]\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z2.s, z0.s[0]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z26.s, z3.s, z0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "cmp x21, #0x4\n" + "fmla z27.s, z4.s, z0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z5.s, z0.s[1]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z6.s, z0.s[1]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z26.s, z7.s, z0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z27.s, z8.s, z0.s[1]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n" + "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "fmla z24.s, z9.s, z0.s[2]\n" + "ld1w { z11.s }, p2/Z, 
[%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z10.s, z0.s[2]\n" + "ld1w { z12.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z26.s, z11.s, z0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z27.s, z12.s, z0.s[2]\n" + "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z13.s, z0.s[3]\n" + "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z14.s, z0.s[3]\n" + "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z26.s, z15.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z27.s, z16.s, z0.s[3]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "bgt 25b\n" + "26:" // Width 4: Multiply loop: Single iteration only + "ld1w { z17.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z18.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "subs x21, x21, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z17.s, z0.s[0]\n" + "ld1w { z19.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z18.s, z0.s[0]\n" + "ld1w { z20.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z26.s, z19.s, z0.s[0]\n" + "fmla z27.s, z20.s, z0.s[0]\n" + "ble 27f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z21.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z21.s, z0.s[1]\n" + "ld1w { z22.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z23.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z22.s, z0.s[1]\n" + "ld1w { z1.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z26.s, z23.s, z0.s[1]\n" + "fmla z27.s, z1.s, z0.s[1]\n" + "ble 27f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z2.s, z0.s[2]\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z3.s, z0.s[2]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z26.s, z4.s, z0.s[2]\n" + "fmla z27.s, z5.s, z0.s[2]\n" + "ble 27f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z6.s, z0.s[3]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z7.s, z0.s[3]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z26.s, z8.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z27.s, z9.s, z0.s[3]\n" + "27:" // Width 4: Multiply loop: multiply skip + "prfm pldl1keep, [x20, #0x80]\n" + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 28f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmin z26.s, p2/M, z26.s, z16.s\n" + "fmin z27.s, p2/M, z27.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "fmax z26.s, p2/M, z26.s, z17.s\n" + "fmax z27.s, p2/M, z27.s, z17.s\n" + "28:" // Width 4: No activation + "st1w { z24.s }, p2, [%x[output_ptr]]\n" + "st1w { z25.s }, p2, 
[%x[output_ptr], #1, MUL VL]\n" + "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n" + "st1w { z27.s }, p1, [%x[output_ptr], #3, MUL VL]\n" + "addvl %x[output_ptr], %x[output_ptr], #4\n" + "b 57f\n" + "29:" // Width 5 + "mov x21, %x[K]\n" + "mov x20, %x[A_ptr]\n" + "mov x19, #0x4\n" + "msub x19, x24, x19, %x[N]\n" + "whilelt p1.s, XZR, x19\n" + "cbz x22, 30f\n" + "ld1w { z24.s }, p2/Z, [x22]\n" + "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n" + "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n" + "addvl x22, x22, #5\n" + "b 31f\n" + "30:" // Width 5: no bias + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "31:" // Width 5: setup done + "cmp x21, #0x4\n" + "ble 33f\n" + "32:" // Width 5: Multiply loop: Main loop head + "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "sub x21, x21, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z1.s, z0.s[0]\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z2.s, z0.s[0]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "cmp x21, #0x4\n" + "fmla z26.s, z3.s, z0.s[0]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z27.s, z4.s, z0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z28.s, z5.s, z0.s[0]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "fmla z24.s, z6.s, z0.s[1]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z7.s, z0.s[1]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "fmla z26.s, z8.s, z0.s[1]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z27.s, z9.s, z0.s[1]\n" + "fmla z28.s, z10.s, z0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z11.s, z0.s[2]\n" + "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z13.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z12.s, z0.s[2]\n" + "ld1w { z14.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z13.s, z0.s[2]\n" + "ld1w { z15.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z27.s, z14.s, z0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z28.s, z15.s, z0.s[2]\n" + "ld1w { z16.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z16.s, z0.s[3]\n" + "ld1w { z17.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z17.s, z0.s[3]\n" + "ld1w { z19.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z18.s, z0.s[3]\n" + "ld1w { z20.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z27.s, z19.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z28.s, z20.s, z0.s[3]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "bgt 32b\n" + "33:" // Width 5: Multiply loop: Single iteration only + "ld1w { z21.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z22.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "subs x21, x21, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z21.s, z0.s[0]\n" + "ld1w { z23.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z22.s, 
z0.s[0]\n" + "ld1w { z1.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "fmla z26.s, z23.s, z0.s[0]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z27.s, z1.s, z0.s[0]\n" + "fmla z28.s, z2.s, z0.s[0]\n" + "ble 34f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z3.s, z0.s[1]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z4.s, z0.s[1]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z5.s, z0.s[1]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z27.s, z6.s, z0.s[1]\n" + "fmla z28.s, z7.s, z0.s[1]\n" + "ble 34f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z8.s, z0.s[2]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z9.s, z0.s[2]\n" + "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z10.s, z0.s[2]\n" + "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z27.s, z11.s, z0.s[2]\n" + "fmla z28.s, z12.s, z0.s[2]\n" + "ble 34f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z13.s, z0.s[3]\n" + "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z14.s, z0.s[3]\n" + "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z15.s, z0.s[3]\n" + "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z27.s, z16.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z28.s, z17.s, z0.s[3]\n" + "34:" // Width 5: Multiply loop: multiply skip + "prfm pldl1keep, [x20, #0x80]\n" + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 35f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmin z26.s, p2/M, z26.s, z16.s\n" + "fmin z27.s, p2/M, z27.s, z16.s\n" + "fmin z28.s, p2/M, z28.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "fmax z26.s, p2/M, z26.s, z17.s\n" + "fmax z27.s, p2/M, z27.s, z17.s\n" + "fmax z28.s, p2/M, z28.s, z17.s\n" + "35:" // Width 5: No activation + "st1w { z24.s }, p2, [%x[output_ptr]]\n" + "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n" + "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n" + "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n" + "st1w { z28.s }, p1, [%x[output_ptr], #4, MUL VL]\n" + "addvl %x[output_ptr], %x[output_ptr], #5\n" + "b 57f\n" + "36:" // Width 6 + "mov x21, %x[K]\n" + "mov x20, %x[A_ptr]\n" + "mov x19, #0x5\n" + "msub x19, x24, x19, %x[N]\n" + "whilelt p1.s, XZR, x19\n" + "cbz x22, 37f\n" + "ld1w { z24.s }, p2/Z, [x22]\n" + "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n" + "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n" + "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n" + "addvl x22, x22, #6\n" + "b 38f\n" + "37:" // Width 6: no bias + "mov 
z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "38:" // Width 6: setup done + "cmp x21, #0x4\n" + "ble 40f\n" + "39:" // Width 6: Multiply loop: Main loop head + "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "sub x21, x21, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z1.s, z0.s[0]\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z2.s, z0.s[0]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "cmp x21, #0x4\n" + "fmla z26.s, z3.s, z0.s[0]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z4.s, z0.s[0]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z28.s, z5.s, z0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z29.s, z6.s, z0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z7.s, z0.s[1]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z8.s, z0.s[1]\n" + "ld1w { z10.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z9.s, z0.s[1]\n" + "ld1w { z11.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z12.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z10.s, z0.s[1]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z28.s, z11.s, z0.s[1]\n" + "fmla z29.s, z12.s, z0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z13.s, z0.s[2]\n" + "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z14.s, z0.s[2]\n" + "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z15.s, z0.s[2]\n" + "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z16.s, z0.s[2]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z28.s, z17.s, z0.s[2]\n" + "fmla z29.s, z18.s, z0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z19.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z19.s, z0.s[3]\n" + "ld1w { z20.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z21.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z20.s, z0.s[3]\n" + "ld1w { z22.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z21.s, z0.s[3]\n" + "ld1w { z23.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z1.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z22.s, z0.s[3]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z28.s, z23.s, z0.s[3]\n" + "fmla z29.s, z1.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "bgt 39b\n" + "40:" // Width 6: Multiply loop: Single iteration only + "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "subs x21, x21, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z2.s, z0.s[0]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z3.s, z0.s[0]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "fmla z26.s, z4.s, z0.s[0]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z27.s, z5.s, z0.s[0]\n" + "fmla z28.s, z6.s, z0.s[0]\n" + "fmla 
z29.s, z7.s, z0.s[0]\n" + "ble 41f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z8.s, z0.s[1]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z9.s, z0.s[1]\n" + "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z10.s, z0.s[1]\n" + "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z13.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z11.s, z0.s[1]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z28.s, z12.s, z0.s[1]\n" + "fmla z29.s, z13.s, z0.s[1]\n" + "ble 41f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z14.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z14.s, z0.s[2]\n" + "ld1w { z15.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z16.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z15.s, z0.s[2]\n" + "ld1w { z17.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z16.s, z0.s[2]\n" + "ld1w { z18.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z19.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z17.s, z0.s[2]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z28.s, z18.s, z0.s[2]\n" + "fmla z29.s, z19.s, z0.s[2]\n" + "ble 41f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z20.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z20.s, z0.s[3]\n" + "ld1w { z21.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z21.s, z0.s[3]\n" + "ld1w { z23.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z22.s, z0.s[3]\n" + "ld1w { z1.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z23.s, z0.s[3]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z28.s, z1.s, z0.s[3]\n" + "fmla z29.s, z2.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "41:" // Width 6: Multiply loop: multiply skip + "prfm pldl1keep, [x20, #0x80]\n" + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 42f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmin z26.s, p2/M, z26.s, z16.s\n" + "fmin z27.s, p2/M, z27.s, z16.s\n" + "fmin z28.s, p2/M, z28.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "fmax z26.s, p2/M, z26.s, z17.s\n" + "fmax z27.s, p2/M, z27.s, z17.s\n" + "fmax z28.s, p2/M, z28.s, z17.s\n" + "fmin z29.s, p2/M, z29.s, z16.s\n" + "fmax z29.s, p2/M, z29.s, z17.s\n" + "42:" // Width 6: No activation + "st1w { z24.s }, p2, [%x[output_ptr]]\n" + "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n" + "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n" + "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n" + "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n" + "st1w { z29.s }, p1, [%x[output_ptr], #5, MUL VL]\n" + "addvl %x[output_ptr], %x[output_ptr], #6\n" + "b 57f\n" + "43:" // Width 7 + "mov x21, %x[K]\n" + "mov x20, %x[A_ptr]\n" + "mov x19, #0x6\n" + "msub x19, x24, x19, %x[N]\n" + "whilelt p1.s, XZR, x19\n" + "cbz x22, 44f\n" + "ld1w { z24.s }, p2/Z, [x22]\n" + "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z27.s }, p2/Z, [x22, #3, MUL 
VL]\n" + "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n" + "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x22, #6, MUL VL]\n" + "addvl x22, x22, #7\n" + "b 45f\n" + "44:" // Width 7: no bias + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "45:" // Width 7: setup done + "cmp x21, #0x4\n" + "ble 47f\n" + "46:" // Width 7: Multiply loop: Main loop head + "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "sub x21, x21, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z1.s, z0.s[0]\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z2.s, z0.s[0]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "cmp x21, #0x4\n" + "fmla z26.s, z3.s, z0.s[0]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z4.s, z0.s[0]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "fmla z28.s, z5.s, z0.s[0]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z29.s, z6.s, z0.s[0]\n" + "fmla z30.s, z7.s, z0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z8.s, z0.s[1]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z9.s, z0.s[1]\n" + "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z10.s, z0.s[1]\n" + "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z13.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z11.s, z0.s[1]\n" + "ld1w { z14.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z28.s, z12.s, z0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z29.s, z13.s, z0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z15.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z30.s, z14.s, z0.s[1]\n" + "ld1w { z16.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "fmla z24.s, z15.s, z0.s[2]\n" + "ld1w { z17.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z25.s, z16.s, z0.s[2]\n" + "ld1w { z19.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "fmla z26.s, z17.s, z0.s[2]\n" + "ld1w { z20.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z18.s, z0.s[2]\n" + "ld1w { z21.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z28.s, z19.s, z0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z29.s, z20.s, z0.s[2]\n" + "ld1w { z22.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z30.s, z21.s, z0.s[2]\n" + "ld1w { z23.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "fmla z24.s, z22.s, z0.s[3]\n" + "ld1w { z1.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z25.s, z23.s, z0.s[3]\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "fmla z26.s, z1.s, z0.s[3]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z2.s, z0.s[3]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z28.s, z3.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z29.s, z4.s, z0.s[3]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "fmla z30.s, z5.s, z0.s[3]\n" + "bgt 46b\n" + "47:" // Width 7: Multiply loop: Single iteration only + "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n" + 
"whilelt p0.s, XZR, x21\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "subs x21, x21, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z6.s, z0.s[0]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z7.s, z0.s[0]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "fmla z26.s, z8.s, z0.s[0]\n" + "ld1w { z11.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "ld1w { z12.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "fmla z27.s, z9.s, z0.s[0]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z28.s, z10.s, z0.s[0]\n" + "fmla z29.s, z11.s, z0.s[0]\n" + "fmla z30.s, z12.s, z0.s[0]\n" + "ble 48f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z13.s, z0.s[1]\n" + "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z14.s, z0.s[1]\n" + "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z15.s, z0.s[1]\n" + "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z16.s, z0.s[1]\n" + "ld1w { z19.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z28.s, z17.s, z0.s[1]\n" + "fmla z29.s, z18.s, z0.s[1]\n" + "fmla z30.s, z19.s, z0.s[1]\n" + "ble 48f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z20.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z20.s, z0.s[2]\n" + "ld1w { z21.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z21.s, z0.s[2]\n" + "ld1w { z23.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z22.s, z0.s[2]\n" + "ld1w { z1.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z23.s, z0.s[2]\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z28.s, z1.s, z0.s[2]\n" + "fmla z29.s, z2.s, z0.s[2]\n" + "fmla z30.s, z3.s, z0.s[2]\n" + "ble 48f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z4.s, z0.s[3]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z5.s, z0.s[3]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z6.s, z0.s[3]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z7.s, z0.s[3]\n" + "ld1w { z10.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z28.s, z8.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z29.s, z9.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z30.s, z10.s, z0.s[3]\n" + "48:" // Width 7: Multiply loop: multiply skip + "prfm pldl1keep, [x20, #0x80]\n" + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 49f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmin z26.s, p2/M, z26.s, z16.s\n" + "fmin z27.s, p2/M, z27.s, z16.s\n" + "fmin z28.s, p2/M, z28.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "fmax z26.s, p2/M, 
z26.s, z17.s\n" + "fmax z27.s, p2/M, z27.s, z17.s\n" + "fmax z28.s, p2/M, z28.s, z17.s\n" + "fmin z29.s, p2/M, z29.s, z16.s\n" + "fmin z30.s, p2/M, z30.s, z16.s\n" + "fmax z29.s, p2/M, z29.s, z17.s\n" + "fmax z30.s, p2/M, z30.s, z17.s\n" + "49:" // Width 7: No activation + "st1w { z24.s }, p2, [%x[output_ptr]]\n" + "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n" + "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n" + "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n" + "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n" + "st1w { z29.s }, p2, [%x[output_ptr], #5, MUL VL]\n" + "st1w { z30.s }, p1, [%x[output_ptr], #6, MUL VL]\n" + "addvl %x[output_ptr], %x[output_ptr], #7\n" + "b 57f\n" + "50:" // Width 8 + "mov x21, %x[K]\n" + "mov x20, %x[A_ptr]\n" + "mov x19, #0x7\n" + "msub x19, x24, x19, %x[N]\n" + "whilelt p1.s, XZR, x19\n" + "cbz x22, 51f\n" + "ld1w { z24.s }, p2/Z, [x22]\n" + "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n" + "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n" + "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n" + "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x22, #6, MUL VL]\n" + "ld1w { z31.s }, p2/Z, [x22, #7, MUL VL]\n" + "addvl x22, x22, #8\n" + "b 52f\n" + "51:" // Width 8: no bias + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "mov z31.b, #0x0\n" + "52:" // Width 8: setup done + "cmp x21, #0x4\n" + "ble 54f\n" + "53:" // Width 8: Multiply loop: Main loop head + "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "sub x21, x21, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z1.s, z0.s[0]\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z2.s, z0.s[0]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "cmp x21, #0x4\n" + "fmla z26.s, z3.s, z0.s[0]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z4.s, z0.s[0]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "fmla z28.s, z5.s, z0.s[0]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z29.s, z6.s, z0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "fmla z30.s, z7.s, z0.s[0]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z31.s, z8.s, z0.s[0]\n" + "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "ld1w { z11.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "ld1w { z12.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z25.s, z10.s, z0.s[1]\n" + "ld1w { z13.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "fmla z26.s, z11.s, z0.s[1]\n" + "ld1w { z14.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z12.s, z0.s[1]\n" + "ld1w { z15.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "fmla z28.s, z13.s, z0.s[1]\n" + "ld1w { z16.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z29.s, z14.s, z0.s[1]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z30.s, z15.s, z0.s[1]\n" + "ld1w { z17.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z31.s, z16.s, z0.s[1]\n" + "ld1w { z18.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "fmla z24.s, z17.s, z0.s[2]\n" + "ld1w { z19.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "ld1w { z20.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z25.s, z18.s, z0.s[2]\n" + "ld1w { z21.s 
}, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "fmla z26.s, z19.s, z0.s[2]\n" + "ld1w { z22.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z20.s, z0.s[2]\n" + "ld1w { z23.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "fmla z28.s, z21.s, z0.s[2]\n" + "ld1w { z1.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z29.s, z22.s, z0.s[2]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z30.s, z23.s, z0.s[2]\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z31.s, z1.s, z0.s[2]\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "fmla z24.s, z2.s, z0.s[3]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z25.s, z3.s, z0.s[3]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "fmla z26.s, z4.s, z0.s[3]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z5.s, z0.s[3]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "fmla z28.s, z6.s, z0.s[3]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z29.s, z7.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z30.s, z8.s, z0.s[3]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "fmla z31.s, z9.s, z0.s[3]\n" + "bgt 53b\n" + "54:" // Width 8: Multiply loop: Single iteration only + "ld1w { z10.s }, p2/Z, [%x[B_ptr]]\n" + "whilelt p0.s, XZR, x21\n" + "ld1w { z11.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "subs x21, x21, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x20]\n" + "fmla z24.s, z10.s, z0.s[0]\n" + "ld1w { z12.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z25.s, z11.s, z0.s[0]\n" + "ld1w { z13.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "fmla z26.s, z12.s, z0.s[0]\n" + "ld1w { z15.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "ld1w { z16.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "fmla z27.s, z13.s, z0.s[0]\n" + "fmla z28.s, z14.s, z0.s[0]\n" + "ld1w { z17.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z29.s, z15.s, z0.s[0]\n" + "fmla z30.s, z16.s, z0.s[0]\n" + "fmla z31.s, z17.s, z0.s[0]\n" + "ble 55f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z18.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z18.s, z0.s[1]\n" + "ld1w { z19.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z20.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z19.s, z0.s[1]\n" + "ld1w { z21.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z20.s, z0.s[1]\n" + "ld1w { z22.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z23.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z21.s, z0.s[1]\n" + "ld1w { z1.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n" + "fmla z28.s, z22.s, z0.s[1]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z29.s, z23.s, z0.s[1]\n" + "fmla z30.s, z1.s, z0.s[1]\n" + "fmla z31.s, z2.s, z0.s[1]\n" + "ble 55f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "subs x21, x21, #0x1\n" + "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z3.s, z0.s[2]\n" + "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z4.s, z0.s[2]\n" + "ld1w { z6.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z5.s, z0.s[2]\n" + "ld1w { z7.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z8.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + 
"fmla z27.s, z6.s, z0.s[2]\n" + "ld1w { z9.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n" + "fmla z28.s, z7.s, z0.s[2]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z29.s, z8.s, z0.s[2]\n" + "fmla z30.s, z9.s, z0.s[2]\n" + "fmla z31.s, z10.s, z0.s[2]\n" + "ble 55f\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n" + "ld1w { z13.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n" + "fmla z25.s, z12.s, z0.s[3]\n" + "ld1w { z14.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n" + "fmla z26.s, z13.s, z0.s[3]\n" + "ld1w { z15.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n" + "ld1w { z16.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n" + "fmla z27.s, z14.s, z0.s[3]\n" + "ld1w { z17.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n" + "fmla z28.s, z15.s, z0.s[3]\n" + "addvl %x[B_ptr], %x[B_ptr], #8\n" + "fmla z29.s, z16.s, z0.s[3]\n" + "prfm pldl1keep, [%x[B_ptr], #0x400]\n" + "prfm pldl1keep, [%x[B_ptr], #0x440]\n" + "fmla z30.s, z17.s, z0.s[3]\n" + "fmla z31.s, z18.s, z0.s[3]\n" + "55:" // Width 8: Multiply loop: multiply skip + "prfm pldl1keep, [x20, #0x80]\n" + "prfm pstl1keep, [%x[output_ptr], #0x0]\n" + "tbz %x[flags], #1, 56f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmin z26.s, p2/M, z26.s, z16.s\n" + "fmin z27.s, p2/M, z27.s, z16.s\n" + "fmin z28.s, p2/M, z28.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "fmax z26.s, p2/M, z26.s, z17.s\n" + "fmax z27.s, p2/M, z27.s, z17.s\n" + "fmax z28.s, p2/M, z28.s, z17.s\n" + "fmin z29.s, p2/M, z29.s, z16.s\n" + "fmin z30.s, p2/M, z30.s, z16.s\n" + "fmin z31.s, p2/M, z31.s, z16.s\n" + "fmax z29.s, p2/M, z29.s, z17.s\n" + "fmax z30.s, p2/M, z30.s, z17.s\n" + "fmax z31.s, p2/M, z31.s, z17.s\n" + "56:" // Width 8: No activation + "st1w { z24.s }, p2, [%x[output_ptr]]\n" + "subs x23, x23, #0x8\n" + "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n" + "sub %x[N], %x[N], x24, LSL #3\n" + "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n" + "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n" + "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n" + "st1w { z29.s }, p2, [%x[output_ptr], #5, MUL VL]\n" + "st1w { z30.s }, p2, [%x[output_ptr], #6, MUL VL]\n" + "st1w { z31.s }, p1, [%x[output_ptr], #7, MUL VL]\n" + "addvl %x[output_ptr], %x[output_ptr], #8\n" + "bgt 1b\n" + "57:" // Exit + + : [B_ptr] "+r" (B_ptr), [N] "+r" (N), [output_ptr] "+r" (output_ptr) + : [A_ptr] "r" (A_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)) + : "cc", "memory", "p0", "p1", "p2", "x19", "x20", "x21", "x22", "x23", "x24", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm + +#endif diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp deleted file mode 100644 index 385a16fe10..0000000000 --- 
a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp +++ /dev/null @@ -1,2247 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef __ARM_FEATURE_SVE - -#include <algorithm> - -#include "arm_gemm.hpp" -#include "../../bfloat.hpp" -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void sve_hybrid_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) { - const int K_stride = ((K + 1) / 2) * 2; - const long loops_count = ((K + 8) / 16) - 1; - K -= loops_count * 16; - const long regs_count = (K / 8) - 1; - K -= (regs_count + 1) * 8; - const long leftovers = K; - const long blocks_count = (K + 1) / 2; - float nullbias[256]; - if (!accumulate && !bias) { - memset(nullbias, 0, (4 * get_vector_length<float>() * sizeof(float))); - } - float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); - float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); - const float * const minptr = &minval; - const float * const maxptr = &maxval; - - switch(act.type) - { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - maxval = static_cast<float>(act.param1); - /* fall through */ - case Activation::Type::ReLU: - minval = 0.0f; - break; - } - - int rows_to_compute; - - for (int y=0; y<M; y+=rows_to_compute) { - const bfloat16 * const a_ptr0_base = A + (y * lda); - const unsigned long ldab = lda * sizeof(bfloat16); - - float *c_ptr0 = C + (y * ldc); - - rows_to_compute = M-y; - if (rows_to_compute > 4) { - if (rows_to_compute % 4) { - rows_to_compute = 4 - 1; - } else { - rows_to_compute = 4; - } - } - - for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) { - const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>())); - long loops = loops_count; - long regs = regs_count; - long temp = 0; - long blocks = blocks_count; - const bfloat16 *a_ptr0 = a_ptr0_base; - const bfloat16 *b_ptr0 = B + (K_stride * x0); - const unsigned long ldcb = ldc * sizeof(float); - const float *biasptr = bias ?
bias+x0 : nullbias; - - switch(rows_to_compute) { - case 1: - __asm __volatile ( - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z16.s, p0/z, [%[biasptr]]\n" - "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - 
".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n" - ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n" - ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 
0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - "ld1h z11.h, p7/z, 
[%[b_ptr0], #3, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n" - ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n" - ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n" - ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n" - ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n" - ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n" - ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n" - ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n" - ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n" - ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n" - ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" 
(static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z16.s, p0/z, [%[biasptr]]\n" - "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "mov z21.d, z17.d\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "mov z22.d, z18.d\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "mov z23.d, z19.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p0/z, [c_ptr1]\n" - "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 
0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n" - ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n" - ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n" - ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n" - ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n" - ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n" - ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n" - ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n" - ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n" - ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n" - 
"ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n" - ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n" - ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n" - ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n" - ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" - ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64644110 // bfdot z16.s, z8.h, 
z4.h[0]\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n" - ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n" - ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n" - ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n" - ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n" - ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n" - ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n" - ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n" - ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n" - ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n" - ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n" - ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n" - ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n" - ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n" - ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n" - ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n" - ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n" - ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n" - ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" - ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n" - "b.eq 5f\n" - 
"addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647941d6 // bfdot z22.s, z14.h, 
z1.h[3]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n" - ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n" - ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n" - ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n" - ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n" - ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n" - ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n" - ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n" - ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n" - ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n" - ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n" - ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n" - ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n" - ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n" - ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n" - ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n" - ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n" - ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n" - ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n" - ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n" - ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n" - ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n" - ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n" - ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n" - ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n" - ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" - ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmin 
z22.s, p7/m, z22.s, z15.s\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "st1w z20.s, p0, [c_ptr1]\n" - "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" - "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" - "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z16.s, p0/z, [%[biasptr]]\n" - "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "mov z21.d, z17.d\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "mov z22.d, z18.d\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "mov z23.d, z19.d\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "mov z24.d, z16.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z25.d, z17.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z26.d, z18.d\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z27.d, z19.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p0/z, [c_ptr1]\n" - "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1w z24.s, p0/z, [c_ptr2]\n" - "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" - "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, 
p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" - ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n" - ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n" - ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n" - ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n" - ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n" - ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n" - ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n" - ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n" - ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n" - ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n" - ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL 
VL]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n" - ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n" - ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n" - ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n" - ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n" - ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n" - ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n" - ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n" - ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n" - ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n" - ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n" - ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n" - ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n" - ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n" - ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n" - ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n" - ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n" - ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n" - ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n" - ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n" - ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n" - ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n" - ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n" - ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n" - ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n" - ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n" - ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n" - ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n" - ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x647c41f3 // bfdot z19.s, 
z15.h, z4.h[3]\n" - ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n" - ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" - ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n" - ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n" - ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n" - ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n" - ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n" - ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n" - ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n" - ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n" - ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n" - ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n" - ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - 
"ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n" - ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n" - ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n" - ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n" - ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n" - ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n" - ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n" - ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n" - ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n" - ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n" - ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n" - ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n" - ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n" - ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n" - ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n" - ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n" - ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n" - ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n" - ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n" - ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n" - ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n" - ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n" - ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n" - ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n" - ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n" - ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n" - ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n" - ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n" - ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n" - ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n" - ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n" - ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n" - ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" - ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n" - ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL 
VL]\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" - ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n" - ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n" - ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n" - ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n" - ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n" - ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n" - ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n" - ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n" - ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n" - ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n" - ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n" - ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - 
"ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" - ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" - ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n" - ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n" - ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n" - ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n" - ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n" - ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n" - ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n" - ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n" - ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n" - ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n" - ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n" - ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n" - ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n" - ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n" - ".inst 0x64654135 
// bfdot z21.s, z9.h, z5.h[0]\n" - ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n" - ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n" - ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n" - ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n" - ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n" - ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n" - ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n" - ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n" - ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n" - ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n" - ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n" - ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n" - ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n" - ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n" - ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n" - ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n" - ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n" - ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n" - ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n" - ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n" - ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n" - ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n" - ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n" - ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n" - ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n" - ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n" - ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n" - ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n" - ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n" - ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n" - ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n" - ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n" - ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n" - ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n" - ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n" - ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n" - ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" - ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n" - ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "fmax z21.s, 
p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "st1w z20.s, p0, [c_ptr1]\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" - "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" - "st1w z24.s, p0, [c_ptr2]\n" - "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" - "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" - "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z16.s, p0/z, [%[biasptr]]\n" - "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "mov z21.d, z17.d\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "mov z22.d, z18.d\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "mov z23.d, z19.d\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "mov z24.d, z16.d\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "mov z25.d, z17.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z26.d, z18.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z27.d, z19.d\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z28.d, z16.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z29.d, z17.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z30.d, z18.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "mov z31.d, z19.d\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add a_ptr3, a_ptr3, #0x10\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" -
"cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p0/z, [c_ptr1]\n" - "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1w z24.s, p0/z, [c_ptr2]\n" - "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" - "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" - "ld1w z28.s, p0/z, [c_ptr3]\n" - "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n" - "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n" - ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" - ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n" - ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n" - ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n" - ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n" - ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n" - ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n" - "ld1h z14.h, p7/z, 
[%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n" - ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n" - ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n" - ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n" - ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n" - ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n" - ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n" - ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n" - ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n" - ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n" - ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n" - ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n" - ".inst 0x647b419c // bfdot z28.s, z12.h, z3.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n" - ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n" - ".inst 0x647b41bd // bfdot z29.s, z13.h, z3.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n" - ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n" - ".inst 0x647b41de // bfdot z30.s, z14.h, z3.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x647b41ff // bfdot z31.s, z15.h, z3.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n" - "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n" - ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n" - ".inst 0x6467411c // bfdot z28.s, z8.h, z7.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n" - ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n" - ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n" - ".inst 0x6467413d // bfdot z29.s, z9.h, z7.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n" - ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n" - ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n" - ".inst 0x6467415e // bfdot z30.s, z10.h, z7.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n" - ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n" - ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n" - ".inst 
0x6467417f // bfdot z31.s, z11.h, z7.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n" - ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n" - ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n" - ".inst 0x646f419c // bfdot z28.s, z12.h, z7.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n" - ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n" - ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n" - ".inst 0x646f41bd // bfdot z29.s, z13.h, z7.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n" - ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n" - ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n" - ".inst 0x646f41de // bfdot z30.s, z14.h, z7.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n" - ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n" - ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n" - ".inst 0x646f41ff // bfdot z31.s, z15.h, z7.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n" - ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n" - ".inst 0x6477411c // bfdot z28.s, z8.h, z7.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n" - ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n" - ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n" - ".inst 0x6477413d // bfdot z29.s, z9.h, z7.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n" - ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n" - ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n" - ".inst 0x6477415e // bfdot z30.s, z10.h, z7.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n" - ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n" - ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n" - ".inst 0x6477417f // bfdot z31.s, z11.h, z7.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n" - ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n" - ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n" - ".inst 0x647f419c // bfdot z28.s, z12.h, z7.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n" - ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n" - ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n" - ".inst 0x647f41bd // bfdot z29.s, z13.h, z7.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n" - ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n" - ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n" - ".inst 0x647f41de // bfdot z30.s, z14.h, z7.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" - ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n" - ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n" - ".inst 0x647f41ff // bfdot z31.s, z15.h, z7.h[3]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" - "ld1rqh z5.h, 
p7/z, [a_ptr1]\n" - ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" - ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" - ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n" - ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" - ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n" - ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n" - ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n" - ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n" - ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n" - ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n" - ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n" - ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n" - ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n" - ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n" - ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n" - ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n" - ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n" - ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n" - ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n" - ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n" - ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n" - ".inst 0x647b419c // bfdot z28.s, z12.h, z3.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n" - ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n" - ".inst 0x647b41bd // 
bfdot z29.s, z13.h, z3.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n" - ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n" - ".inst 0x647b41de // bfdot z30.s, z14.h, z3.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x647b41ff // bfdot z31.s, z15.h, z3.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n" - "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n" - ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6467411c // bfdot z28.s, z8.h, z7.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n" - "addvl a_ptr3, a_ptr3, #2\n" - ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n" - ".inst 0x6467413d // bfdot z29.s, z9.h, z7.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n" - ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n" - ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n" - ".inst 0x6467415e // bfdot z30.s, z10.h, z7.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n" - ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n" - ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n" - ".inst 0x6467417f // bfdot z31.s, z11.h, z7.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n" - ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n" - ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n" - ".inst 0x646f419c // bfdot z28.s, z12.h, z7.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n" - ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n" - ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n" - ".inst 0x646f41bd // bfdot z29.s, z13.h, z7.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n" - ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n" - ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n" - ".inst 0x646f41de // bfdot z30.s, z14.h, z7.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n" - ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n" - ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n" - ".inst 0x646f41ff // bfdot z31.s, z15.h, z7.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n" - ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n" - ".inst 0x6477411c // bfdot z28.s, z8.h, z7.h[2]\n" - ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n" - ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n" - ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n" - ".inst 0x6477413d // bfdot z29.s, z9.h, z7.h[2]\n" - ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n" - ".inst 0x64754156 // bfdot z22.s, z10.h, 
z5.h[2]\n" - ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n" - ".inst 0x6477415e // bfdot z30.s, z10.h, z7.h[2]\n" - ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n" - ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n" - ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n" - ".inst 0x6477417f // bfdot z31.s, z11.h, z7.h[2]\n" - ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n" - ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n" - ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n" - ".inst 0x647f419c // bfdot z28.s, z12.h, z7.h[3]\n" - ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n" - ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n" - ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n" - ".inst 0x647f41bd // bfdot z29.s, z13.h, z7.h[3]\n" - ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n" - ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n" - ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n" - ".inst 0x647f41de // bfdot z30.s, z14.h, z7.h[3]\n" - ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" - ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n" - ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n" - ".inst 0x647f41ff // bfdot z31.s, z15.h, z7.h[3]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" - ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" - ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" - ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n" - ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" - ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n" - ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n" - ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n" - ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n" - ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n" - ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n" - ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n" - ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, 
p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n" - ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n" - ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n" - ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n" - ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n" - ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n" - ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n" - ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n" - ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n" - ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n" - ".inst 0x647b419c // bfdot z28.s, z12.h, z3.h[3]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n" - ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n" - ".inst 0x647b41bd // bfdot z29.s, z13.h, z3.h[3]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n" - ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n" - ".inst 0x647b41de // bfdot z30.s, z14.h, z3.h[3]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n" - ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n" - ".inst 0x647b41ff // bfdot z31.s, z15.h, z3.h[3]\n" - "b 5f\n" - "4:\n" - ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" - ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n" - "ld1rqh z7.h, p6/z, [a_ptr3]\n" - ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n" - ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n" - ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n" - ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n" - ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n" - ".inst 
0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n" - ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n" - ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n" - ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n" - ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n" - ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n" - ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n" - ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n" - ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n" - ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n" - ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n" - ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n" - ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n" - ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n" - ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n" - ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n" - ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n" - ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n" - ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n" - ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n" - ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n" - ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n" - ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n" - ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n" - ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n" - ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n" - ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n" - ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n" - ".inst 0x647b419c // bfdot z28.s, z12.h, z3.h[3]\n" - ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n" - ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n" - ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n" - ".inst 0x647b41bd // bfdot z29.s, z13.h, z3.h[3]\n" - ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n" - ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n" - ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n" - ".inst 0x647b41de // bfdot z30.s, z14.h, z3.h[3]\n" - ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n" - ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n" - ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n" - ".inst 0x647b41ff // bfdot z31.s, z15.h, z3.h[3]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n" - ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n" - ".inst 0x6467411c // bfdot z28.s, z8.h, z7.h[0]\n" - ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n" - ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n" - ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n" - ".inst 0x6467413d // bfdot z29.s, z9.h, z7.h[0]\n" - ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n" - ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n" - ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n" - ".inst 0x6467415e // bfdot 
z30.s, z10.h, z7.h[0]\n" - ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n" - ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n" - ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n" - ".inst 0x6467417f // bfdot z31.s, z11.h, z7.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n" - ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n" - ".inst 0x646f419c // bfdot z28.s, z12.h, z7.h[1]\n" - ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n" - ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n" - ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n" - ".inst 0x646f41bd // bfdot z29.s, z13.h, z7.h[1]\n" - ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n" - ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n" - ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n" - ".inst 0x646f41de // bfdot z30.s, z14.h, z7.h[1]\n" - ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n" - ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n" - ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n" - ".inst 0x646f41ff // bfdot z31.s, z15.h, z7.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n" - ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n" - ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n" - ".inst 0x6477411c // bfdot z28.s, z8.h, z7.h[2]\n" - ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n" - ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n" - ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n" - ".inst 0x6477413d // bfdot z29.s, z9.h, z7.h[2]\n" - ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n" - ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n" - ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n" - ".inst 0x6477415e // bfdot z30.s, z10.h, z7.h[2]\n" - ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n" - ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n" - ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n" - ".inst 0x6477417f // bfdot z31.s, z11.h, z7.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n" - ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n" - ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n" - ".inst 0x647f419c // bfdot z28.s, z12.h, z7.h[3]\n" - ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n" - ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n" - ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n" - ".inst 0x647f41bd // bfdot z29.s, z13.h, z7.h[3]\n" - ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n" - ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n" - ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n" - ".inst 0x647f41de // bfdot z30.s, z14.h, z7.h[3]\n" - ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n" - ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n" - ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n" - ".inst 0x647f41ff // bfdot z31.s, z15.h, 
z7.h[3]\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "st1w z20.s, p0, [c_ptr1]\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "fmax z28.s, p7/m, z28.s, z14.s\n" - "fmax z29.s, p7/m, z29.s, z14.s\n" - "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "fmax z30.s, p7/m, z30.s, z14.s\n" - "fmin z28.s, p7/m, z28.s, z15.s\n" - "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" - "fmin z29.s, p7/m, z29.s, z15.s\n" - "fmax z31.s, p7/m, z31.s, z14.s\n" - "fmin z30.s, p7/m, z30.s, z15.s\n" - "st1w z24.s, p0, [c_ptr2]\n" - "fmin z31.s, p7/m, z31.s, z15.s\n" - "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" - "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" - "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" - "st1w z28.s, p0, [c_ptr3]\n" - "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n" - "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n" - "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - } - - } - } -} - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp similarity index 65% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp index eba98bb74d..e344d82dc6 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -10,42 +10,49 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. */ #pragma once - #ifdef __ARM_FEATURE_SVE -#include "../bfloat.hpp" #include "../std_transforms_sve.hpp" +#include "../bfloat.hpp" + +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg<bfloat16>, \ + size_t, size_t, \ + const bfloat16 *, \ + IndirectOutputArg<float>, \ + const float *, Activation, bool namespace arm_gemm { // Actual kernel implementations -void sve_hybrid_bf16fp32_dot_4VLx4(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool); +void sve_hybrid_bf16fp32_dot_6x4VL( ARGLIST ); -class hybrid_bf16fp32_dot_4VLx4 +class cls_sve_hybrid_bf16fp32_dot_6x4VL { public: typedef bfloat16 operand_type; typedef float result_type; - typedef void (*kern_type)(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool); + typedef void (*kern_type)( ARGLIST ); /* Kernel blocking parameters */ static constexpr unsigned int out_height() { - return 4; + return 6; } static unsigned int out_width() @@ -63,27 +70,17 @@ class hybrid_bf16fp32_dot_4VLx4 return true; } - static constexpr bool supports_bias() - { - return true; - } - - static constexpr bool supports_activation() - { - return true; - } - - StdTransformsSVE<operand_type, result_type, 4, 4, 2> transforms = {}; + StdTransformsSVE<operand_type, result_type, 6, 4, 2> transforms = {}; // Default to the generic kernel - kern_type kernel=sve_hybrid_bf16fp32_dot_4VLx4; + kern_type kernel=sve_hybrid_bf16fp32_dot_6x4VL; - hybrid_bf16fp32_dot_4VLx4(const CPUInfo *) + cls_sve_hybrid_bf16fp32_dot_6x4VL(const CPUInfo *) { - } }; } // namespace arm_gemm +#undef ARGLIST #endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp new file mode 100644 index 0000000000..19385e56ea --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp @@ -0,0 +1,2237 @@ +/* + * Copyright (c) 2019-2020 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef __ARM_FEATURE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" +#include "../../bfloat.hpp" + +#include <cassert> + +namespace arm_gemm { + +void sve_hybrid_bf16fp32_dot_6x4VL ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<bfloat16> A_arg, + size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg, + const float *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); + float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const bfloat16 *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast<float>(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + "ptrue p5.b\n" + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 71f\n" + "cmp %x[M], #0x4\n" + "bgt 57f\n" + "beq 43f\n" + "cmp %x[M], #0x2\n" + "bgt 29f\n" + "beq 15f\n" + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[bias]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x13, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x16\n" +
"incw x19\n" + "whilelt p3.s, x19, x16\n" + "incw x19\n" + "whilelt p2.s, x19, x16\n" + "incw x19\n" + "whilelt p1.s, x19, x16\n" + "cbz x14, 4f\n" + "ld1w { z8.s }, p5/Z, [x14]\n" + "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "b 6f\n" + "4:" // Height 1: no bias + "tbz %x[flags], #0, 5f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "b 6f\n" + "5:" // Height 1: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "6:" // Height 1: setup done + "mov x12, #0x0\n" + "7:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 8f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "cbnz x12, 9f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "b 9f\n" + "8:" // Height 1: setup direct input + "mov x10, %x[input_ptr]\n" + "9:" // Height 1: input setup done + "cmp x11, #0x8\n" + "ble 11f\n" + "10:" // Height 1: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "cmp x11, #0x8\n" + ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "prfm pldl1keep, [x10, #0x80]\n" + ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" + ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + "bgt 10b\n" + "11:" // Height 1: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + ".inst 0x646040e9 // bfdot z9.s, z7.h, 
z0.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" + ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + "ble 12f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" + "addvl x15, x15, #4\n" + ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + "ble 12f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" + "addvl x15, x15, #4\n" + ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + "ble 12f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" + ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + "12:" // Height 1: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 7b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "tbz %x[flags], #1, 13f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "13:" // Height 1: No activation + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "14:" // Height 1: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 3b\n" + "b 86f\n" + "15:" // Height 2 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 16f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19, LSL #2\n" + "b 17f\n" + "16:" // Height 2: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "17:" // Height 2: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x16\n" + "incw x19\n" + "whilelt p3.s, x19, x16\n" + "incw x19\n" + "whilelt p2.s, x19, x16\n" + "incw x19\n" + "whilelt p1.s, x19, x16\n" + "cbz x14, 18f\n" + "ld1w { z8.s }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "mov z13.d, z9.d\n" + "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "b 20f\n" + "18:" 
// Height 2: no bias + "tbz %x[flags], #0, 19f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "b 20f\n" + "19:" // Height 2: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "20:" // Height 2: setup done + "mov x12, #0x0\n" + "21:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 22f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x12, 23f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "b 23f\n" + "22:" // Height 2: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "23:" // Height 2: input setup done + "cmp x11, #0x8\n" + "ble 25f\n" + "24:" // Height 2: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "cmp x11, #0x8\n" + ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "prfm pldl1keep, [x28, #0x80]\n" + ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" + ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" + ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" + ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" + ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" + ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" + ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" + ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, 
#-3, MUL VL]\n" + ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" + ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" + ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" + ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" + ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" + "bgt 24b\n" + "25:" // Height 2: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" + "add x28, x28, #0x10\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" + ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" + ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" + "ble 26f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" + ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" + ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" + ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" + "ble 26f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" + ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" + ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" + ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" + "ble 26f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" + ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" + ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" + ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" + "26:" // Height 2: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + 
"bne 21b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "tbz %x[flags], #1, 27f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "fmax z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z15.s, p5/M, z15.s, z1.s\n" + "27:" // Height 2: No activation + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "28:" // Height 2: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 17b\n" + "b 86f\n" + "29:" // Height 3 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 30f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "b 31f\n" + "30:" // Height 3: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "31:" // Height 3: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x16\n" + "incw x19\n" + "whilelt p3.s, x19, x16\n" + "incw x19\n" + "whilelt p2.s, x19, x16\n" + "incw x19\n" + "whilelt p1.s, x19, x16\n" + "cbz x14, 32f\n" + "ld1w { z8.s }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "mov z13.d, z9.d\n" + "addvl x14, x14, #4\n" + "mov z17.d, z9.d\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "b 34f\n" + "32:" // Height 3: no bias + "tbz %x[flags], #0, 33f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "b 34f\n" + "33:" // Height 3: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "34:" // Height 3: setup done + "mov x12, #0x0\n" + "35:" // Height 3: String loop + "ldr x20, 
[%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 36f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x12, 37f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "b 37f\n" + "36:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "37:" // Height 3: input setup done + "cmp x11, #0x8\n" + "ble 39f\n" + "38:" // Height 3: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x26, x26, #0x10\n" + ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "cmp x11, #0x8\n" + ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" + ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" + ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" + ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" + ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" + ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" + ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" + ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" + ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" + ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" + ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" + ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" + ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + ".inst 0x647140ef 
// bfdot z15.s, z7.h, z1.h[2]\n" + ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" + ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" + ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" + ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" + ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" + "bgt 38b\n" + "39:" // Height 3: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" + "add x26, x26, #0x10\n" + ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" + ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" + ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" + ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" + ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" + ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" + "ble 40f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" + ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" + ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" + ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" + "ble 40f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" + ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" + ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" + ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" + ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" + ".inst 0x647240d2 // bfdot z18.s, 
z6.h, z2.h[2]\n" + ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" + ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" + "ble 40f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" + ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" + ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" + ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" + "40:" // Height 3: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 35b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "tbz %x[flags], #1, 41f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "fmax z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmin z16.s, p5/M, z16.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z15.s, p5/M, z15.s, z1.s\n" + "fmax z16.s, p5/M, z16.s, z1.s\n" + "fmin z17.s, p5/M, z17.s, z0.s\n" + "fmin z18.s, p5/M, z18.s, z0.s\n" + "fmin z19.s, p5/M, z19.s, z0.s\n" + "fmax z17.s, p5/M, z17.s, z1.s\n" + "fmax z18.s, p5/M, z18.s, z1.s\n" + "fmax z19.s, p5/M, z19.s, z1.s\n" + "41:" // Height 3: No activation + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "42:" // Height 3: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 31b\n" + "b 86f\n" + "43:" // Height 4 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 44f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" 
+ "ldr x25, [%x[output_ptr], #0x18]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "b 45f\n" + "44:" // Height 4: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "45:" // Height 4: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x16\n" + "incw x19\n" + "whilelt p3.s, x19, x16\n" + "incw x19\n" + "whilelt p2.s, x19, x16\n" + "incw x19\n" + "whilelt p1.s, x19, x16\n" + "cbz x14, 46f\n" + "ld1w { z8.s }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "mov z20.d, z8.d\n" + "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "mov z13.d, z9.d\n" + "mov z17.d, z9.d\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "mov z21.d, z9.d\n" + "mov z22.d, z10.d\n" + "mov z23.d, z11.d\n" + "b 48f\n" + "46:" // Height 4: no bias + "tbz %x[flags], #0, 47f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" + "b 48f\n" + "47:" // Height 4: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "48:" // Height 4: setup done + "mov x12, #0x0\n" + "49:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 50f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x12, 51f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "b 51f\n" + "50:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "51:" // Height 4: input setup done + "cmp x11, #0x8\n" + "ble 53f\n" + "52:" // Height 4: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" + "ld1rqh { z3.h }, 
p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x24, x24, #0x10\n" + ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x8\n" + ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" + ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" + ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" + ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" + ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" + ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" + ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" + ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" + ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" + ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" + ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" + ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" + ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" + ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" + ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" + ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" + ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" + ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" + ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" + ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" + ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" + ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" + ".inst 
0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" + ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n" + ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" + "bgt 52b\n" + "53:" // Height 4: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" + "ld1rqh { z3.h }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" + "add x24, x24, #0x10\n" + ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" + ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" + ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" + ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" + ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" + ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" + ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" + ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n" + "ble 54f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" + ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" + ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n" + ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n" + "ble 54f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" + ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" + ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" + ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" + ".inst 0x647240f1 
// bfdot z17.s, z7.h, z2.h[2]\n" + ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" + ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" + ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" + ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n" + ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" + ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" + ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n" + "ble 54f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" + ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" + ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n" + ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" + "54:" // Height 4: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 49b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbz %x[flags], #1, 55f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "fmax z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmin z16.s, p5/M, z16.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z15.s, p5/M, z15.s, z1.s\n" + "fmax z16.s, p5/M, z16.s, z1.s\n" + "fmin z17.s, p5/M, z17.s, z0.s\n" + "fmin z18.s, p5/M, z18.s, z0.s\n" + "fmin z19.s, p5/M, z19.s, z0.s\n" + "fmin z20.s, p5/M, z20.s, z0.s\n" + "fmax z17.s, p5/M, z17.s, z1.s\n" + "fmax z18.s, p5/M, z18.s, z1.s\n" + "fmax z19.s, p5/M, z19.s, z1.s\n" + "fmax z20.s, p5/M, z20.s, z1.s\n" + "fmin z21.s, p5/M, z21.s, z0.s\n" + "fmin z22.s, p5/M, z22.s, z0.s\n" + "fmin z23.s, p5/M, z23.s, z0.s\n" + "fmax z21.s, p5/M, z21.s, z1.s\n" + "fmax z22.s, p5/M, z22.s, z1.s\n" + "fmax z23.s, p5/M, z23.s, z1.s\n" + "55:" // Height 4: No activation + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + 
"st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1w { z20.s }, p4, [x25]\n" + "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "56:" // Height 4: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 45b\n" + "b 86f\n" + "57:" // Height 5 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 58f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 59f\n" + "58:" // Height 5: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "59:" // Height 5: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x16\n" + "incw x19\n" + "whilelt p3.s, x19, x16\n" + "incw x19\n" + "whilelt p2.s, x19, x16\n" + "incw x19\n" + "whilelt p1.s, x19, x16\n" + "cbz x14, 60f\n" + "ld1w { z8.s }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "mov z20.d, z8.d\n" + "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "mov z13.d, z9.d\n" + "mov z17.d, z9.d\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "mov z21.d, z9.d\n" + "mov z22.d, z10.d\n" + "mov z23.d, z11.d\n" + "mov z24.d, z8.d\n" + "mov z25.d, z9.d\n" + "mov z26.d, z10.d\n" + "mov z27.d, z11.d\n" + "b 62f\n" + "60:" // Height 5: no bias + "tbz %x[flags], #0, 61f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x23]\n" + "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n" + "b 62f\n" + "61:" // Height 5: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov 
z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "62:" // Height 5: setup done + "mov x12, #0x0\n" + "63:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 64f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x12, 65f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "b 65f\n" + "64:" // Height 5: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "add x22, x24, x19, LSL #1\n" + "65:" // Height 5: input setup done + "cmp x11, #0x8\n" + "ble 67f\n" + "66:" // Height 5: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" + "ld1rqh { z3.h }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" + "ld1rqh { z4.h }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x22, x22, #0x10\n" + ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x8\n" + ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" + ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" + ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" + ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" + ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" + ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n" + ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" + ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n" + ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" + ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f1 // 
bfdot z17.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n" + ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" + ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n" + ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n" + ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" + ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" + ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" + ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n" + ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" + ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" + ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" + ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n" + ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" + ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" + ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" + ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n" + ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" + ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" + ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n" + ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" + ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n" + ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" + ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n" + ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" + ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n" + ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n" + ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" + ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n" + "bgt 66b\n" + "67:" // Height 5: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add 
x10, x10, #0x10\n" + ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" + "ld1rqh { z3.h }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" + "ld1rqh { z4.h }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" + "add x22, x22, #0x10\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" + ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" + ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" + ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" + ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" + ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" + ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" + ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n" + ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" + ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" + ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n" + ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n" + "ble 68f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n" + ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" + ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n" + ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" + ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n" + ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n" + ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n" + ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n" + "ble 68f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" + ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" + ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n" + ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" + ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" + ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" + ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n" + ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" + ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" + ".inst 0x647240d2 // bfdot z18.s, z6.h, 
z2.h[2]\n" + ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n" + ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n" + ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" + ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" + ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n" + ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n" + "ble 68f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n" + ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" + ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n" + ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" + ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n" + ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n" + ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" + ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n" + "68:" // Height 5: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 63b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 69f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "fmax z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmin z16.s, p5/M, z16.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z15.s, p5/M, z15.s, z1.s\n" + "fmax z16.s, p5/M, z16.s, z1.s\n" + "fmin z17.s, p5/M, z17.s, z0.s\n" + "fmin z18.s, p5/M, z18.s, z0.s\n" + "fmin z19.s, p5/M, z19.s, z0.s\n" + "fmin z20.s, p5/M, z20.s, z0.s\n" + "fmax z17.s, p5/M, z17.s, z1.s\n" + "fmax z18.s, p5/M, z18.s, z1.s\n" + "fmax z19.s, p5/M, z19.s, z1.s\n" + "fmax z20.s, p5/M, z20.s, z1.s\n" + "fmin z21.s, p5/M, z21.s, z0.s\n" + "fmin z22.s, p5/M, z22.s, z0.s\n" + "fmin z23.s, p5/M, z23.s, z0.s\n" + "fmin z24.s, p5/M, z24.s, z0.s\n" + "fmax z21.s, p5/M, z21.s, z1.s\n" + "fmax z22.s, p5/M, z22.s, z1.s\n" + "fmax z23.s, p5/M, z23.s, z1.s\n" + "fmax z24.s, p5/M, z24.s, z1.s\n" + 
"fmin z25.s, p5/M, z25.s, z0.s\n" + "fmin z26.s, p5/M, z26.s, z0.s\n" + "fmin z27.s, p5/M, z27.s, z0.s\n" + "fmax z25.s, p5/M, z25.s, z1.s\n" + "fmax z26.s, p5/M, z26.s, z1.s\n" + "fmax z27.s, p5/M, z27.s, z1.s\n" + "69:" // Height 5: No activation + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1w { z20.s }, p4, [x25]\n" + "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "st1w { z24.s }, p4, [x23]\n" + "st1w { z25.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x23, #3, MUL VL]\n" + "addvl x23, x23, #4\n" + "70:" // Height 5: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 59b\n" + "b 86f\n" + "71:" // Height 6 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 72f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "ldr x21, [%x[output_ptr], #0x28]\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 73f\n" + "72:" // Height 6: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "add x21, x23, x19, LSL #2\n" + "add %x[output_ptr], x21, x19, LSL #2\n" + "73:" // Height 6: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x16\n" + "incw x19\n" + "whilelt p3.s, x19, x16\n" + "incw x19\n" + "whilelt p2.s, x19, x16\n" + "incw x19\n" + "whilelt p1.s, x19, x16\n" + "cbz x14, 74f\n" + "ld1w { z8.s }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "mov z20.d, z8.d\n" + "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "mov z13.d, z9.d\n" + "mov z17.d, z9.d\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "mov z21.d, z9.d\n" + "mov z22.d, z10.d\n" + "mov z23.d, z11.d\n" + "mov z24.d, z8.d\n" + "mov z25.d, z9.d\n" + "mov z26.d, z10.d\n" + "mov z27.d, z11.d\n" + "mov z28.d, z8.d\n" + "mov z29.d, z9.d\n" + "mov z30.d, z10.d\n" + "mov z31.d, z11.d\n" + "b 76f\n" + "74:" // Height 6: no bias + "tbz %x[flags], #0, 75f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s 
}, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x23]\n" + "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z28.s }, p4/Z, [x21]\n" + "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n" + "b 76f\n" + "75:" // Height 6: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "mov z31.b, #0x0\n" + "76:" // Height 6: setup done + "mov x12, #0x0\n" + "77:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 78f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x12, 79f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "add x20, x20, x19, LSL #1\n" + "b 79f\n" + "78:" // Height 6: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "add x22, x24, x19, LSL #1\n" + "add x20, x22, x19, LSL #1\n" + "79:" // Height 6: input setup done + "cmp x11, #0x8\n" + "ble 81f\n" + "80:" // Height 6: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" + "ld1rqh { z3.h }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" + "ld1rqh { z4.h }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" + "ld1rqh { z5.h }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x20, x20, #0x10\n" + ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x8\n" + ".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" + "prfm 
pldl1keep, [x26, #0x80]\n" + ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" + ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" + ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" + ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n" + ".inst 0x646540de // bfdot z30.s, z6.h, z5.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" + ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" + ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n" + ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n" + ".inst 0x646540ff // bfdot z31.s, z7.h, z5.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" + ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n" + ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n" + ".inst 0x646d40dc // bfdot z28.s, z6.h, z5.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" + ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n" + ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n" + ".inst 0x646d40fd // bfdot z29.s, z7.h, z5.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" + ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n" + ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n" + ".inst 0x646d40de // bfdot z30.s, z6.h, z5.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n" + ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n" + ".inst 0x646d40ff // bfdot z31.s, z7.h, z5.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" + ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" + ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" + ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n" + ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n" + ".inst 0x647540dc // bfdot z28.s, z6.h, z5.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" + ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" + ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" + ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n" + ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n" + ".inst 0x647540fd // bfdot z29.s, z7.h, z5.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" + ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" + ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" + ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n" + ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n" + ".inst 0x647540de // 
bfdot z30.s, z6.h, z5.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" + ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" + ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n" + ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n" + ".inst 0x647540ff // bfdot z31.s, z7.h, z5.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" + ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n" + ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n" + ".inst 0x647d40dc // bfdot z28.s, z6.h, z5.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" + ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n" + ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n" + ".inst 0x647d40fd // bfdot z29.s, z7.h, z5.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" + ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n" + ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n" + ".inst 0x647d40de // bfdot z30.s, z6.h, z5.h[3]\n" + ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" + ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n" + ".inst 0x647d40ff // bfdot z31.s, z7.h, z5.h[3]\n" + "bgt 80b\n" + "81:" // Height 6: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n" + "ld1rqh { z3.h }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n" + "ld1rqh { z4.h }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n" + "ld1rqh { z5.h }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n" + "add x20, x20, #0x10\n" + ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n" + ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n" + ".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n" + ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n" + ".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n" + ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n" + ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n" + ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n" + ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n" + ".inst 0x646540de // bfdot z30.s, z6.h, z5.h[0]\n" + ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n" + ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n" + ".inst 
0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n" + ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n" + ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n" + ".inst 0x646540ff // bfdot z31.s, z7.h, z5.h[0]\n" + "ble 82f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n" + ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n" + ".inst 0x646d40dc // bfdot z28.s, z6.h, z5.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n" + ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n" + ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n" + ".inst 0x646d40fd // bfdot z29.s, z7.h, z5.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n" + ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n" + ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n" + ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n" + ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n" + ".inst 0x646d40de // bfdot z30.s, z6.h, z5.h[1]\n" + ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n" + ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n" + ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n" + ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n" + ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n" + ".inst 0x646d40ff // bfdot z31.s, z7.h, z5.h[1]\n" + "ble 82f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x2\n" + ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n" + ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n" + ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n" + ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n" + ".inst 0x647540dc // bfdot z28.s, z6.h, z5.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n" + ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n" + ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n" + ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n" + ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n" + ".inst 0x647540fd // bfdot z29.s, z7.h, z5.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n" + ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n" + ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n" + ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n" + ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n" + ".inst 0x647540de // bfdot z30.s, z6.h, z5.h[2]\n" + ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n" + ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n" + ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n" + ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n" + ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n" + ".inst 0x647540ff // bfdot z31.s, z7.h, z5.h[2]\n" + "ble 82f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n" + ".inst 0x647c40d8 // bfdot z24.s, z6.h, 
z4.h[3]\n" + ".inst 0x647d40dc // bfdot z28.s, z6.h, z5.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n" + ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n" + ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n" + ".inst 0x647d40fd // bfdot z29.s, z7.h, z5.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n" + ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n" + ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n" + ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n" + ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n" + ".inst 0x647d40de // bfdot z30.s, z6.h, z5.h[3]\n" + ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n" + ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n" + ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n" + ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n" + ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n" + ".inst 0x647d40ff // bfdot z31.s, z7.h, z5.h[3]\n" + "82:" // Height 6: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 77b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 83f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "fmax z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmin z16.s, p5/M, z16.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z15.s, p5/M, z15.s, z1.s\n" + "fmax z16.s, p5/M, z16.s, z1.s\n" + "fmin z17.s, p5/M, z17.s, z0.s\n" + "fmin z18.s, p5/M, z18.s, z0.s\n" + "fmin z19.s, p5/M, z19.s, z0.s\n" + "fmin z20.s, p5/M, z20.s, z0.s\n" + "fmax z17.s, p5/M, z17.s, z1.s\n" + "fmax z18.s, p5/M, z18.s, z1.s\n" + "fmax z19.s, p5/M, z19.s, z1.s\n" + "fmax z20.s, p5/M, z20.s, z1.s\n" + "fmin z21.s, p5/M, z21.s, z0.s\n" + "fmin z22.s, p5/M, z22.s, z0.s\n" + "fmin z23.s, p5/M, z23.s, z0.s\n" + "fmin z24.s, p5/M, z24.s, z0.s\n" + "fmax z21.s, p5/M, z21.s, z1.s\n" + "fmax z22.s, p5/M, z22.s, z1.s\n" + "fmax z23.s, p5/M, z23.s, z1.s\n" + "fmax z24.s, p5/M, z24.s, z1.s\n" + "fmin z25.s, p5/M, z25.s, z0.s\n" + "fmin z26.s, p5/M, z26.s, z0.s\n" + "fmin z27.s, p5/M, z27.s, z0.s\n" + "fmin z28.s, p5/M, z28.s, z0.s\n" + "fmax z25.s, p5/M, z25.s, z1.s\n" + "fmax z26.s, p5/M, z26.s, z1.s\n" + "fmax z27.s, p5/M, z27.s, z1.s\n" + "fmax z28.s, p5/M, z28.s, z1.s\n" + "fmin z29.s, p5/M, z29.s, z0.s\n" + "fmin z30.s, p5/M, z30.s, z0.s\n" + "fmin z31.s, p5/M, z31.s, z0.s\n" + "fmax z29.s, p5/M, z29.s, z1.s\n" + "fmax z30.s, 
p5/M, z30.s, z1.s\n" + "fmax z31.s, p5/M, z31.s, z1.s\n" + "83:" // Height 6: No activation + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1w { z20.s }, p4, [x25]\n" + "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "st1w { z24.s }, p4, [x23]\n" + "st1w { z25.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x23, #3, MUL VL]\n" + "addvl x23, x23, #4\n" + "st1w { z28.s }, p4, [x21]\n" + "st1w { z29.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z30.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z31.s }, p1, [x21, #3, MUL VL]\n" + "addvl x21, x21, #4\n" + "84:" // Height 6: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 73b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 86f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 85f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "85:" // Update direct input + "mov x19, #0xc\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "86:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp deleted file mode 100644 index 641e5c12fd..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __ARM_FEATURE_SVE - -#include "../bfloat.hpp" -#include "../std_transforms_sve.hpp" - -namespace arm_gemm -{ - -// Actual kernel implementations -void sve_hybrid_bf16fp32_mmla_4VLx4(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool); - -class hybrid_bf16fp32_mmla_4VLx4 -{ -public: - typedef bfloat16 operand_type; - typedef float result_type; - - typedef void (*kern_type)(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool); - - /* Kernel blocking parameters */ - static constexpr unsigned int out_height() - { - return 8; - } - - static unsigned int out_width() - { - return get_vector_length<float>() * 2; - } - - static constexpr unsigned int k_unroll() - { - return 4; - } - - static constexpr bool supports_accumulate() - { - return true; - } - - static constexpr bool supports_bias() - { - return true; - } - - static constexpr bool supports_activation() - { - return true; - } - - StdTransformsSVE transforms = {}; - - // Default to the generic kernel - kern_type kernel=sve_hybrid_bf16fp32_mmla_4VLx4; - - hybrid_bf16fp32_mmla_4VLx4(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp deleted file mode 100644 index 76e3546c6f..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp +++ /dev/null @@ -1,3459 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software.
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef __ARM_FEATURE_SVE - -#include <algorithm> - -#include "arm_gemm.hpp" -#include "../../bfloat.hpp" -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void sve_hybrid_bf16fp32_mmla_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) { - const int K_stride = ((K + 3) / 4) * 4; - const long loops_count = ((K + 8) / 16) - 1; - K -= loops_count * 16; - const long regs_count = (K / 8) - 1; - K -= (regs_count + 1) * 8; - const long leftovers = K; - const long blocks_count = (K + 3) / 4; - float nullbias[128]; - if (!accumulate && !bias) { - memset(nullbias, 0, (2 * get_vector_length<float>() * sizeof(float))); - } - float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); - float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); - const float * const minptr = &minval; - const float * const maxptr = &maxval; - - switch(act.type) - { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - maxval = static_cast<float>(act.param1); - /* fall through */ - case Activation::Type::ReLU: - minval = 0.0f; - break; - } - - int rows_to_compute; - - for (int y=0; y<M; y+=rows_to_compute) { - const bfloat16 * const a_ptr0_base = A + (y * lda); - const unsigned long ldab = lda * sizeof(bfloat16); - - float *c_ptr0 = C + (y * ldc); - - rows_to_compute = M-y; - if (rows_to_compute > 8) { - if (rows_to_compute % 8) { - rows_to_compute = 8 - 1; - } else { - rows_to_compute = 8; - } - } - - for (int x0=0; x0<N; x0+=(2 * get_vector_length<float>())) { - const long width = std::min((unsigned long)N-x0, (2 * get_vector_length<float>())); - long loops = loops_count; - long regs = regs_count; - long temp = 0; - long blocks = blocks_count; - const bfloat16 *a_ptr0 = a_ptr0_base; - const bfloat16 *b_ptr0 = B + (K_stride * x0); - const unsigned long ldcb = ldc * sizeof(float); - const float *biasptr = bias ? 
bias+x0 : nullbias; - - switch(rows_to_compute) { - case 1: - __asm __volatile ( - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z1.h, #0\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "zip1 z18.s, z15.s, z15.s\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "mov z14.s, #0\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "mov z1.h, #0\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "mov z14.s, #0\n" - "zip1 z18.s, z13.s, z14.s\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z5.h, #0\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z0.d, z4.d, z5.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "mov z1.h, #0\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 
0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z5.h, #0\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z0.d, z4.d, z5.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "mov z1.h, #0\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "trn1 z8.d, z0.d, z1.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z5.h, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - "cbz %[blocks], 5f\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z14.h, 
p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp1 z1.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "zip1 z18.s, z15.s, z15.s\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip1 z18.s, z13.s, z14.s\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" -
"trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "trn1 z0.d, z4.d, z5.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "subs %[loops], %[loops], #0x1\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "trn1 z0.d, z4.d, z5.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - 
".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "trn1 z8.d, z0.d, z1.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "addvl a_ptr1, a_ptr1, #1\n" - "trn1 z0.d, z4.d, z5.d\n" - "cbz %[blocks], 5f\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "st1w z1.s, p0, [c_ptr1]\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" 
- "addvl %[c_ptr0], %[c_ptr0], #2\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z3.h, #0\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn1 z9.d, z2.d, z3.d\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "mov z20.d, z16.d\n" - "mov z21.d, z17.d\n" - "mov z22.d, z18.d\n" - "mov z23.d, z19.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "mov z3.h, #0\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip1 z18.s, z13.s, z14.s\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "mov z14.s, #0\n" - "zip1 z20.s, z13.s, z14.s\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "mov z14.s, #0\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "add a_ptr1, 
a_ptr1, #0x20\n" - "trn2 z8.d, z4.d, z5.d\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z7.h, #0\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "trn2 z9.d, z6.d, z7.d\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z1.d, z6.d, z7.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "mov z3.h, #0\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z9.d, z2.d, z3.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "trn2 z8.d, z4.d, z5.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov 
z7.h, #0\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "trn2 z9.d, z6.d, z7.d\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z1.d, z6.d, z7.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "mov z3.h, #0\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "trn1 z9.d, z2.d, z3.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - ".inst 0x6468e590 // bfmmla 
z16.s, z12.h, z8.h\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z7.h, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - "cbz %[blocks], 5f\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 
z4.s, z20.s, z21.s\n" - "uzp1 z5.s, z22.s, z23.s\n" - "st1w z4.s, p0, [c_ptr2]\n" - "st1w z5.s, p1, [c_ptr2, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov z21.d, z17.d\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "mov z22.d, z18.d\n" - "mov z23.d, z19.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z14.s, p0/z, [c_ptr3]\n" - "zip1 z20.s, z13.s, z14.s\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z5.h, p7/z, 
[a_ptr1]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "trn2 z8.d, z4.d, z5.d\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn2 z9.d, z6.d, z7.d\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z1.d, z6.d, z7.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z9.d, z2.d, z3.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh 
z3.h, p6/z, [a_ptr3, #0x10]\n" - "trn2 z8.d, z4.d, z5.d\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn2 z9.d, z6.d, z7.d\n" - "addvl a_ptr3, a_ptr3, #2\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z1.d, z6.d, z7.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "addvl a_ptr1, a_ptr1, #2\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "trn1 z9.d, z2.d, z3.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 
0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1rqh z7.h, p6/z, [a_ptr3]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - "cbz %[blocks], 5f\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, 
z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp2 z5.s, z20.s, z21.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "st1w z4.s, p0, [c_ptr2]\n" - "uzp2 z7.s, z22.s, z23.s\n" - "st1w z5.s, p0, [c_ptr3]\n" - "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" - "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - case 5: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "c_ptr1 .req X4\n" - "c_ptr2 .req X5\n" - "c_ptr3 .req X6\n" - "c_ptr4 .req X7\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z5.h, #0\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "ld1rqh z4.h, p7/z, [a_ptr4]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov z21.d, z17.d\n" - "add a_ptr4, a_ptr4, #0x10\n" - "mov z22.d, z18.d\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "mov z23.d, z19.d\n" - "mov z24.d, z16.d\n" - "mov z25.d, z17.d\n" - "mov z26.d, z18.d\n" - "mov z27.d, z19.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "mov z5.h, #0\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 
z16.s, z13.s, z14.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1rqh z4.h, p7/z, [a_ptr4]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z14.s, p0/z, [c_ptr3]\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "add a_ptr4, a_ptr4, #0x10\n" - "zip1 z20.s, z13.s, z14.s\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr4]\n" - "mov z14.s, #0\n" - "zip1 z24.s, z13.s, z14.s\n" - "zip2 z25.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n" - "mov z14.s, #0\n" - "zip1 z26.s, z13.s, z14.s\n" - "zip2 z27.s, z13.s, z14.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z8.h, p7/z, [a_ptr4]\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z9.h, #0\n" - "add a_ptr4, a_ptr4, #0x20\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "trn2 z10.d, z8.d, z9.d\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z2.d, z8.d, z9.d\n" - "addvl %[b_ptr0], %[b_ptr0], 
#16\n" - "trn2 z9.d, z6.d, z7.d\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqh z4.h, p7/z, [a_ptr4, #-0x10]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "mov z5.h, #0\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z10.d, z4.d, z5.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl a_ptr3, a_ptr3, #2\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z8.h, p7/z, [a_ptr4]\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z9.h, #0\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "trn2 z10.d, z8.d, z9.d\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - 
".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z2.d, z8.d, z9.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z9.d, z6.d, z7.d\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqh z4.h, p6/z, [a_ptr4, #0x10]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "addvl a_ptr4, a_ptr4, #2\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "mov z5.h, #0\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "trn1 z10.d, z4.d, z5.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" 
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z7.h, p6/z, [a_ptr3]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z8.h, p6/z, [a_ptr4]\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "addvl a_ptr4, a_ptr4, #1\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z9.h, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "trn1 z2.d, z8.d, z9.d\n" - "cbz %[blocks], 5f\n" - "trn2 z10.d, z8.d, z9.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - ".inst 0x6462e598 // bfmmla 
z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp2 z5.s, z20.s, z21.s\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "st1w z4.s, p0, [c_ptr2]\n" - "uzp2 z7.s, z22.s, z23.s\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "st1w z5.s, p0, [c_ptr3]\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "uzp1 z8.s, z24.s, z25.s\n" - "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" - "uzp1 z9.s, z26.s, z27.s\n" - "st1w z8.s, p0, [c_ptr4]\n" - "st1w z9.s, p1, [c_ptr4, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory" - ); - break; - case 6: - __asm __volatile ( - "a_ptr1 .req 
X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "c_ptr1 .req X5\n" - "c_ptr2 .req X6\n" - "c_ptr3 .req X7\n" - "c_ptr4 .req X8\n" - "c_ptr5 .req X9\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "ld1rqh z4.h, p7/z, [a_ptr4]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1rqh z5.h, p7/z, [a_ptr5]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z20.d, z16.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z21.d, z17.d\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov z22.d, z18.d\n" - "add a_ptr4, a_ptr4, #0x10\n" - "mov z23.d, z19.d\n" - "add a_ptr5, a_ptr5, #0x10\n" - "mov z24.d, z16.d\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "mov z25.d, z17.d\n" - "mov z26.d, z18.d\n" - "mov z27.d, z19.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1rqh z4.h, p7/z, [a_ptr4]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1rqh z5.h, p7/z, [a_ptr5]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z14.s, p0/z, [c_ptr3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "add a_ptr4, a_ptr4, #0x10\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip1 z20.s, z13.s, z14.s\n" - "add a_ptr5, a_ptr5, #0x10\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr4]\n" - "ld1w z14.s, p0/z, [c_ptr5]\n" - "zip1 z24.s, z13.s, z14.s\n" - "zip2 z25.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n" - "zip1 z26.s, z13.s, z14.s\n" - "zip2 z27.s, z13.s, z14.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - "trn2 z1.d, z2.d, z3.d\n" - 
"ld1rqh z7.h, p7/z, [a_ptr3]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z8.h, p7/z, [a_ptr4]\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1rqh z9.h, p7/z, [a_ptr5]\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn2 z10.d, z8.d, z9.d\n" - "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "add a_ptr4, a_ptr4, #0x20\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "add a_ptr5, a_ptr5, #0x20\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z2.d, z8.d, z9.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z9.d, z6.d, z7.d\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqh z4.h, p7/z, [a_ptr4, #-0x10]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z5.h, p7/z, [a_ptr5, #-0x10]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "trn1 
z8.d, z0.d, z1.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z10.d, z4.d, z5.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl a_ptr3, a_ptr3, #2\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z8.h, p7/z, [a_ptr4]\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1rqh z9.h, p7/z, [a_ptr5]\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn2 z10.d, z8.d, z9.d\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z2.d, z8.d, z9.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z9.d, z6.d, z7.d\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqh z4.h, p6/z, [a_ptr4, #0x10]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z5.h, p6/z, [a_ptr5, #0x10]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "addvl a_ptr4, a_ptr4, #2\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "addvl a_ptr5, a_ptr5, #2\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla 
z23.s, z15.h, z1.h\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "addvl a_ptr2, a_ptr2, #2\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "trn1 z10.d, z4.d, z5.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z7.h, p6/z, [a_ptr3]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl 
a_ptr1, a_ptr1, #1\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z8.h, p6/z, [a_ptr4]\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "addvl a_ptr4, a_ptr4, #1\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1rqh z9.h, p6/z, [a_ptr5]\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "addvl a_ptr5, a_ptr5, #1\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "trn1 z2.d, z8.d, z9.d\n" - "cbz %[blocks], 5f\n" - "trn2 z10.d, z8.d, z9.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - 
"fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp2 z5.s, z20.s, z21.s\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "st1w z4.s, p0, [c_ptr2]\n" - "uzp2 z7.s, z22.s, z23.s\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "st1w z5.s, p0, [c_ptr3]\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "uzp1 z8.s, z24.s, z25.s\n" - "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" - "uzp2 z9.s, z24.s, z25.s\n" - "uzp1 z10.s, z26.s, z27.s\n" - "uzp2 z11.s, z26.s, z27.s\n" - "st1w z8.s, p0, [c_ptr4]\n" - "st1w z9.s, p0, [c_ptr5]\n" - "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n" - "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory" - ); - break; - case 7: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "c_ptr1 .req X6\n" - "c_ptr2 .req X7\n" - "c_ptr3 .req X8\n" - "c_ptr4 .req X9\n" - "c_ptr5 .req X10\n" - "c_ptr6 .req X11\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z7.h, #0\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, 
p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "ld1rqh z4.h, p7/z, [a_ptr4]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1rqh z5.h, p7/z, [a_ptr5]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1rqh z6.h, p7/z, [a_ptr6]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "mov z20.d, z16.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z11.d, z6.d, z7.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z21.d, z17.d\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov z22.d, z18.d\n" - "add a_ptr4, a_ptr4, #0x10\n" - "mov z23.d, z19.d\n" - "add a_ptr5, a_ptr5, #0x10\n" - "mov z24.d, z16.d\n" - "add a_ptr6, a_ptr6, #0x10\n" - "mov z25.d, z17.d\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "mov z26.d, z18.d\n" - "mov z27.d, z19.d\n" - "mov z28.d, z16.d\n" - "mov z29.d, z17.d\n" - "mov z30.d, z18.d\n" - "mov z31.d, z19.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "mov z7.h, #0\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1rqh z4.h, p7/z, [a_ptr4]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1rqh z5.h, p7/z, [a_ptr5]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z14.s, p0/z, [c_ptr3]\n" - "ld1rqh z6.h, p7/z, [a_ptr6]\n" - "add a_ptr4, a_ptr4, #0x10\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "zip1 z20.s, z13.s, z14.s\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "trn1 z11.d, z6.d, z7.d\n" - "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "add a_ptr5, a_ptr5, #0x10\n" - "add a_ptr6, a_ptr6, #0x10\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr4]\n" - "ld1w z14.s, p0/z, [c_ptr5]\n" - "zip1 z24.s, z13.s, z14.s\n" - "zip2 z25.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n" - "zip1 z26.s, z13.s, z14.s\n" - "zip2 z27.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr6]\n" - "mov z14.s, #0\n" - "zip1 z28.s, z13.s, z14.s\n" - "zip2 z29.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n" - "mov z14.s, #0\n" - "zip1 z30.s, z13.s, z14.s\n" - "zip2 z31.s, z13.s, z14.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "subs %[loops], %[loops], #0x1\n" - "trn2 z1.d, z2.d, z3.d\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - ".inst 0x6468e5d2 
// bfmmla z18.s, z14.h, z8.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z8.h, p7/z, [a_ptr4]\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1rqh z9.h, p7/z, [a_ptr5]\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - "add a_ptr4, a_ptr4, #0x20\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - "add a_ptr5, a_ptr5, #0x20\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1rqh z10.h, p7/z, [a_ptr6]\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z11.h, #0\n" - "add a_ptr6, a_ptr6, #0x20\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "trn1 z2.d, z8.d, z9.d\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z3.d, z10.d, z11.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z11.d, z10.d, z11.d\n" - "trn2 z10.d, z8.d, z9.d\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1rqh z6.h, p7/z, [a_ptr6, #-0x10]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqh z4.h, p7/z, [a_ptr4, #-0x10]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z5.h, p7/z, [a_ptr5, #-0x10]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6463e5bd // bfmmla 
z29.s, z13.h, z3.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "mov z7.h, #0\n" - "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "trn1 z10.d, z4.d, z5.d\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z11.d, z6.d, z7.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z8.h, p7/z, [a_ptr4]\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1rqh z9.h, p7/z, [a_ptr5]\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1rqh z10.h, p7/z, [a_ptr6]\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z11.h, #0\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - 
".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "trn1 z2.d, z8.d, z9.d\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z3.d, z10.d, z11.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z11.d, z10.d, z11.d\n" - "trn2 z10.d, z8.d, z9.d\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1rqh z6.h, p6/z, [a_ptr6, #0x10]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqh z4.h, p6/z, [a_ptr4, #0x10]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z5.h, p6/z, [a_ptr5, #0x10]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "addvl a_ptr4, a_ptr4, #2\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "addvl a_ptr5, a_ptr5, #2\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - "addvl a_ptr6, a_ptr6, #2\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "mov z7.h, #0\n" - "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl a_ptr3, a_ptr3, #2\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "trn1 z10.d, z4.d, z5.d\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "trn1 z11.d, z6.d, z7.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6468e590 // 
bfmmla z16.s, z12.h, z8.h\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1rqh z7.h, p6/z, [a_ptr3]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z8.h, p6/z, [a_ptr4]\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1rqh z9.h, p6/z, [a_ptr5]\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - "addvl a_ptr4, a_ptr4, #1\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - "addvl a_ptr5, a_ptr5, #1\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1rqh z10.h, p6/z, [a_ptr6]\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z11.h, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 
0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "addvl a_ptr6, a_ptr6, #1\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "trn1 z2.d, z8.d, z9.d\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "trn1 z3.d, z10.d, z11.d\n" - "cbz %[blocks], 5f\n" - "trn2 z11.d, z10.d, z11.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn2 z10.d, z8.d, z9.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin 
z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp2 z5.s, z20.s, z21.s\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "st1w z4.s, p0, [c_ptr2]\n" - "uzp2 z7.s, z22.s, z23.s\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "st1w z5.s, p0, [c_ptr3]\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "fmax z28.s, p7/m, z28.s, z14.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "fmin z28.s, p7/m, z28.s, z15.s\n" - "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" - "uzp1 z8.s, z24.s, z25.s\n" - "uzp2 z9.s, z24.s, z25.s\n" - "uzp1 z10.s, z26.s, z27.s\n" - "uzp2 z11.s, z26.s, z27.s\n" - "st1w z8.s, p0, [c_ptr4]\n" - "fmax z29.s, p7/m, z29.s, z14.s\n" - "fmax z30.s, p7/m, z30.s, z14.s\n" - "fmax z31.s, p7/m, z31.s, z14.s\n" - "st1w z9.s, p0, [c_ptr5]\n" - "fmin z29.s, p7/m, z29.s, z15.s\n" - "fmin z30.s, p7/m, z30.s, z15.s\n" - "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n" - "fmin z31.s, p7/m, z31.s, z15.s\n" - "uzp1 z12.s, z28.s, z29.s\n" - "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n" - "uzp1 z13.s, z30.s, z31.s\n" - "st1w z12.s, p0, [c_ptr6]\n" - "st1w z13.s, p1, [c_ptr6, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "cc", "memory" - ); - break; - default: - case 8: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, 
%[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "ld1rqh z4.h, p7/z, [a_ptr4]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1rqh z5.h, p7/z, [a_ptr5]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1rqh z6.h, p7/z, [a_ptr6]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1rqh z7.h, p7/z, [a_ptr7]\n" - "mov z20.d, z16.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z21.d, z17.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z11.d, z6.d, z7.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z22.d, z18.d\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov z23.d, z19.d\n" - "add a_ptr4, a_ptr4, #0x10\n" - "mov z24.d, z16.d\n" - "add a_ptr5, a_ptr5, #0x10\n" - "mov z25.d, z17.d\n" - "add a_ptr6, a_ptr6, #0x10\n" - "mov z26.d, z18.d\n" - "add a_ptr7, a_ptr7, #0x10\n" - "mov z27.d, z19.d\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "mov z28.d, z16.d\n" - "mov z29.d, z17.d\n" - "mov z30.d, z18.d\n" - "mov z31.d, z19.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1rqh z4.h, p7/z, [a_ptr4]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1rqh z5.h, p7/z, [a_ptr5]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z14.s, p0/z, [c_ptr3]\n" - "ld1rqh z6.h, p7/z, [a_ptr6]\n" - "add a_ptr4, a_ptr4, #0x10\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1rqh z7.h, p7/z, [a_ptr7]\n" - "zip1 z20.s, z13.s, z14.s\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "add a_ptr5, a_ptr5, #0x10\n" - "trn1 z11.d, z6.d, z7.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "add a_ptr6, a_ptr6, #0x10\n" - "zip1 z22.s, z13.s, z14.s\n" - "add a_ptr7, a_ptr7, #0x10\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr4]\n" - "ld1w z14.s, p0/z, [c_ptr5]\n" - "zip1 z24.s, z13.s, z14.s\n" - "zip2 z25.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n" - "zip1 z26.s, z13.s, z14.s\n" - "zip2 z27.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr6]\n" - "ld1w z14.s, p0/z, [c_ptr7]\n" - "zip1 z28.s, z13.s, z14.s\n" - "zip2 z29.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr7, #1, MUL VL]\n" - "zip1 z30.s, z13.s, z14.s\n" - "zip2 z31.s, z13.s, z14.s\n" - "ld1h z13.h, p7/z, 
[%[b_ptr0], #1, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "subs %[loops], %[loops], #0x1\n" - "trn2 z1.d, z2.d, z3.d\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z8.h, p7/z, [a_ptr4]\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1rqh z9.h, p7/z, [a_ptr5]\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - "add a_ptr4, a_ptr4, #0x20\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - "add a_ptr5, a_ptr5, #0x20\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1rqh z10.h, p7/z, [a_ptr6]\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z11.h, p7/z, [a_ptr7]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "add a_ptr6, a_ptr6, #0x20\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "add a_ptr7, a_ptr7, #0x20\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "trn1 z2.d, z8.d, z9.d\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z3.d, z10.d, z11.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z11.d, z10.d, z11.d\n" - "trn2 z10.d, z8.d, z9.d\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1rqh z6.h, p7/z, [a_ptr6, #-0x10]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqh z4.h, p7/z, [a_ptr4, #-0x10]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z5.h, p7/z, [a_ptr5, #-0x10]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "ld1rqh z7.h, p7/z, [a_ptr7, #-0x10]\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], 
#-0x10]\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "trn1 z10.d, z4.d, z5.d\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z11.d, z6.d, z7.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z8.h, p7/z, [a_ptr4]\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1rqh z9.h, p7/z, [a_ptr5]\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1rqh z10.h, p7/z, [a_ptr6]\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e590 // 
bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z11.h, p7/z, [a_ptr7]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "trn1 z2.d, z8.d, z9.d\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z3.d, z10.d, z11.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z11.d, z10.d, z11.d\n" - "trn2 z10.d, z8.d, z9.d\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1rqh z6.h, p6/z, [a_ptr6, #0x10]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqh z4.h, p6/z, [a_ptr4, #0x10]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z5.h, p6/z, [a_ptr5, #0x10]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "ld1rqh z7.h, p6/z, [a_ptr7, #0x10]\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "addvl a_ptr4, a_ptr4, #2\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - "addvl a_ptr5, a_ptr5, #2\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - "addvl a_ptr6, a_ptr6, #2\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - "addvl a_ptr7, a_ptr7, #2\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "addvl a_ptr3, a_ptr3, #2\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da 
// bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "trn1 z10.d, z4.d, z5.d\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "trn1 z11.d, z6.d, z7.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - "ld1rqh z7.h, p6/z, [a_ptr3]\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - "ld1rqh z8.h, p6/z, [a_ptr4]\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - "ld1rqh z9.h, p6/z, [a_ptr5]\n" - ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n" - "addvl a_ptr4, a_ptr4, 
#1\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - "addvl a_ptr5, a_ptr5, #1\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - "ld1rqh z10.h, p6/z, [a_ptr6]\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "ld1rqh z11.h, p6/z, [a_ptr7]\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - "addvl a_ptr6, a_ptr6, #1\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - "addvl a_ptr7, a_ptr7, #1\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - "trn1 z2.d, z8.d, z9.d\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "trn1 z3.d, z10.d, z11.d\n" - "cbz %[blocks], 5f\n" - "trn2 z11.d, z10.d, z11.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0]]\n" - "trn2 z10.d, z8.d, z9.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n" - ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n" - ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n" - ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n" - ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n" - ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n" - ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n" - ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n" - ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n" - ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n" - ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n" - ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n" - ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n" - ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n" - ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n" - ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n" - ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n" - ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n" - ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n" - ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n" - ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n" - ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n" - ".inst 0x646ae598 // bfmmla z24.s, 
z12.h, z10.h\n" - ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n" - ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n" - ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n" - ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n" - ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n" - ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n" - ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp2 z5.s, z20.s, z21.s\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "st1w z4.s, p0, [c_ptr2]\n" - "uzp2 z7.s, z22.s, z23.s\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "st1w z5.s, p0, [c_ptr3]\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "fmax z28.s, p7/m, z28.s, z14.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "fmin z28.s, p7/m, z28.s, z15.s\n" - "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" - "uzp1 z8.s, z24.s, z25.s\n" - "uzp2 z9.s, z24.s, z25.s\n" - "uzp1 z10.s, z26.s, z27.s\n" - "uzp2 z11.s, z26.s, z27.s\n" - "st1w z8.s, p0, [c_ptr4]\n" - "fmax z29.s, p7/m, z29.s, z14.s\n" - "fmax z30.s, p7/m, z30.s, z14.s\n" - "fmax z31.s, p7/m, z31.s, z14.s\n" - "st1w z9.s, p0, [c_ptr5]\n" - "fmin z29.s, p7/m, z29.s, z15.s\n" - "fmin z30.s, p7/m, z30.s, z15.s\n" - "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n" - "fmin z31.s, p7/m, z31.s, z15.s\n" - "uzp1 z12.s, z28.s, z29.s\n" - "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n" - "uzp2 z13.s, z28.s, z29.s\n" - "uzp1 z14.s, z30.s, z31.s\n" - "uzp2 z15.s, z30.s, z31.s\n" - "st1w z12.s, p0, [c_ptr6]\n" - "st1w z13.s, p0, [c_ptr7]\n" - "st1w z14.s, p1, [c_ptr6, #1, MUL VL]\n" - "st1w z15.s, p1, [c_ptr7, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", 
"z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc", "memory" - ); - break; - } - - } - } -} - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp deleted file mode 100644 index bd457e9d27..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __ARM_FEATURE_SVE - -#include "../bfloat.hpp" -#include "../std_transforms_sve.hpp" - -namespace arm_gemm -{ - -// Actual kernel implementations -void sve_hybrid_bf16fp32_mmla_6VLx2(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool); - -class hybrid_bf16fp32_mmla_6VLx2 -{ -public: - typedef bfloat16 operand_type; - typedef float result_type; - - typedef void (*kern_type)(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool); - - /* Kernel blocking parameters */ - static constexpr unsigned int out_height() - { - return 4; - } - - static unsigned int out_width() - { - return get_vector_length() * 3; - } - - static constexpr unsigned int k_unroll() - { - return 4; - } - - static constexpr bool supports_accumulate() - { - return true; - } - - static constexpr bool supports_bias() - { - return true; - } - - static constexpr bool supports_activation() - { - return true; - } - - StdTransformsSVE transforms = {}; - - // Default to the generic kernel - kern_type kernel=sve_hybrid_bf16fp32_mmla_6VLx2; - - hybrid_bf16fp32_mmla_6VLx2(const CPUInfo *) - { - - } -}; - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp deleted file mode 100644 index 59dc6dc540..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp +++ /dev/null @@ -1,1633 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. 
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_bf16fp32_mmla_6VLx2(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
-    const int K_stride = ((K + 3) / 4) * 4;
-    const long loops_count = ((K + 8) / 16) - 1;
-    K -= loops_count * 16;
-    const long regs_count = (K / 8) - 1;
-    K -= (regs_count + 1) * 8;
-    const long leftovers = K;
-    const long blocks_count = (K + 3) / 4;
-    float nullbias[192];
-    if (!accumulate && !bias) {
-        memset(nullbias, 0, (3 * get_vector_length<float>() * sizeof(float)));
-    }
-    float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
-    float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
-    const float * const minptr = &minval;
-    const float * const maxptr = &maxval;
-
-    switch(act.type)
-    {
-        default:
-        case Activation::Type::None:
-            break;
-        case Activation::Type::BoundedReLU:
-            maxval = static_cast<float>(act.param1);
-            /* fall through */
-        case Activation::Type::ReLU:
-            minval = 0.0f;
-            break;
-    }
-
-    int rows_to_compute;
-
-    for (int y=0; y<M; y+=rows_to_compute) {
-        const bfloat16 * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(bfloat16);
-
-        float *c_ptr0 = C + (y * ldc);
-
-        rows_to_compute = M-y;
-        if (rows_to_compute > 4) {
-            if (rows_to_compute % 4) {
-                rows_to_compute = 4 - 1;
-            } else {
-                rows_to_compute = 4;
-            }
-        }
-
-        for (int x0=0; x0<N; x0+=(3 * get_vector_length<float>())) {
-            const long width = std::min((unsigned long)N-x0, (3 * get_vector_length<float>()));
-            long loops = loops_count;
-            long regs = regs_count;
-            long temp = 0;
-            long blocks = blocks_count;
-            const bfloat16 *a_ptr0 = a_ptr0_base;
-            const bfloat16 *b_ptr0 = B + (K_stride * x0);
-            const unsigned long ldcb = ldc * sizeof(float);
-            const float *biasptr = bias ?
bias+x0 : nullbias; - - switch(rows_to_compute) { - case 1: - __asm __volatile ( - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z1.h, #0\n" - "ld1w z19.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "zip1 z20.s, z19.s, z19.s\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip2 z21.s, z19.s, z19.s\n" - "ld1w z19.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "zip1 z22.s, z19.s, z19.s\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "zip2 z23.s, z19.s, z19.s\n" - "ld1w z19.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "zip1 z24.s, z19.s, z19.s\n" - "zip2 z25.s, z19.s, z19.s\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "mov z18.s, #0\n" - "ld1w z17.s, p0/z, [%[c_ptr0]]\n" - "mov z1.h, #0\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "zip1 z20.s, z17.s, z18.s\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip2 z21.s, z17.s, z18.s\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z18.s, #0\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "zip1 z22.s, z17.s, z18.s\n" - "zip2 z23.s, z17.s, z18.s\n" - "ld1w z17.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "mov z18.s, #0\n" - "zip1 z24.s, z17.s, z18.s\n" - "zip2 z25.s, z17.s, z18.s\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z3.h, #0\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn2 z4.d, z2.d, z3.d\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "trn1 z0.d, z2.d, z3.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "mov z1.h, #0\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6460e4f5 // 
bfmmla z21.s, z7.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z3.h, #0\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "trn2 z4.d, z2.d, z3.d\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn1 z0.d, z2.d, z3.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z1.h, #0\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 
0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "b.eq 5f\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #12\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z3.h, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - "cbz %[blocks], 5f\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "b.eq 5f\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - 
"ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "5:\n" - "ld1rw z18.s, p7/z, [%[minptr]]\n" - "ld1rw z19.s, p7/z, [%[maxptr]]\n" - "fmax z20.s, p7/m, z20.s, z18.s\n" - "fmax z21.s, p7/m, z21.s, z18.s\n" - "fmax z22.s, p7/m, z22.s, z18.s\n" - "fmax z23.s, p7/m, z23.s, z18.s\n" - "fmin z20.s, p7/m, z20.s, z19.s\n" - "fmin z21.s, p7/m, z21.s, z19.s\n" - "fmin z22.s, p7/m, z22.s, z19.s\n" - "fmin z23.s, p7/m, z23.s, z19.s\n" - "fmax z24.s, p7/m, z24.s, z18.s\n" - "uzp1 z0.s, z20.s, z21.s\n" - "fmax z25.s, p7/m, z25.s, z18.s\n" - "uzp1 z1.s, z22.s, z23.s\n" - "fmin z24.s, p7/m, z24.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z25.s, p7/m, z25.s, z19.s\n" - "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "uzp1 z2.s, z24.s, z25.s\n" - "st1w z2.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z19.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z20.s, z19.s, z19.s\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "zip2 z21.s, z19.s, z19.s\n" - "ld1w z19.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip1 z22.s, z19.s, z19.s\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "zip2 z23.s, z19.s, z19.s\n" - "ld1w z19.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "zip1 z24.s, z19.s, z19.s\n" - "zip2 z25.s, z19.s, z19.s\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z17.s, p0/z, [%[c_ptr0]]\n" - "ld1w z18.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z20.s, z17.s, z18.s\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "zip2 z21.s, z17.s, z18.s\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1w z18.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip1 z22.s, z17.s, z18.s\n" - "ld1h 
z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "zip2 z23.s, z17.s, z18.s\n" - "ld1w z17.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z18.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "zip1 z24.s, z17.s, z18.s\n" - "zip2 z25.s, z17.s, z18.s\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1rqh z3.h, p7/z, [a_ptr1]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - "subs %[loops], %[loops], #0x1\n" - "trn2 z4.d, z2.d, z3.d\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - "trn1 z0.d, z2.d, z3.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 
0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1rqh z3.h, p7/z, [a_ptr1]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - "trn2 z4.d, z2.d, z3.d\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z0.d, z2.d, z3.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "b.eq 5f\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #12\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z2.h, 
p6/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1rqh z3.h, p6/z, [a_ptr1]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - "cbz %[blocks], 5f\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "b.eq 5f\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "5:\n" - "ld1rw z18.s, p7/z, [%[minptr]]\n" - "ld1rw z19.s, p7/z, [%[maxptr]]\n" - "fmax z20.s, p7/m, z20.s, z18.s\n" - "fmax z21.s, p7/m, z21.s, z18.s\n" - "fmax z22.s, p7/m, z22.s, z18.s\n" - "fmax z23.s, p7/m, z23.s, z18.s\n" - "fmin z20.s, p7/m, z20.s, z19.s\n" - "fmin z21.s, p7/m, z21.s, z19.s\n" - "fmin z22.s, p7/m, z22.s, z19.s\n" - "fmin z23.s, p7/m, z23.s, z19.s\n" - "fmax z24.s, p7/m, z24.s, z18.s\n" - "uzp1 z0.s, z20.s, z21.s\n" - "uzp2 z1.s, z20.s, z21.s\n" - "uzp1 z2.s, z22.s, z23.s\n" - "uzp2 z3.s, z22.s, z23.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z24.s, p7/m, z24.s, z19.s\n" - "fmax z25.s, p7/m, z25.s, z18.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmin z25.s, p7/m, z25.s, z19.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "uzp1 z4.s, z24.s, z25.s\n" - "uzp2 z5.s, z24.s, z25.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #3\n" - "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), 
[blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z3.h, #0\n" - "ld1w z19.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z20.s, z19.s, z19.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z21.s, z19.s, z19.s\n" - "ld1w z19.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z22.s, z19.s, z19.s\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "zip2 z23.s, z19.s, z19.s\n" - "ld1w z19.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "trn1 z5.d, z2.d, z3.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z26.d, z20.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z27.d, z21.d\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "zip1 z24.s, z19.s, z19.s\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "zip2 z25.s, z19.s, z19.s\n" - "mov z28.d, z22.d\n" - "mov z29.d, z23.d\n" - "mov z30.d, z24.d\n" - "mov z31.d, z25.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "mov z3.h, #0\n" - "ld1w z17.s, p0/z, [%[c_ptr0]]\n" - "ld1w z18.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z20.s, z17.s, z18.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z21.s, z17.s, z18.s\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1w z18.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "trn1 z5.d, z2.d, z3.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip1 z22.s, z17.s, z18.s\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "zip2 z23.s, z17.s, z18.s\n" - "ld1w z17.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z18.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "zip1 z24.s, z17.s, z18.s\n" - "zip2 z25.s, z17.s, z18.s\n" - "ld1w z17.s, p0/z, [c_ptr2]\n" - "mov z18.s, #0\n" - "zip1 z26.s, z17.s, z18.s\n" - "zip2 z27.s, z17.s, z18.s\n" - "ld1w z17.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "mov z18.s, #0\n" - "zip1 z28.s, z17.s, z18.s\n" - "zip2 z29.s, z17.s, z18.s\n" - "ld1w z17.s, p2/z, [c_ptr2, #2, MUL VL]\n" - "mov z18.s, #0\n" - "zip1 z30.s, z17.s, z18.s\n" - "zip2 z31.s, z17.s, z18.s\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, 
z1.d\n" - "subs %[loops], %[loops], #0x1\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1rqh z3.h, p7/z, [a_ptr1]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1rqh z4.h, p7/z, [a_ptr2]\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z5.h, #0\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "trn1 z1.d, z4.d, z5.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "trn2 z5.d, z4.d, z5.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z3.h, #0\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla 
z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z5.d, z2.d, z3.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1rqh z3.h, p7/z, [a_ptr1]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1rqh z4.h, p7/z, [a_ptr2]\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z5.h, #0\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z1.d, z4.d, z5.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn2 z5.d, z4.d, z5.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, 
z1.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "mov z3.h, #0\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "trn1 z5.d, z2.d, z3.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "b.eq 5f\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #12\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1rqh z3.h, p6/z, [a_ptr1]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "addvl a_ptr1, 
a_ptr1, #1\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1rqh z4.h, p6/z, [a_ptr2]\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z5.h, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "trn1 z1.d, z4.d, z5.d\n" - "cbz %[blocks], 5f\n" - "trn2 z5.d, z4.d, z5.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "b.eq 5f\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "5:\n" - "ld1rw z18.s, p7/z, [%[minptr]]\n" - 
"ld1rw z19.s, p7/z, [%[maxptr]]\n" - "fmax z20.s, p7/m, z20.s, z18.s\n" - "fmax z21.s, p7/m, z21.s, z18.s\n" - "fmax z22.s, p7/m, z22.s, z18.s\n" - "fmax z23.s, p7/m, z23.s, z18.s\n" - "fmin z20.s, p7/m, z20.s, z19.s\n" - "fmin z21.s, p7/m, z21.s, z19.s\n" - "fmin z22.s, p7/m, z22.s, z19.s\n" - "fmin z23.s, p7/m, z23.s, z19.s\n" - "fmax z24.s, p7/m, z24.s, z18.s\n" - "uzp1 z0.s, z20.s, z21.s\n" - "uzp2 z1.s, z20.s, z21.s\n" - "uzp1 z2.s, z22.s, z23.s\n" - "uzp2 z3.s, z22.s, z23.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z24.s, p7/m, z24.s, z19.s\n" - "fmax z25.s, p7/m, z25.s, z18.s\n" - "fmax z26.s, p7/m, z26.s, z18.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z27.s, p7/m, z27.s, z18.s\n" - "fmax z28.s, p7/m, z28.s, z18.s\n" - "fmin z25.s, p7/m, z25.s, z19.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z19.s\n" - "fmin z27.s, p7/m, z27.s, z19.s\n" - "fmin z28.s, p7/m, z28.s, z19.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z24.s, z25.s\n" - "uzp2 z5.s, z24.s, z25.s\n" - "uzp1 z6.s, z26.s, z27.s\n" - "fmax z29.s, p7/m, z29.s, z18.s\n" - "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmax z30.s, p7/m, z30.s, z18.s\n" - "addvl %[c_ptr0], %[c_ptr0], #3\n" - "fmax z31.s, p7/m, z31.s, z18.s\n" - "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n" - "fmin z29.s, p7/m, z29.s, z19.s\n" - "fmin z30.s, p7/m, z30.s, z19.s\n" - "fmin z31.s, p7/m, z31.s, z19.s\n" - "st1w z6.s, p0, [c_ptr2]\n" - "uzp1 z7.s, z28.s, z29.s\n" - "uzp1 z8.s, z30.s, z31.s\n" - "st1w z7.s, p1, [c_ptr2, #1, MUL VL]\n" - "st1w z8.s, p2, [c_ptr2, #2, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z19.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z20.s, z19.s, z19.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z21.s, z19.s, z19.s\n" - "ld1w z19.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z22.s, z19.s, z19.s\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip2 z23.s, z19.s, z19.s\n" - "ld1w z19.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "trn1 
z5.d, z2.d, z3.d\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z26.d, z20.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z27.d, z21.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "zip1 z24.s, z19.s, z19.s\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "zip2 z25.s, z19.s, z19.s\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov z28.d, z22.d\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "mov z29.d, z23.d\n" - "mov z30.d, z24.d\n" - "mov z31.d, z25.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z17.s, p0/z, [%[c_ptr0]]\n" - "ld1w z18.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z20.s, z17.s, z18.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z21.s, z17.s, z18.s\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1w z18.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "zip1 z22.s, z17.s, z18.s\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip2 z23.s, z17.s, z18.s\n" - "ld1w z17.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "trn1 z5.d, z2.d, z3.d\n" - "ld1w z18.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "zip1 z24.s, z17.s, z18.s\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "zip2 z25.s, z17.s, z18.s\n" - "ld1w z17.s, p0/z, [c_ptr2]\n" - "ld1w z18.s, p0/z, [c_ptr3]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "zip1 z26.s, z17.s, z18.s\n" - "zip2 z27.s, z17.s, z18.s\n" - "ld1w z17.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z18.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "zip1 z28.s, z17.s, z18.s\n" - "zip2 z29.s, z17.s, z18.s\n" - "ld1w z17.s, p2/z, [c_ptr2, #2, MUL VL]\n" - "ld1w z18.s, p2/z, [c_ptr3, #2, MUL VL]\n" - "zip1 z30.s, z17.s, z18.s\n" - "zip2 z31.s, z17.s, z18.s\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "subs %[loops], %[loops], #0x1\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1rqh z3.h, p7/z, [a_ptr1]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1rqh z4.h, p7/z, [a_ptr2]\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1rqh z5.h, p7/z, [a_ptr3]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla 
z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "trn1 z1.d, z4.d, z5.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "trn2 z5.d, z4.d, z5.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z5.d, z2.d, z3.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1rqh z3.h, p7/z, [a_ptr1]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1rqh z4.h, p7/z, [a_ptr2]\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, 
[%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1rqh z5.h, p7/z, [a_ptr3]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z1.d, z4.d, z5.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn2 z5.d, z4.d, z5.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - "addvl a_ptr3, a_ptr3, #2\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "trn1 z5.d, z2.d, z3.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "trn2 
z1.d, z2.d, z3.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "b.eq 5f\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #12\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - "ld1rqh z3.h, p6/z, [a_ptr1]\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - "ld1rqh z4.h, p6/z, [a_ptr2]\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1rqh z5.h, p6/z, [a_ptr3]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - ".inst 0x6461e4fb // bfmmla 
z27.s, z7.h, z1.h\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "trn1 z1.d, z4.d, z5.d\n" - "cbz %[blocks], 5f\n" - "trn2 z5.d, z4.d, z5.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n" - ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n" - ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n" - ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n" - ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n" - ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n" - ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n" - ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n" - ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n" - "b.eq 5f\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n" - ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n" - ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n" - ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n" - ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n" - ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n" - ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n" - ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n" - ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n" - "5:\n" - "ld1rw z18.s, p7/z, [%[minptr]]\n" - "ld1rw z19.s, p7/z, [%[maxptr]]\n" - "fmax z20.s, p7/m, z20.s, z18.s\n" - "fmax z21.s, p7/m, z21.s, z18.s\n" - "fmax z22.s, p7/m, z22.s, z18.s\n" - "fmax z23.s, p7/m, z23.s, z18.s\n" - "fmin z20.s, p7/m, z20.s, z19.s\n" - "fmin z21.s, p7/m, z21.s, z19.s\n" - "fmin z22.s, p7/m, z22.s, z19.s\n" - "fmin z23.s, p7/m, z23.s, z19.s\n" - "fmax z24.s, p7/m, z24.s, z18.s\n" - "uzp1 z0.s, z20.s, z21.s\n" - "uzp2 z1.s, z20.s, z21.s\n" - "uzp1 z2.s, z22.s, z23.s\n" - "uzp2 z3.s, z22.s, z23.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z24.s, p7/m, z24.s, z19.s\n" - "fmax z25.s, p7/m, z25.s, z18.s\n" - "fmax z26.s, p7/m, z26.s, z18.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z27.s, p7/m, z27.s, z18.s\n" - "fmax z28.s, p7/m, z28.s, z18.s\n" - "fmin z25.s, p7/m, z25.s, z19.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z19.s\n" - "fmin z27.s, p7/m, z27.s, z19.s\n" - "fmin z28.s, p7/m, z28.s, z19.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z24.s, z25.s\n" - "uzp2 z5.s, z24.s, z25.s\n" - "uzp1 z6.s, z26.s, z27.s\n" - "uzp2 z7.s, z26.s, z27.s\n" - "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmax z29.s, p7/m, z29.s, z18.s\n" - "addvl %[c_ptr0], %[c_ptr0], #3\n" - "fmax z30.s, p7/m, z30.s, z18.s\n" - "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n" 
- "fmax z31.s, p7/m, z31.s, z18.s\n" - "fmin z29.s, p7/m, z29.s, z19.s\n" - "fmin z30.s, p7/m, z30.s, z19.s\n" - "st1w z6.s, p0, [c_ptr2]\n" - "fmin z31.s, p7/m, z31.s, z19.s\n" - "uzp1 z8.s, z28.s, z29.s\n" - "uzp2 z9.s, z28.s, z29.s\n" - "st1w z7.s, p0, [c_ptr3]\n" - "uzp1 z10.s, z30.s, z31.s\n" - "uzp2 z11.s, z30.s, z31.s\n" - "st1w z8.s, p1, [c_ptr2, #1, MUL VL]\n" - "st1w z9.s, p1, [c_ptr3, #1, MUL VL]\n" - "st1w z10.s, p2, [c_ptr2, #2, MUL VL]\n" - "st1w z11.s, p2, [c_ptr3, #2, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - } - - } - } -} - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp deleted file mode 100644 index f25f7473cb..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-#include "../bfloat.hpp"
-#include "../std_transforms_sve.hpp"
-
-namespace arm_gemm
-{
-
-// Actual kernel implementations
-void sve_hybrid_bf16fp32_mmla_8VLx2(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool);
-
-class hybrid_bf16fp32_mmla_8VLx2
-{
-public:
-    typedef bfloat16 operand_type;
-    typedef float result_type;
-
-    typedef void (*kern_type)(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool);
-
-    /* Kernel blocking parameters */
-    static constexpr unsigned int out_height()
-    {
-        return 4;
-    }
-
-    static unsigned int out_width()
-    {
-        return get_vector_length<float>() * 4;
-    }
-
-    static constexpr unsigned int k_unroll()
-    {
-        return 4;
-    }
-
-    static constexpr bool supports_accumulate()
-    {
-        return true;
-    }
-
-    static constexpr bool supports_bias()
-    {
-        return true;
-    }
-
-    static constexpr bool supports_activation()
-    {
-        return true;
-    }
-
-    StdTransformsSVE transforms = {};
-
-    // Default to the generic kernel
-    kern_type kernel=sve_hybrid_bf16fp32_mmla_8VLx2;
-
-    hybrid_bf16fp32_mmla_8VLx2(const CPUInfo *)
-    {
-
-    }
-};
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp
deleted file mode 100644
index f38a2ea2e3..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp
+++ /dev/null
@@ -1,2001 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_bf16fp32_mmla_8VLx2(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
-    const int K_stride = ((K + 3) / 4) * 4;
-    const long loops_count = ((K + 8) / 16) - 1;
-    K -= loops_count * 16;
-    const long regs_count = (K / 8) - 1;
-    K -= (regs_count + 1) * 8;
-    const long leftovers = K;
-    const long blocks_count = (K + 3) / 4;
-    float nullbias[256];
-    if (!accumulate && !bias) {
-        memset(nullbias, 0, (4 * get_vector_length<float>() * sizeof(float)));
-    }
-    float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
-    float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
-    const float * const minptr = &minval;
-    const float * const maxptr = &maxval;
-
-    switch(act.type)
-    {
-        default:
-        case Activation::Type::None:
-            break;
-        case Activation::Type::BoundedReLU:
-            maxval = static_cast<float>(act.param1);
-            /* fall through */
-        case Activation::Type::ReLU:
-            minval = 0.0f;
-            break;
-    }
-
-    int rows_to_compute;
-
-    for (int y=0; y<M; y+=rows_to_compute) {
-        const bfloat16 * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(bfloat16);
-
-        float *c_ptr0 = C + (y * ldc);
-
-        rows_to_compute = M-y;
-        if (rows_to_compute > 4) {
-            if (rows_to_compute % 4) {
-                rows_to_compute = 4 - 1;
-            } else {
-                rows_to_compute = 4;
-            }
-        }
-
-        for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
-            const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
-            long loops = loops_count;
-            long regs = regs_count;
-            long temp = 0;
-            long blocks = blocks_count;
-            const bfloat16 *a_ptr0 = a_ptr0_base;
-            const bfloat16 *b_ptr0 = B + (K_stride * x0);
-            const unsigned long ldcb = ldc * sizeof(float);
-            const float *biasptr = bias ? bias+x0 : nullbias;
-
-            switch(rows_to_compute) {
-                case 1:
-                    __asm __volatile (
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[accumulate], 1f\n"
-                        "mov z1.h, #0\n"
-                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "zip1 z16.s, z15.s, z15.s\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip2 z17.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "zip1 z18.s, z15.s, z15.s\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        "zip2 z19.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                        "zip1 z20.s, z15.s, z15.s\n"
-                        "zip2 z21.s, z15.s, z15.s\n"
-                        "ld1w z15.s, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "zip1 z22.s, z15.s, z15.s\n"
-                        "zip2 z23.s, z15.s, z15.s\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "mov z14.s, #0\n"
-                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
-                        "mov z1.h, #0\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "zip1 z16.s, z13.s, z14.s\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip2 z17.s, z13.s, z14.s\n"
-                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                        "trn1 z4.d, z0.d, z1.d\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "mov z14.s,
#0\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "zip1 z18.s, z13.s, z14.s\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "mov z14.s, #0\n" - "zip1 z20.s, z13.s, z14.s\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "mov z14.s, #0\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z3.h, #0\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "trn2 z4.d, z2.d, z3.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "trn1 z0.d, z2.d, z3.d\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "mov z1.h, #0\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" 
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z3.h, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "trn1 z0.d, z2.d, z3.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z1.h, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - 
".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z3.h, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - "cbz %[blocks], 5f\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, 
[%[b_ptr0], #2, MUL VL]\n"
-                        ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                        ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                        ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                        ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
-                        ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
-                        ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
-                        ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
-                        ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
-                        "b.eq 5f\n"
-                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
-                        "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
-                        "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
-                        "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
-                        "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                        ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
-                        "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
-                        ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
-                        "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
-                        ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
-                        "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
-                        ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
-                        "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                        ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
-                        ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
-                        ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
-                        ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
-                        "5:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "uzp1 z0.s, z16.s, z17.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "uzp1 z1.s, z18.s, z19.s\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "st1w z0.s, p0, [%[c_ptr0]]\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "uzp1 z2.s, z20.s, z21.s\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "st1w z2.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "uzp1 z3.s, z22.s, z23.s\n"
-                        "st1w z3.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                    : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                    : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul
#1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1w z15.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "zip1 z20.s, z15.s, z15.s\n" - "zip2 z21.s, z15.s, z15.s\n" - "ld1w z15.s, p3/z, [%[biasptr], #3, MUL VL]\n" - "zip1 z22.s, z15.s, z15.s\n" - "zip2 z23.s, z15.s, z15.s\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z14.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "zip1 z20.s, z13.s, z14.s\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z14.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1rqh z3.h, p7/z, [a_ptr1]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "trn2 z4.d, z2.d, z3.d\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1rqh 
z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "trn1 z0.d, z2.d, z3.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1rqh z3.h, p7/z, [a_ptr1]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "trn2 z4.d, 
z2.d, z3.d\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "trn1 z0.d, z2.d, z3.d\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z13.h, p7/z, 
[%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1rqh z3.h, p6/z, [a_ptr1]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - "cbz %[blocks], 5f\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, 
z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "uzp1 z4.s, z20.s, z21.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp2 z5.s, z20.s, z21.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "uzp2 z7.s, z22.s, z23.s\n" - "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n" - "st1w z6.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "st1w z7.s, p3, [c_ptr1, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z3.h, #0\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1w z15.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "trn1 z5.d, z2.d, z3.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z24.d, z16.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z25.d, z17.d\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "zip1 z20.s, z15.s, z15.s\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "zip2 z21.s, z15.s, z15.s\n" - "ld1w z15.s, p3/z, [%[biasptr], #3, MUL VL]\n" - "mov z26.d, z18.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z27.d, z19.d\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "zip1 z22.s, z15.s, 
z15.s\n" - "zip2 z23.s, z15.s, z15.s\n" - "mov z28.d, z20.d\n" - "mov z29.d, z21.d\n" - "mov z30.d, z22.d\n" - "mov z31.d, z23.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "mov z3.h, #0\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "trn1 z5.d, z2.d, z3.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z14.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "zip1 z20.s, z13.s, z14.s\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z14.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "mov z14.s, #0\n" - "zip1 z24.s, z13.s, z14.s\n" - "zip2 z25.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "mov z14.s, #0\n" - "zip1 z26.s, z13.s, z14.s\n" - "zip2 z27.s, z13.s, z14.s\n" - "ld1w z13.s, p2/z, [c_ptr2, #2, MUL VL]\n" - "mov z14.s, #0\n" - "zip1 z28.s, z13.s, z14.s\n" - "zip2 z29.s, z13.s, z14.s\n" - "ld1w z13.s, p3/z, [c_ptr2, #3, MUL VL]\n" - "mov z14.s, #0\n" - "zip1 z30.s, z13.s, z14.s\n" - "zip2 z31.s, z13.s, z14.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "subs %[loops], %[loops], #0x1\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1rqh z3.h, p7/z, [a_ptr1]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1rqh z4.h, p7/z, [a_ptr2]\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z5.h, #0\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - 
"addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z1.d, z4.d, z5.d\n" - "trn2 z5.d, z4.d, z5.d\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z3.h, #0\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6465e55c // bfmmla 
z28.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z5.d, z2.d, z3.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1rqh z3.h, p7/z, [a_ptr1]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1rqh z4.h, p7/z, [a_ptr2]\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z5.h, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z1.d, z4.d, z5.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z5.d, z4.d, z5.d\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // 
bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "mov z3.h, #0\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "trn1 z5.d, z2.d, z3.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 
0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1rqh z3.h, p6/z, [a_ptr1]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1rqh z4.h, p6/z, [a_ptr2]\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z5.h, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "trn1 z1.d, z4.d, z5.d\n" - "cbz %[blocks], 5f\n" - "trn2 z5.d, z4.d, z5.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z7.h, p7/z, 
[%[b_ptr0], #1, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp2 z5.s, z20.s, z21.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - 
"uzp2 z7.s, z22.s, z23.s\n" - "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n" - "fmax z28.s, p7/m, z28.s, z14.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "st1w z6.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "uzp1 z8.s, z24.s, z25.s\n" - "st1w z7.s, p3, [c_ptr1, #3, MUL VL]\n" - "fmin z28.s, p7/m, z28.s, z15.s\n" - "uzp1 z9.s, z26.s, z27.s\n" - "fmax z29.s, p7/m, z29.s, z14.s\n" - "st1w z8.s, p0, [c_ptr2]\n" - "fmax z30.s, p7/m, z30.s, z14.s\n" - "fmax z31.s, p7/m, z31.s, z14.s\n" - "fmin z29.s, p7/m, z29.s, z15.s\n" - "st1w z9.s, p1, [c_ptr2, #1, MUL VL]\n" - "fmin z30.s, p7/m, z30.s, z15.s\n" - "fmin z31.s, p7/m, z31.s, z15.s\n" - "uzp1 z10.s, z28.s, z29.s\n" - "uzp1 z11.s, z30.s, z31.s\n" - "st1w z10.s, p2, [c_ptr2, #2, MUL VL]\n" - "st1w z11.s, p3, [c_ptr2, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1w z15.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "trn1 z5.d, z2.d, z3.d\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z24.d, z16.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z25.d, z17.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "zip1 z20.s, z15.s, z15.s\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "zip2 z21.s, z15.s, z15.s\n" - "ld1w z15.s, p3/z, [%[biasptr], #3, MUL VL]\n" - "mov z26.d, z18.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "mov z27.d, z19.d\n" - "ld1h z13.h, 
p7/z, [%[b_ptr0], #7, MUL VL]\n" - "mov z28.d, z20.d\n" - "add a_ptr3, a_ptr3, #0x10\n" - "zip1 z22.s, z15.s, z15.s\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "zip2 z23.s, z15.s, z15.s\n" - "mov z29.d, z21.d\n" - "mov z30.d, z22.d\n" - "mov z31.d, z23.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z4.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "trn1 z5.d, z2.d, z3.d\n" - "ld1w z14.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "zip1 z20.s, z13.s, z14.s\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z14.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "ld1w z14.s, p0/z, [c_ptr3]\n" - "zip1 z24.s, z13.s, z14.s\n" - "zip2 z25.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "zip1 z26.s, z13.s, z14.s\n" - "zip2 z27.s, z13.s, z14.s\n" - "ld1w z13.s, p2/z, [c_ptr2, #2, MUL VL]\n" - "ld1w z14.s, p2/z, [c_ptr3, #2, MUL VL]\n" - "zip1 z28.s, z13.s, z14.s\n" - "zip2 z29.s, z13.s, z14.s\n" - "ld1w z13.s, p3/z, [c_ptr2, #3, MUL VL]\n" - "ld1w z14.s, p3/z, [c_ptr3, #3, MUL VL]\n" - "zip1 z30.s, z13.s, z14.s\n" - "zip2 z31.s, z13.s, z14.s\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "subs %[loops], %[loops], #0x1\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1rqh z3.h, p7/z, [a_ptr1]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1rqh z4.h, p7/z, [a_ptr2]\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, 
z5.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1rqh z5.h, p7/z, [a_ptr3]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z1.d, z4.d, z5.d\n" - "trn2 z5.d, z4.d, z5.d\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL 
VL]\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z5.d, z2.d, z3.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1rqh z3.h, p7/z, [a_ptr1]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1rqh z4.h, p7/z, [a_ptr2]\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1rqh z5.h, p7/z, [a_ptr3]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z1.d, z4.d, z5.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z5.d, z4.d, z5.d\n" - "trn2 z4.d, z2.d, z3.d\n" - 
"ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "addvl a_ptr3, a_ptr3, #2\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "trn1 z4.d, z0.d, z1.d\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "trn1 z5.d, z2.d, z3.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - ".inst 
0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1rqh z3.h, p6/z, [a_ptr1]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - "ld1rqh z4.h, p6/z, [a_ptr2]\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1rqh z5.h, p6/z, [a_ptr3]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - "trn1 z0.d, z2.d, z3.d\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - ".inst 0x6461e51a // 
bfmmla z26.s, z8.h, z1.h\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "trn1 z1.d, z4.d, z5.d\n" - "cbz %[blocks], 5f\n" - "trn2 z5.d, z4.d, z5.d\n" - "ld1h z6.h, p7/z, [%[b_ptr0]]\n" - "trn2 z4.d, z2.d, z3.d\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n" - ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n" - ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n" - ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n" - ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n" - ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n" - ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n" - ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n" - ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n" - ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n" - ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n" - ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n" - ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n" - ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n" - ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n" - ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n" - ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n" - ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n" - ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n" - ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n" - ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n" - ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, 
z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp2 z5.s, z20.s, z21.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "uzp2 z7.s, z22.s, z23.s\n" - "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n" - "fmax z28.s, p7/m, z28.s, z14.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "st1w z6.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "uzp1 z8.s, z24.s, z25.s\n" - "st1w z7.s, p3, [c_ptr1, #3, MUL VL]\n" - "uzp2 z9.s, z24.s, z25.s\n" - "uzp1 z10.s, z26.s, z27.s\n" - "uzp2 z11.s, z26.s, z27.s\n" - "st1w z8.s, p0, [c_ptr2]\n" - "fmin z28.s, p7/m, z28.s, z15.s\n" - "fmax z29.s, p7/m, z29.s, z14.s\n" - "fmax z30.s, p7/m, z30.s, z14.s\n" - "st1w z9.s, p0, [c_ptr3]\n" - "fmax z31.s, p7/m, z31.s, z14.s\n" - "fmin z29.s, p7/m, z29.s, z15.s\n" - "st1w z10.s, p1, [c_ptr2, #1, MUL VL]\n" - "fmin z30.s, p7/m, z30.s, z15.s\n" - "fmin z31.s, p7/m, z31.s, z15.s\n" - "uzp1 z12.s, z28.s, z29.s\n" - "st1w z11.s, p1, [c_ptr3, #1, MUL VL]\n" - "uzp2 z13.s, z28.s, z29.s\n" - "uzp1 z14.s, z30.s, z31.s\n" - "uzp2 z15.s, z30.s, z31.s\n" - "st1w z12.s, p2, [c_ptr2, #2, MUL VL]\n" - "st1w z13.s, p2, [c_ptr3, #2, MUL VL]\n" - "st1w z14.s, p3, [c_ptr2, #3, MUL VL]\n" - "st1w z15.s, p3, [c_ptr3, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - } - - } - } -} - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp deleted file mode 100644 index 7610a20ac0..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp +++ /dev/null @@ -1,3778 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef __ARM_FEATURE_SVE - -#include <algorithm> - -#include "arm_gemm.hpp" - -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void sve_hybrid_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, __fp16 *C, int ldc, int M, int N, int K, const __fp16 *bias, Activation act, bool accumulate) { - const int K_stride = K; - const long loops_count = ((K + 8) / 16) - 1; - K -= loops_count * 16; - const long regs_count = (K / 8) - 1; - K -= (regs_count + 1) * 8; - const long leftovers = K; - __fp16 nullbias[512]; - if (!accumulate && !bias) { - memset(nullbias, 0, (4 * get_vector_length<__fp16>() * sizeof(__fp16))); - } - __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity()); - __fp16 maxval = static_cast<__fp16>(std::numeric_limits<float>::infinity()); - const __fp16 * const minptr = &minval; - const __fp16 * const maxptr = &maxval; - - switch(act.type) - { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - maxval = static_cast<__fp16>(act.param1); - /* fall through */ - case Activation::Type::ReLU: - minval = 0.0f; - break; - } - - int rows_to_compute; - - for (int y=0; y<M; y+=rows_to_compute) { - const __fp16 * const a_ptr0_base = A + (y * lda); - const unsigned long ldab = lda * sizeof(__fp16); - - __fp16 *c_ptr0 = C + (y * ldc); - - rows_to_compute = M-y; - if (rows_to_compute > 4) { - if (rows_to_compute % 4) { - rows_to_compute = 4 - 1; - } else { - rows_to_compute = 4; - } - } - - for (int x0=0; x0<N; x0+=(4 * get_vector_length<__fp16>())) { - const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<__fp16>())); - long loops = loops_count; - long regs = regs_count; - long temp = 0; - long blocks = leftovers; - const __fp16 *a_ptr0 = a_ptr0_base; - const __fp16 *b_ptr0 = B + (K_stride * x0); - const unsigned long ldcb = ldc * sizeof(__fp16); - const __fp16 *biasptr = bias ?
bias+x0 : nullbias; - - switch(rows_to_compute) { - case 1: - __asm __volatile ( - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.h, %[temp], %[width]\n" - "inch %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.h, %[temp], %[width]\n" - "inch %[temp], all, mul #1\n" - "whilelt p2.h, %[temp], %[width]\n" - "inch %[temp], all, mul #1\n" - "whilelt p3.h, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1h z16.h, p0/z, [%[biasptr]]\n" - "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n" - "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1h z19.h, p3/z, [%[biasptr], #3, MUL VL]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1h z16.h, p0/z, [%[c_ptr0]]\n" - "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, 
MUL VL]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[7]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z17.h, z13.h, z0.h[7]\n" - "fmla z18.h, z14.h, z0.h[7]\n" - "fmla z19.h, z15.h, z0.h[7]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[0]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[0]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[1]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - "fmla z17.h, z13.h, z4.h[1]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z18.h, z14.h, z4.h[1]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[1]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[2]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[2]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[2]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[2]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z17.h, z13.h, z4.h[3]\n" - "fmla z18.h, z14.h, z4.h[3]\n" - "fmla z19.h, z15.h, z4.h[3]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[4]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[4]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[4]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[4]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z13.h, z4.h[5]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[5]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[5]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[6]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[6]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[6]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[6]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[7]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z17.h, z13.h, z4.h[7]\n" - "fmla z18.h, z14.h, z4.h[7]\n" - "fmla z19.h, z15.h, z4.h[7]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, 
MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[7]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z17.h, z13.h, z0.h[7]\n" - "fmla z18.h, z14.h, z0.h[7]\n" - "fmla z19.h, z15.h, z0.h[7]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[0]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[0]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[1]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z13.h, z4.h[1]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[1]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[1]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[2]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[2]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[2]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[2]\n" - 
"ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z17.h, z13.h, z4.h[3]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z18.h, z14.h, z4.h[3]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "fmla z19.h, z15.h, z4.h[3]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[4]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[4]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[4]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[4]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z13.h, z4.h[5]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[5]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[5]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[6]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[6]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[6]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[6]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[7]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z17.h, z13.h, z4.h[7]\n" - "fmla z18.h, z14.h, z4.h[7]\n" - "fmla z19.h, z15.h, z4.h[7]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "b.eq 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, 
[%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "b 5f\n" - "4:\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[7]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z17.h, z13.h, z0.h[7]\n" - "fmla z18.h, z14.h, z0.h[7]\n" - "fmla z19.h, z15.h, z0.h[7]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[0]\n" - "fmla z18.h, z10.h, z4.h[0]\n" - "fmla z19.h, z11.h, z4.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs 
%[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[1]\n" - "fmla z18.h, z14.h, z4.h[1]\n" - "fmla z19.h, z15.h, z4.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[2]\n" - "fmla z17.h, z9.h, z4.h[2]\n" - "fmla z18.h, z10.h, z4.h[2]\n" - "fmla z19.h, z11.h, z4.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[3]\n" - "fmla z18.h, z14.h, z4.h[3]\n" - "fmla z19.h, z15.h, z4.h[3]\n" - "b.eq 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[4]\n" - "fmla z18.h, z10.h, z4.h[4]\n" - "fmla z19.h, z11.h, z4.h[4]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[5]\n" - "fmla z18.h, z14.h, z4.h[5]\n" - "fmla z19.h, z15.h, z4.h[5]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[6]\n" - "fmla z17.h, z9.h, z4.h[6]\n" - "fmla z18.h, z10.h, z4.h[6]\n" - "fmla z19.h, z11.h, z4.h[6]\n" - "5:\n" - "ld1rh z14.h, p7/z, [%[minptr]]\n" - "ld1rh z15.h, p7/z, [%[maxptr]]\n" - "fmax z16.h, p7/m, z16.h, z14.h\n" - "fmax z17.h, p7/m, z17.h, z14.h\n" - "fmax z18.h, p7/m, z18.h, z14.h\n" - "fmax z19.h, p7/m, z19.h, z14.h\n" - "fmin z16.h, p7/m, z16.h, z15.h\n" - "fmin z17.h, p7/m, z17.h, z15.h\n" - "fmin z18.h, p7/m, z18.h, z15.h\n" - "fmin z19.h, p7/m, z19.h, z15.h\n" - "st1h z16.h, p0, [%[c_ptr0]]\n" - "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n" - "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n" - "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0],
%[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.h, %[temp], %[width]\n" - "inch %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.h, %[temp], %[width]\n" - "inch %[temp], all, mul #1\n" - "whilelt p2.h, %[temp], %[width]\n" - "inch %[temp], all, mul #1\n" - "whilelt p3.h, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1h z16.h, p0/z, [%[biasptr]]\n" - "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n" - "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1h z19.h, p3/z, [%[biasptr], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "mov z21.d, z17.d\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "mov z22.d, z18.d\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "mov z23.d, z19.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1h z16.h, p0/z, [%[c_ptr0]]\n" - "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1h z20.h, p0/z, [c_ptr1]\n" - "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z22.h, z10.h, z1.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla z23.h, z11.h, z1.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla z20.h, z12.h, z1.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla z21.h, z13.h, z1.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z22.h, z14.h, z1.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "fmla z23.h, z15.h, z1.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[2]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "fmla z21.h, z9.h, z1.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z22.h, z10.h, z1.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla 
z19.h, z11.h, z0.h[2]\n" - "fmla z23.h, z11.h, z1.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "fmla z20.h, z12.h, z1.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z21.h, z13.h, z1.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla z22.h, z14.h, z1.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "fmla z23.h, z15.h, z1.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "fmla z20.h, z8.h, z1.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z21.h, z9.h, z1.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z22.h, z10.h, z1.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "fmla z23.h, z11.h, z1.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "fmla z20.h, z12.h, z1.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z21.h, z13.h, z1.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z22.h, z14.h, z1.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "fmla z23.h, z15.h, z1.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[6]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "fmla z21.h, z9.h, z1.h[6]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z22.h, z10.h, z1.h[6]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "fmla z23.h, z11.h, z1.h[6]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[7]\n" - "fmla z20.h, z12.h, z1.h[7]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[7]\n" - "fmla z21.h, z13.h, z1.h[7]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[7]\n" - "fmla z22.h, z14.h, z1.h[7]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[7]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - "fmla z23.h, z15.h, z1.h[7]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - "fmla z20.h, z8.h, z5.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[0]\n" - "fmla z21.h, z9.h, z5.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[0]\n" - "fmla z22.h, z10.h, z5.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[0]\n" - "fmla z23.h, z11.h, z5.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[1]\n" - "fmla z20.h, z12.h, z5.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[1]\n" - "fmla z21.h, z13.h, z5.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[1]\n" - "fmla z22.h, z14.h, z5.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[1]\n" - "fmla z23.h, z15.h, z5.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z5.h[2]\n" - "fmla z17.h, z9.h, z4.h[2]\n" - "fmla z21.h, z9.h, 
z5.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z22.h, z10.h, z5.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[2]\n" - "fmla z23.h, z11.h, z5.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" - "fmla z20.h, z12.h, z5.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[3]\n" - "fmla z21.h, z13.h, z5.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[3]\n" - "fmla z22.h, z14.h, z5.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[3]\n" - "fmla z23.h, z15.h, z5.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[4]\n" - "fmla z20.h, z8.h, z5.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[4]\n" - "fmla z21.h, z9.h, z5.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[4]\n" - "fmla z22.h, z10.h, z5.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[4]\n" - "fmla z23.h, z11.h, z5.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" - "fmla z20.h, z12.h, z5.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[5]\n" - "fmla z21.h, z13.h, z5.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[5]\n" - "fmla z22.h, z14.h, z5.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[5]\n" - "fmla z23.h, z15.h, z5.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z5.h[6]\n" - "fmla z17.h, z9.h, z4.h[6]\n" - "fmla z21.h, z9.h, z5.h[6]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[6]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z22.h, z10.h, z5.h[6]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[6]\n" - "fmla z23.h, z11.h, z5.h[6]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[7]\n" - "fmla z20.h, z12.h, z5.h[7]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[7]\n" - "fmla z21.h, z13.h, z5.h[7]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[7]\n" - "fmla z22.h, z14.h, z5.h[7]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[7]\n" - "fmla z23.h, z15.h, z5.h[7]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z22.h, z10.h, z1.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "fmla z23.h, z11.h, z1.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "fmla z20.h, z12.h, z1.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "fmla z21.h, z13.h, z1.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z22.h, z14.h, z1.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, 
z15.h, z0.h[1]\n" - "fmla z23.h, z15.h, z1.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[2]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "fmla z21.h, z9.h, z1.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z22.h, z10.h, z1.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "fmla z23.h, z11.h, z1.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "fmla z20.h, z12.h, z1.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z21.h, z13.h, z1.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla z22.h, z14.h, z1.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "fmla z23.h, z15.h, z1.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "fmla z20.h, z8.h, z1.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z21.h, z9.h, z1.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z22.h, z10.h, z1.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "fmla z23.h, z11.h, z1.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "fmla z20.h, z12.h, z1.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z21.h, z13.h, z1.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z22.h, z14.h, z1.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "fmla z23.h, z15.h, z1.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[6]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "fmla z21.h, z9.h, z1.h[6]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z22.h, z10.h, z1.h[6]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "fmla z23.h, z11.h, z1.h[6]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[7]\n" - "fmla z20.h, z12.h, z1.h[7]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[7]\n" - "fmla z21.h, z13.h, z1.h[7]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[7]\n" - "fmla z22.h, z14.h, z1.h[7]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[7]\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z23.h, z15.h, z1.h[7]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - "fmla z20.h, z8.h, z5.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "fmla z21.h, z9.h, z5.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - "fmla z22.h, z10.h, z5.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[0]\n" - "fmla z23.h, z11.h, z5.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[1]\n" - "fmla z20.h, z12.h, z5.h[1]\n" 
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[1]\n" - "fmla z21.h, z13.h, z5.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[1]\n" - "fmla z22.h, z14.h, z5.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[1]\n" - "fmla z23.h, z15.h, z5.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z5.h[2]\n" - "fmla z17.h, z9.h, z4.h[2]\n" - "fmla z21.h, z9.h, z5.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z22.h, z10.h, z5.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[2]\n" - "fmla z23.h, z11.h, z5.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" - "fmla z20.h, z12.h, z5.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[3]\n" - "fmla z21.h, z13.h, z5.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[3]\n" - "fmla z22.h, z14.h, z5.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[3]\n" - "fmla z23.h, z15.h, z5.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[4]\n" - "fmla z20.h, z8.h, z5.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[4]\n" - "fmla z21.h, z9.h, z5.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[4]\n" - "fmla z22.h, z10.h, z5.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[4]\n" - "fmla z23.h, z11.h, z5.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" - "fmla z20.h, z12.h, z5.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[5]\n" - "fmla z21.h, z13.h, z5.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[5]\n" - "fmla z22.h, z14.h, z5.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[5]\n" - "fmla z23.h, z15.h, z5.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z20.h, z8.h, z5.h[6]\n" - "fmla z17.h, z9.h, z4.h[6]\n" - "fmla z21.h, z9.h, z5.h[6]\n" - "fmla z18.h, z10.h, z4.h[6]\n" - "fmla z22.h, z10.h, z5.h[6]\n" - "fmla z19.h, z11.h, z4.h[6]\n" - "fmla z23.h, z11.h, z5.h[6]\n" - "fmla z16.h, z12.h, z4.h[7]\n" - "fmla z20.h, z12.h, z5.h[7]\n" - "fmla z17.h, z13.h, z4.h[7]\n" - "fmla z21.h, z13.h, z5.h[7]\n" - "fmla z18.h, z14.h, z4.h[7]\n" - "fmla z22.h, z14.h, z5.h[7]\n" - "fmla z19.h, z15.h, z4.h[7]\n" - "fmla z23.h, z15.h, z5.h[7]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[0]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "fmla z21.h, z9.h, z1.h[0]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "fmla z22.h, z10.h, z1.h[0]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "fmla z23.h, z11.h, z1.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "ld1h z15.h, p7/z, 
[%[b_ptr0], #7, MUL VL]\n" - "fmla z20.h, z12.h, z1.h[1]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "fmla z21.h, z13.h, z1.h[1]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z22.h, z14.h, z1.h[1]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "fmla z23.h, z15.h, z1.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "fmla z20.h, z8.h, z1.h[2]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "fmla z21.h, z9.h, z1.h[2]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "fmla z22.h, z10.h, z1.h[2]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "fmla z23.h, z11.h, z1.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z12.h, z1.h[3]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z21.h, z13.h, z1.h[3]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla z22.h, z14.h, z1.h[3]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "fmla z23.h, z15.h, z1.h[3]\n" - "b.eq 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[4]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z21.h, z9.h, z1.h[4]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z22.h, z10.h, z1.h[4]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "fmla z23.h, z11.h, z1.h[4]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.h, z12.h, z1.h[5]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z21.h, z13.h, z1.h[5]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z22.h, z14.h, z1.h[5]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "fmla z23.h, z15.h, z1.h[5]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "fmla z20.h, z8.h, z1.h[6]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "fmla z21.h, z9.h, z1.h[6]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "fmla z22.h, z10.h, z1.h[6]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "fmla z23.h, z11.h, z1.h[6]\n" - "b 5f\n" - "4:\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - "fmla z22.h, z10.h, z1.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "fmla z23.h, z11.h, z1.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "addvl a_ptr1, a_ptr1, #1\n" - "fmla z20.h, z12.h, z1.h[1]\n" - "ld1h z12.h, p7/z, 
[%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "fmla z21.h, z13.h, z1.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z22.h, z14.h, z1.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "fmla z23.h, z15.h, z1.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[2]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "fmla z21.h, z9.h, z1.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z22.h, z10.h, z1.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "fmla z23.h, z11.h, z1.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "fmla z20.h, z12.h, z1.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z21.h, z13.h, z1.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla z22.h, z14.h, z1.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "fmla z23.h, z15.h, z1.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "fmla z20.h, z8.h, z1.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z21.h, z9.h, z1.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z22.h, z10.h, z1.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "fmla z23.h, z11.h, z1.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "fmla z20.h, z12.h, z1.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z21.h, z13.h, z1.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z22.h, z14.h, z1.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "fmla z23.h, z15.h, z1.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z20.h, z8.h, z1.h[6]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "fmla z21.h, z9.h, z1.h[6]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "fmla z22.h, z10.h, z1.h[6]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "fmla z23.h, z11.h, z1.h[6]\n" - "fmla z16.h, z12.h, z0.h[7]\n" - "fmla z20.h, z12.h, z1.h[7]\n" - "fmla z17.h, z13.h, z0.h[7]\n" - "fmla z21.h, z13.h, z1.h[7]\n" - "fmla z18.h, z14.h, z0.h[7]\n" - "fmla z22.h, z14.h, z1.h[7]\n" - "fmla z19.h, z15.h, z0.h[7]\n" - "fmla z23.h, z15.h, z1.h[7]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.h, z8.h, z5.h[0]\n" - "fmla z17.h, z9.h, z4.h[0]\n" - "fmla z21.h, z9.h, z5.h[0]\n" - "fmla z18.h, z10.h, z4.h[0]\n" - "fmla z22.h, z10.h, z5.h[0]\n" - "fmla z19.h, z11.h, z4.h[0]\n" - "fmla z23.h, z11.h, z5.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - 
"fmla z20.h, z12.h, z5.h[1]\n" - "fmla z17.h, z13.h, z4.h[1]\n" - "fmla z21.h, z13.h, z5.h[1]\n" - "fmla z18.h, z14.h, z4.h[1]\n" - "fmla z22.h, z14.h, z5.h[1]\n" - "fmla z19.h, z15.h, z4.h[1]\n" - "fmla z23.h, z15.h, z5.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[2]\n" - "fmla z20.h, z8.h, z5.h[2]\n" - "fmla z17.h, z9.h, z4.h[2]\n" - "fmla z21.h, z9.h, z5.h[2]\n" - "fmla z18.h, z10.h, z4.h[2]\n" - "fmla z22.h, z10.h, z5.h[2]\n" - "fmla z19.h, z11.h, z4.h[2]\n" - "fmla z23.h, z11.h, z5.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z12.h, z5.h[3]\n" - "fmla z17.h, z13.h, z4.h[3]\n" - "fmla z21.h, z13.h, z5.h[3]\n" - "fmla z18.h, z14.h, z4.h[3]\n" - "fmla z22.h, z14.h, z5.h[3]\n" - "fmla z19.h, z15.h, z4.h[3]\n" - "fmla z23.h, z15.h, z5.h[3]\n" - "b.eq 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.h, z8.h, z5.h[4]\n" - "fmla z17.h, z9.h, z4.h[4]\n" - "fmla z21.h, z9.h, z5.h[4]\n" - "fmla z18.h, z10.h, z4.h[4]\n" - "fmla z22.h, z10.h, z5.h[4]\n" - "fmla z19.h, z11.h, z4.h[4]\n" - "fmla z23.h, z11.h, z5.h[4]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.h, z12.h, z5.h[5]\n" - "fmla z17.h, z13.h, z4.h[5]\n" - "fmla z21.h, z13.h, z5.h[5]\n" - "fmla z18.h, z14.h, z4.h[5]\n" - "fmla z22.h, z14.h, z5.h[5]\n" - "fmla z19.h, z15.h, z4.h[5]\n" - "fmla z23.h, z15.h, z5.h[5]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[6]\n" - "fmla z20.h, z8.h, z5.h[6]\n" - "fmla z17.h, z9.h, z4.h[6]\n" - "fmla z21.h, z9.h, z5.h[6]\n" - "fmla z18.h, z10.h, z4.h[6]\n" - "fmla z22.h, z10.h, z5.h[6]\n" - "fmla z19.h, z11.h, z4.h[6]\n" - "fmla z23.h, z11.h, z5.h[6]\n" - "5:\n" - "ld1rh z14.h, p7/z, [%[minptr]]\n" - "ld1rh z15.h, p7/z, [%[maxptr]]\n" - "fmax z16.h, p7/m, z16.h, z14.h\n" - "fmax z17.h, p7/m, z17.h, z14.h\n" - "fmax z18.h, p7/m, z18.h, z14.h\n" - "fmax z19.h, p7/m, z19.h, z14.h\n" - "fmin z16.h, p7/m, z16.h, z15.h\n" - "fmin z17.h, p7/m, z17.h, z15.h\n" - "fmin z18.h, p7/m, z18.h, z15.h\n" - "fmin z19.h, p7/m, z19.h, z15.h\n" - "st1h z16.h, p0, [%[c_ptr0]]\n" - "fmax z20.h, p7/m, z20.h, z14.h\n" - "fmax z21.h, p7/m, z21.h, z14.h\n" - "fmax z22.h, p7/m, z22.h, z14.h\n" - "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmax z23.h, p7/m, z23.h, z14.h\n" - "fmin z20.h, p7/m, z20.h, z15.h\n" - "fmin z21.h, p7/m, z21.h, z15.h\n" - "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmin z22.h, p7/m, z22.h, z15.h\n" - 
"fmin z23.h, p7/m, z23.h, z15.h\n" - "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "st1h z20.h, p0, [c_ptr1]\n" - "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n" - "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n" - "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.h, %[temp], %[width]\n" - "inch %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.h, %[temp], %[width]\n" - "inch %[temp], all, mul #1\n" - "whilelt p2.h, %[temp], %[width]\n" - "inch %[temp], all, mul #1\n" - "whilelt p3.h, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1h z16.h, p0/z, [%[biasptr]]\n" - "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n" - "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1h z19.h, p3/z, [%[biasptr], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "mov z21.d, z17.d\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "mov z22.d, z18.d\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "mov z23.d, z19.d\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "mov z24.d, z16.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z25.d, z17.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z26.d, z18.d\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z27.d, z19.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1h z16.h, p0/z, [%[c_ptr0]]\n" - "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1h z20.h, p0/z, [c_ptr1]\n" - "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1h z24.h, p0/z, [c_ptr2]\n" - "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n" - "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - 
"addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "fmla z24.h, z8.h, z2.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z25.h, z9.h, z2.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla z22.h, z10.h, z1.h[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla z26.h, z10.h, z2.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla z23.h, z11.h, z1.h[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla z27.h, z11.h, z2.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "fmla z20.h, z12.h, z1.h[1]\n" - "fmla z24.h, z12.h, z2.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "fmla z21.h, z13.h, z1.h[1]\n" - "fmla z25.h, z13.h, z2.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z22.h, z14.h, z1.h[1]\n" - "fmla z26.h, z14.h, z2.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "fmla z23.h, z15.h, z1.h[1]\n" - "fmla z27.h, z15.h, z2.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[2]\n" - "fmla z24.h, z8.h, z2.h[2]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z21.h, z9.h, z1.h[2]\n" - "fmla z25.h, z9.h, z2.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "fmla z22.h, z10.h, z1.h[2]\n" - "fmla z26.h, z10.h, z2.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "fmla z23.h, z11.h, z1.h[2]\n" - "fmla z27.h, z11.h, z2.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "fmla z20.h, z12.h, z1.h[3]\n" - "fmla z24.h, z12.h, z2.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z21.h, z13.h, z1.h[3]\n" - "fmla z25.h, z13.h, z2.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla z22.h, z14.h, z1.h[3]\n" - "fmla z26.h, z14.h, z2.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "fmla z23.h, z15.h, z1.h[3]\n" - "fmla z27.h, z15.h, z2.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "fmla z20.h, z8.h, z1.h[4]\n" - "fmla z24.h, z8.h, z2.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z21.h, z9.h, z1.h[4]\n" - "fmla z25.h, z9.h, z2.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z22.h, z10.h, z1.h[4]\n" - "fmla z26.h, z10.h, z2.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "fmla z23.h, z11.h, z1.h[4]\n" - "fmla z27.h, z11.h, z2.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "fmla z20.h, z12.h, z1.h[5]\n" - "fmla z24.h, z12.h, z2.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z21.h, z13.h, z1.h[5]\n" - "fmla z25.h, z13.h, z2.h[5]\n" - "ld1h 
z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z22.h, z14.h, z1.h[5]\n" - "fmla z26.h, z14.h, z2.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "fmla z23.h, z15.h, z1.h[5]\n" - "fmla z27.h, z15.h, z2.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[6]\n" - "fmla z24.h, z8.h, z2.h[6]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z21.h, z9.h, z1.h[6]\n" - "fmla z25.h, z9.h, z2.h[6]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "fmla z22.h, z10.h, z1.h[6]\n" - "fmla z26.h, z10.h, z2.h[6]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "fmla z23.h, z11.h, z1.h[6]\n" - "fmla z27.h, z11.h, z2.h[6]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[7]\n" - "fmla z20.h, z12.h, z1.h[7]\n" - "fmla z24.h, z12.h, z2.h[7]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[7]\n" - "fmla z21.h, z13.h, z1.h[7]\n" - "fmla z25.h, z13.h, z2.h[7]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[7]\n" - "fmla z22.h, z14.h, z1.h[7]\n" - "fmla z26.h, z14.h, z2.h[7]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[7]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - "fmla z23.h, z15.h, z1.h[7]\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - "fmla z27.h, z15.h, z2.h[7]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - "fmla z20.h, z8.h, z5.h[0]\n" - "fmla z24.h, z8.h, z6.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[0]\n" - "fmla z21.h, z9.h, z5.h[0]\n" - "fmla z25.h, z9.h, z6.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[0]\n" - "fmla z22.h, z10.h, z5.h[0]\n" - "fmla z26.h, z10.h, z6.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[0]\n" - "fmla z23.h, z11.h, z5.h[0]\n" - "fmla z27.h, z11.h, z6.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[1]\n" - "fmla z20.h, z12.h, z5.h[1]\n" - "fmla z24.h, z12.h, z6.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[1]\n" - "fmla z21.h, z13.h, z5.h[1]\n" - "fmla z25.h, z13.h, z6.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[1]\n" - "fmla z22.h, z14.h, z5.h[1]\n" - "fmla z26.h, z14.h, z6.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[1]\n" - "fmla z23.h, z15.h, z5.h[1]\n" - "fmla z27.h, z15.h, z6.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z5.h[2]\n" - "fmla z24.h, z8.h, z6.h[2]\n" - "fmla z17.h, z9.h, z4.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z21.h, z9.h, z5.h[2]\n" - "fmla z25.h, z9.h, z6.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[2]\n" - "fmla z22.h, z10.h, z5.h[2]\n" - "fmla z26.h, z10.h, z6.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[2]\n" - "fmla z23.h, z11.h, z5.h[2]\n" - "fmla z27.h, z11.h, z6.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" - "fmla z20.h, z12.h, z5.h[3]\n" - "fmla z24.h, z12.h, z6.h[3]\n" 
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[3]\n" - "fmla z21.h, z13.h, z5.h[3]\n" - "fmla z25.h, z13.h, z6.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[3]\n" - "fmla z22.h, z14.h, z5.h[3]\n" - "fmla z26.h, z14.h, z6.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[3]\n" - "fmla z23.h, z15.h, z5.h[3]\n" - "fmla z27.h, z15.h, z6.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[4]\n" - "fmla z20.h, z8.h, z5.h[4]\n" - "fmla z24.h, z8.h, z6.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[4]\n" - "fmla z21.h, z9.h, z5.h[4]\n" - "fmla z25.h, z9.h, z6.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[4]\n" - "fmla z22.h, z10.h, z5.h[4]\n" - "fmla z26.h, z10.h, z6.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[4]\n" - "fmla z23.h, z11.h, z5.h[4]\n" - "fmla z27.h, z11.h, z6.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" - "fmla z20.h, z12.h, z5.h[5]\n" - "fmla z24.h, z12.h, z6.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[5]\n" - "fmla z21.h, z13.h, z5.h[5]\n" - "fmla z25.h, z13.h, z6.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[5]\n" - "fmla z22.h, z14.h, z5.h[5]\n" - "fmla z26.h, z14.h, z6.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[5]\n" - "fmla z23.h, z15.h, z5.h[5]\n" - "fmla z27.h, z15.h, z6.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z5.h[6]\n" - "fmla z24.h, z8.h, z6.h[6]\n" - "fmla z17.h, z9.h, z4.h[6]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z21.h, z9.h, z5.h[6]\n" - "fmla z25.h, z9.h, z6.h[6]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[6]\n" - "fmla z22.h, z10.h, z5.h[6]\n" - "fmla z26.h, z10.h, z6.h[6]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[6]\n" - "fmla z23.h, z11.h, z5.h[6]\n" - "fmla z27.h, z11.h, z6.h[6]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[7]\n" - "fmla z20.h, z12.h, z5.h[7]\n" - "fmla z24.h, z12.h, z6.h[7]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[7]\n" - "fmla z21.h, z13.h, z5.h[7]\n" - "fmla z25.h, z13.h, z6.h[7]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[7]\n" - "fmla z22.h, z14.h, z5.h[7]\n" - "fmla z26.h, z14.h, z6.h[7]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[7]\n" - "fmla z23.h, z15.h, z5.h[7]\n" - "fmla z27.h, z15.h, z6.h[7]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "fmla z24.h, z8.h, z2.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z25.h, z9.h, z2.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "fmla z22.h, z10.h, z1.h[0]\n" - "fmla z26.h, z10.h, z2.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "fmla z23.h, z11.h, z1.h[0]\n" - "fmla z27.h, z11.h, z2.h[0]\n" - "ld1h z11.h, p7/z, 
[%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "fmla z20.h, z12.h, z1.h[1]\n" - "fmla z24.h, z12.h, z2.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "fmla z21.h, z13.h, z1.h[1]\n" - "fmla z25.h, z13.h, z2.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z22.h, z14.h, z1.h[1]\n" - "fmla z26.h, z14.h, z2.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "fmla z23.h, z15.h, z1.h[1]\n" - "fmla z27.h, z15.h, z2.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[2]\n" - "fmla z24.h, z8.h, z2.h[2]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z21.h, z9.h, z1.h[2]\n" - "fmla z25.h, z9.h, z2.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "fmla z22.h, z10.h, z1.h[2]\n" - "fmla z26.h, z10.h, z2.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "fmla z23.h, z11.h, z1.h[2]\n" - "fmla z27.h, z11.h, z2.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "fmla z20.h, z12.h, z1.h[3]\n" - "fmla z24.h, z12.h, z2.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z21.h, z13.h, z1.h[3]\n" - "fmla z25.h, z13.h, z2.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla z22.h, z14.h, z1.h[3]\n" - "fmla z26.h, z14.h, z2.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "fmla z23.h, z15.h, z1.h[3]\n" - "fmla z27.h, z15.h, z2.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "fmla z20.h, z8.h, z1.h[4]\n" - "fmla z24.h, z8.h, z2.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z21.h, z9.h, z1.h[4]\n" - "fmla z25.h, z9.h, z2.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z22.h, z10.h, z1.h[4]\n" - "fmla z26.h, z10.h, z2.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "fmla z23.h, z11.h, z1.h[4]\n" - "fmla z27.h, z11.h, z2.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "fmla z20.h, z12.h, z1.h[5]\n" - "fmla z24.h, z12.h, z2.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z21.h, z13.h, z1.h[5]\n" - "fmla z25.h, z13.h, z2.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z22.h, z14.h, z1.h[5]\n" - "fmla z26.h, z14.h, z2.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "fmla z23.h, z15.h, z1.h[5]\n" - "fmla z27.h, z15.h, z2.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[6]\n" - "fmla z24.h, z8.h, z2.h[6]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z21.h, z9.h, z1.h[6]\n" - "fmla z25.h, z9.h, z2.h[6]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "fmla z22.h, z10.h, z1.h[6]\n" - "fmla z26.h, z10.h, z2.h[6]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "fmla z23.h, z11.h, z1.h[6]\n" - "fmla z27.h, z11.h, 
z2.h[6]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[7]\n" - "fmla z20.h, z12.h, z1.h[7]\n" - "fmla z24.h, z12.h, z2.h[7]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[7]\n" - "fmla z21.h, z13.h, z1.h[7]\n" - "fmla z25.h, z13.h, z2.h[7]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[7]\n" - "fmla z22.h, z14.h, z1.h[7]\n" - "fmla z26.h, z14.h, z2.h[7]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[7]\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z23.h, z15.h, z1.h[7]\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - "fmla z27.h, z15.h, z2.h[7]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - "fmla z20.h, z8.h, z5.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "fmla z24.h, z8.h, z6.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - "fmla z21.h, z9.h, z5.h[0]\n" - "addvl a_ptr2, a_ptr2, #2\n" - "fmla z25.h, z9.h, z6.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[0]\n" - "fmla z22.h, z10.h, z5.h[0]\n" - "fmla z26.h, z10.h, z6.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[0]\n" - "fmla z23.h, z11.h, z5.h[0]\n" - "fmla z27.h, z11.h, z6.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[1]\n" - "fmla z20.h, z12.h, z5.h[1]\n" - "fmla z24.h, z12.h, z6.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[1]\n" - "fmla z21.h, z13.h, z5.h[1]\n" - "fmla z25.h, z13.h, z6.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[1]\n" - "fmla z22.h, z14.h, z5.h[1]\n" - "fmla z26.h, z14.h, z6.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[1]\n" - "fmla z23.h, z15.h, z5.h[1]\n" - "fmla z27.h, z15.h, z6.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z5.h[2]\n" - "fmla z24.h, z8.h, z6.h[2]\n" - "fmla z17.h, z9.h, z4.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z21.h, z9.h, z5.h[2]\n" - "fmla z25.h, z9.h, z6.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[2]\n" - "fmla z22.h, z10.h, z5.h[2]\n" - "fmla z26.h, z10.h, z6.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[2]\n" - "fmla z23.h, z11.h, z5.h[2]\n" - "fmla z27.h, z11.h, z6.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" - "fmla z20.h, z12.h, z5.h[3]\n" - "fmla z24.h, z12.h, z6.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[3]\n" - "fmla z21.h, z13.h, z5.h[3]\n" - "fmla z25.h, z13.h, z6.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[3]\n" - "fmla z22.h, z14.h, z5.h[3]\n" - "fmla z26.h, z14.h, z6.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[3]\n" - "fmla z23.h, z15.h, z5.h[3]\n" - "fmla z27.h, z15.h, z6.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[4]\n" - "fmla z20.h, z8.h, z5.h[4]\n" - "fmla z24.h, z8.h, z6.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[4]\n" - "fmla z21.h, z9.h, z5.h[4]\n" - "fmla z25.h, z9.h, z6.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[4]\n" 
- "fmla z22.h, z10.h, z5.h[4]\n" - "fmla z26.h, z10.h, z6.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[4]\n" - "fmla z23.h, z11.h, z5.h[4]\n" - "fmla z27.h, z11.h, z6.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" - "fmla z20.h, z12.h, z5.h[5]\n" - "fmla z24.h, z12.h, z6.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[5]\n" - "fmla z21.h, z13.h, z5.h[5]\n" - "fmla z25.h, z13.h, z6.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[5]\n" - "fmla z22.h, z14.h, z5.h[5]\n" - "fmla z26.h, z14.h, z6.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[5]\n" - "fmla z23.h, z15.h, z5.h[5]\n" - "fmla z27.h, z15.h, z6.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z20.h, z8.h, z5.h[6]\n" - "fmla z24.h, z8.h, z6.h[6]\n" - "fmla z17.h, z9.h, z4.h[6]\n" - "fmla z21.h, z9.h, z5.h[6]\n" - "fmla z25.h, z9.h, z6.h[6]\n" - "fmla z18.h, z10.h, z4.h[6]\n" - "fmla z22.h, z10.h, z5.h[6]\n" - "fmla z26.h, z10.h, z6.h[6]\n" - "fmla z19.h, z11.h, z4.h[6]\n" - "fmla z23.h, z11.h, z5.h[6]\n" - "fmla z27.h, z11.h, z6.h[6]\n" - "fmla z16.h, z12.h, z4.h[7]\n" - "fmla z20.h, z12.h, z5.h[7]\n" - "fmla z24.h, z12.h, z6.h[7]\n" - "fmla z17.h, z13.h, z4.h[7]\n" - "fmla z21.h, z13.h, z5.h[7]\n" - "fmla z25.h, z13.h, z6.h[7]\n" - "fmla z18.h, z14.h, z4.h[7]\n" - "fmla z22.h, z14.h, z5.h[7]\n" - "fmla z26.h, z14.h, z6.h[7]\n" - "fmla z19.h, z15.h, z4.h[7]\n" - "fmla z23.h, z15.h, z5.h[7]\n" - "fmla z27.h, z15.h, z6.h[7]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[0]\n" - "fmla z24.h, z8.h, z2.h[0]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "fmla z21.h, z9.h, z1.h[0]\n" - "fmla z25.h, z9.h, z2.h[0]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "fmla z22.h, z10.h, z1.h[0]\n" - "fmla z26.h, z10.h, z2.h[0]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "fmla z23.h, z11.h, z1.h[0]\n" - "fmla z27.h, z11.h, z2.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.h, z12.h, z1.h[1]\n" - "fmla z24.h, z12.h, z2.h[1]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "fmla z21.h, z13.h, z1.h[1]\n" - "fmla z25.h, z13.h, z2.h[1]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z22.h, z14.h, z1.h[1]\n" - "fmla z26.h, z14.h, z2.h[1]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "fmla z23.h, z15.h, z1.h[1]\n" - "fmla z27.h, z15.h, z2.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "fmla z20.h, z8.h, z1.h[2]\n" - "fmla z24.h, z8.h, z2.h[2]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "fmla z21.h, z9.h, z1.h[2]\n" - "fmla z25.h, z9.h, z2.h[2]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "fmla z22.h, z10.h, z1.h[2]\n" - "fmla z26.h, z10.h, z2.h[2]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - 
"fmla z23.h, z11.h, z1.h[2]\n" - "fmla z27.h, z11.h, z2.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z12.h, z1.h[3]\n" - "fmla z24.h, z12.h, z2.h[3]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z21.h, z13.h, z1.h[3]\n" - "fmla z25.h, z13.h, z2.h[3]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla z22.h, z14.h, z1.h[3]\n" - "fmla z26.h, z14.h, z2.h[3]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "fmla z23.h, z15.h, z1.h[3]\n" - "fmla z27.h, z15.h, z2.h[3]\n" - "b.eq 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[4]\n" - "fmla z24.h, z8.h, z2.h[4]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z21.h, z9.h, z1.h[4]\n" - "fmla z25.h, z9.h, z2.h[4]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z22.h, z10.h, z1.h[4]\n" - "fmla z26.h, z10.h, z2.h[4]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "fmla z23.h, z11.h, z1.h[4]\n" - "fmla z27.h, z11.h, z2.h[4]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.h, z12.h, z1.h[5]\n" - "fmla z24.h, z12.h, z2.h[5]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z21.h, z13.h, z1.h[5]\n" - "fmla z25.h, z13.h, z2.h[5]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z22.h, z14.h, z1.h[5]\n" - "fmla z26.h, z14.h, z2.h[5]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "fmla z23.h, z15.h, z1.h[5]\n" - "fmla z27.h, z15.h, z2.h[5]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "fmla z20.h, z8.h, z1.h[6]\n" - "fmla z24.h, z8.h, z2.h[6]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "fmla z21.h, z9.h, z1.h[6]\n" - "fmla z25.h, z9.h, z2.h[6]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "fmla z22.h, z10.h, z1.h[6]\n" - "fmla z26.h, z10.h, z2.h[6]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "fmla z23.h, z11.h, z1.h[6]\n" - "fmla z27.h, z11.h, z2.h[6]\n" - "b 5f\n" - "4:\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - "fmla z24.h, z8.h, z2.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - "fmla z21.h, z9.h, z1.h[0]\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" - "fmla z25.h, z9.h, z2.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "fmla z22.h, z10.h, z1.h[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" - "fmla z26.h, z10.h, z2.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" - "fmla z23.h, z11.h, z1.h[0]\n" - "fmla z27.h, z11.h, z2.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "fmla z20.h, z12.h, z1.h[1]\n" - "fmla z24.h, z12.h, z2.h[1]\n" - "ld1h 
z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "fmla z21.h, z13.h, z1.h[1]\n" - "fmla z25.h, z13.h, z2.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z22.h, z14.h, z1.h[1]\n" - "fmla z26.h, z14.h, z2.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "fmla z23.h, z15.h, z1.h[1]\n" - "fmla z27.h, z15.h, z2.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[2]\n" - "fmla z24.h, z8.h, z2.h[2]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z21.h, z9.h, z1.h[2]\n" - "fmla z25.h, z9.h, z2.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "fmla z22.h, z10.h, z1.h[2]\n" - "fmla z26.h, z10.h, z2.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "fmla z23.h, z11.h, z1.h[2]\n" - "fmla z27.h, z11.h, z2.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "fmla z20.h, z12.h, z1.h[3]\n" - "fmla z24.h, z12.h, z2.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z21.h, z13.h, z1.h[3]\n" - "fmla z25.h, z13.h, z2.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla z22.h, z14.h, z1.h[3]\n" - "fmla z26.h, z14.h, z2.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "fmla z23.h, z15.h, z1.h[3]\n" - "fmla z27.h, z15.h, z2.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "fmla z20.h, z8.h, z1.h[4]\n" - "fmla z24.h, z8.h, z2.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z21.h, z9.h, z1.h[4]\n" - "fmla z25.h, z9.h, z2.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z22.h, z10.h, z1.h[4]\n" - "fmla z26.h, z10.h, z2.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "fmla z23.h, z11.h, z1.h[4]\n" - "fmla z27.h, z11.h, z2.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "fmla z20.h, z12.h, z1.h[5]\n" - "fmla z24.h, z12.h, z2.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z21.h, z13.h, z1.h[5]\n" - "fmla z25.h, z13.h, z2.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z22.h, z14.h, z1.h[5]\n" - "fmla z26.h, z14.h, z2.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "fmla z23.h, z15.h, z1.h[5]\n" - "fmla z27.h, z15.h, z2.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z20.h, z8.h, z1.h[6]\n" - "fmla z24.h, z8.h, z2.h[6]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "fmla z21.h, z9.h, z1.h[6]\n" - "fmla z25.h, z9.h, z2.h[6]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "fmla z22.h, z10.h, z1.h[6]\n" - "fmla z26.h, z10.h, z2.h[6]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "fmla z23.h, z11.h, z1.h[6]\n" - "fmla z27.h, z11.h, z2.h[6]\n" - "fmla z16.h, z12.h, z0.h[7]\n" - "fmla z20.h, z12.h, z1.h[7]\n" - "fmla z24.h, z12.h, z2.h[7]\n" - "fmla z17.h, z13.h, z0.h[7]\n" - "fmla z21.h, z13.h, z1.h[7]\n" - "fmla z25.h, z13.h, z2.h[7]\n" - "fmla z18.h, z14.h, z0.h[7]\n" - "fmla z22.h, z14.h, z1.h[7]\n" - "fmla 
z26.h, z14.h, z2.h[7]\n" - "fmla z19.h, z15.h, z0.h[7]\n" - "fmla z23.h, z15.h, z1.h[7]\n" - "fmla z27.h, z15.h, z2.h[7]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.h, z8.h, z5.h[0]\n" - "fmla z24.h, z8.h, z6.h[0]\n" - "fmla z17.h, z9.h, z4.h[0]\n" - "fmla z21.h, z9.h, z5.h[0]\n" - "fmla z25.h, z9.h, z6.h[0]\n" - "fmla z18.h, z10.h, z4.h[0]\n" - "fmla z22.h, z10.h, z5.h[0]\n" - "fmla z26.h, z10.h, z6.h[0]\n" - "fmla z19.h, z11.h, z4.h[0]\n" - "fmla z23.h, z11.h, z5.h[0]\n" - "fmla z27.h, z11.h, z6.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.h, z12.h, z5.h[1]\n" - "fmla z24.h, z12.h, z6.h[1]\n" - "fmla z17.h, z13.h, z4.h[1]\n" - "fmla z21.h, z13.h, z5.h[1]\n" - "fmla z25.h, z13.h, z6.h[1]\n" - "fmla z18.h, z14.h, z4.h[1]\n" - "fmla z22.h, z14.h, z5.h[1]\n" - "fmla z26.h, z14.h, z6.h[1]\n" - "fmla z19.h, z15.h, z4.h[1]\n" - "fmla z23.h, z15.h, z5.h[1]\n" - "fmla z27.h, z15.h, z6.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[2]\n" - "fmla z20.h, z8.h, z5.h[2]\n" - "fmla z24.h, z8.h, z6.h[2]\n" - "fmla z17.h, z9.h, z4.h[2]\n" - "fmla z21.h, z9.h, z5.h[2]\n" - "fmla z25.h, z9.h, z6.h[2]\n" - "fmla z18.h, z10.h, z4.h[2]\n" - "fmla z22.h, z10.h, z5.h[2]\n" - "fmla z26.h, z10.h, z6.h[2]\n" - "fmla z19.h, z11.h, z4.h[2]\n" - "fmla z23.h, z11.h, z5.h[2]\n" - "fmla z27.h, z11.h, z6.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z12.h, z5.h[3]\n" - "fmla z24.h, z12.h, z6.h[3]\n" - "fmla z17.h, z13.h, z4.h[3]\n" - "fmla z21.h, z13.h, z5.h[3]\n" - "fmla z25.h, z13.h, z6.h[3]\n" - "fmla z18.h, z14.h, z4.h[3]\n" - "fmla z22.h, z14.h, z5.h[3]\n" - "fmla z26.h, z14.h, z6.h[3]\n" - "fmla z19.h, z15.h, z4.h[3]\n" - "fmla z23.h, z15.h, z5.h[3]\n" - "fmla z27.h, z15.h, z6.h[3]\n" - "b.eq 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.h, z8.h, z5.h[4]\n" - "fmla z24.h, z8.h, z6.h[4]\n" - "fmla z17.h, z9.h, z4.h[4]\n" - "fmla z21.h, z9.h, z5.h[4]\n" - "fmla z25.h, z9.h, z6.h[4]\n" - "fmla z18.h, z10.h, z4.h[4]\n" - "fmla z22.h, z10.h, z5.h[4]\n" - "fmla z26.h, z10.h, z6.h[4]\n" - "fmla z19.h, z11.h, z4.h[4]\n" - "fmla z23.h, z11.h, z5.h[4]\n" - "fmla z27.h, z11.h, z6.h[4]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, 
z4.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.h, z12.h, z5.h[5]\n" - "fmla z24.h, z12.h, z6.h[5]\n" - "fmla z17.h, z13.h, z4.h[5]\n" - "fmla z21.h, z13.h, z5.h[5]\n" - "fmla z25.h, z13.h, z6.h[5]\n" - "fmla z18.h, z14.h, z4.h[5]\n" - "fmla z22.h, z14.h, z5.h[5]\n" - "fmla z26.h, z14.h, z6.h[5]\n" - "fmla z19.h, z15.h, z4.h[5]\n" - "fmla z23.h, z15.h, z5.h[5]\n" - "fmla z27.h, z15.h, z6.h[5]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[6]\n" - "fmla z20.h, z8.h, z5.h[6]\n" - "fmla z24.h, z8.h, z6.h[6]\n" - "fmla z17.h, z9.h, z4.h[6]\n" - "fmla z21.h, z9.h, z5.h[6]\n" - "fmla z25.h, z9.h, z6.h[6]\n" - "fmla z18.h, z10.h, z4.h[6]\n" - "fmla z22.h, z10.h, z5.h[6]\n" - "fmla z26.h, z10.h, z6.h[6]\n" - "fmla z19.h, z11.h, z4.h[6]\n" - "fmla z23.h, z11.h, z5.h[6]\n" - "fmla z27.h, z11.h, z6.h[6]\n" - "5:\n" - "ld1rh z14.h, p7/z, [%[minptr]]\n" - "ld1rh z15.h, p7/z, [%[maxptr]]\n" - "fmax z16.h, p7/m, z16.h, z14.h\n" - "fmax z17.h, p7/m, z17.h, z14.h\n" - "fmax z18.h, p7/m, z18.h, z14.h\n" - "fmax z19.h, p7/m, z19.h, z14.h\n" - "fmin z16.h, p7/m, z16.h, z15.h\n" - "fmin z17.h, p7/m, z17.h, z15.h\n" - "fmin z18.h, p7/m, z18.h, z15.h\n" - "fmin z19.h, p7/m, z19.h, z15.h\n" - "st1h z16.h, p0, [%[c_ptr0]]\n" - "fmax z20.h, p7/m, z20.h, z14.h\n" - "fmax z21.h, p7/m, z21.h, z14.h\n" - "fmax z22.h, p7/m, z22.h, z14.h\n" - "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmax z23.h, p7/m, z23.h, z14.h\n" - "fmin z20.h, p7/m, z20.h, z15.h\n" - "fmin z21.h, p7/m, z21.h, z15.h\n" - "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmin z22.h, p7/m, z22.h, z15.h\n" - "fmin z23.h, p7/m, z23.h, z15.h\n" - "fmax z24.h, p7/m, z24.h, z14.h\n" - "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n" - "fmax z25.h, p7/m, z25.h, z14.h\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "fmax z26.h, p7/m, z26.h, z14.h\n" - "st1h z20.h, p0, [c_ptr1]\n" - "fmin z24.h, p7/m, z24.h, z15.h\n" - "fmin z25.h, p7/m, z25.h, z15.h\n" - "fmax z27.h, p7/m, z27.h, z14.h\n" - "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n" - "fmin z26.h, p7/m, z26.h, z15.h\n" - "fmin z27.h, p7/m, z27.h, z15.h\n" - "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n" - "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n" - "st1h z24.h, p0, [c_ptr2]\n" - "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n" - "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n" - "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add 
c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "whilelt p6.h, %[temp], %[leftovers]\n" - "whilelt p0.h, %[temp], %[width]\n" - "inch %[temp], all, mul #1\n" - "ptrue p7.h\n" - "whilelt p1.h, %[temp], %[width]\n" - "inch %[temp], all, mul #1\n" - "whilelt p2.h, %[temp], %[width]\n" - "inch %[temp], all, mul #1\n" - "whilelt p3.h, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1h z16.h, p0/z, [%[biasptr]]\n" - "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n" - "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1h z19.h, p3/z, [%[biasptr], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "mov z21.d, z17.d\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "mov z22.d, z18.d\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "mov z23.d, z19.d\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "mov z24.d, z16.d\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "mov z25.d, z17.d\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z26.d, z18.d\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z27.d, z19.d\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z28.d, z16.d\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z29.d, z17.d\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z30.d, z18.d\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "mov z31.d, z19.d\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add a_ptr3, a_ptr3, #0x10\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1h z16.h, p0/z, [%[c_ptr0]]\n" - "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1h z20.h, p0/z, [c_ptr1]\n" - "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1h z24.h, p0/z, [c_ptr2]\n" - "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n" - "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n" - "ld1h z28.h, p0/z, [c_ptr3]\n" - "ld1h z29.h, p1/z, [c_ptr3, #1, MUL VL]\n" - "ld1h z30.h, p2/z, [c_ptr3, #2, MUL VL]\n" - "ld1h z31.h, p3/z, [c_ptr3, #3, MUL VL]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqh z1.h, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1rqh z2.h, p7/z, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1rqh z3.h, p7/z, [a_ptr3]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "fmla z24.h, z8.h, z2.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - "fmla z28.h, z8.h, z3.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z25.h, z9.h, z2.h[0]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla z29.h, z9.h, z3.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla 
z22.h, z10.h, z1.h[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla z26.h, z10.h, z2.h[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla z30.h, z10.h, z3.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "add a_ptr3, a_ptr3, #0x20\n" - "fmla z23.h, z11.h, z1.h[0]\n" - "fmla z27.h, z11.h, z2.h[0]\n" - "fmla z31.h, z11.h, z3.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "fmla z20.h, z12.h, z1.h[1]\n" - "fmla z24.h, z12.h, z2.h[1]\n" - "fmla z28.h, z12.h, z3.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "fmla z21.h, z13.h, z1.h[1]\n" - "fmla z25.h, z13.h, z2.h[1]\n" - "fmla z29.h, z13.h, z3.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z22.h, z14.h, z1.h[1]\n" - "fmla z26.h, z14.h, z2.h[1]\n" - "fmla z30.h, z14.h, z3.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "fmla z23.h, z15.h, z1.h[1]\n" - "fmla z27.h, z15.h, z2.h[1]\n" - "fmla z31.h, z15.h, z3.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[2]\n" - "fmla z24.h, z8.h, z2.h[2]\n" - "fmla z28.h, z8.h, z3.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "fmla z21.h, z9.h, z1.h[2]\n" - "fmla z25.h, z9.h, z2.h[2]\n" - "fmla z29.h, z9.h, z3.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "fmla z22.h, z10.h, z1.h[2]\n" - "fmla z26.h, z10.h, z2.h[2]\n" - "fmla z30.h, z10.h, z3.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "fmla z23.h, z11.h, z1.h[2]\n" - "fmla z27.h, z11.h, z2.h[2]\n" - "fmla z31.h, z11.h, z3.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "fmla z20.h, z12.h, z1.h[3]\n" - "fmla z24.h, z12.h, z2.h[3]\n" - "fmla z28.h, z12.h, z3.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z21.h, z13.h, z1.h[3]\n" - "fmla z25.h, z13.h, z2.h[3]\n" - "fmla z29.h, z13.h, z3.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla z22.h, z14.h, z1.h[3]\n" - "fmla z26.h, z14.h, z2.h[3]\n" - "fmla z30.h, z14.h, z3.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "fmla z23.h, z15.h, z1.h[3]\n" - "fmla z27.h, z15.h, z2.h[3]\n" - "fmla z31.h, z15.h, z3.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "fmla z20.h, z8.h, z1.h[4]\n" - "fmla z24.h, z8.h, z2.h[4]\n" - "fmla z28.h, z8.h, z3.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z21.h, z9.h, z1.h[4]\n" - "fmla z25.h, z9.h, z2.h[4]\n" - "fmla z29.h, z9.h, z3.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z22.h, z10.h, z1.h[4]\n" - "fmla z26.h, z10.h, z2.h[4]\n" - "fmla z30.h, z10.h, z3.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "fmla z23.h, z11.h, z1.h[4]\n" - "fmla z27.h, z11.h, z2.h[4]\n" - "fmla z31.h, z11.h, z3.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "fmla z20.h, z12.h, z1.h[5]\n" - "fmla z24.h, z12.h, z2.h[5]\n" - "fmla z28.h, z12.h, z3.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z21.h, z13.h, z1.h[5]\n" 
- "fmla z25.h, z13.h, z2.h[5]\n" - "fmla z29.h, z13.h, z3.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z22.h, z14.h, z1.h[5]\n" - "fmla z26.h, z14.h, z2.h[5]\n" - "fmla z30.h, z14.h, z3.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "fmla z23.h, z15.h, z1.h[5]\n" - "fmla z27.h, z15.h, z2.h[5]\n" - "fmla z31.h, z15.h, z3.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[6]\n" - "fmla z24.h, z8.h, z2.h[6]\n" - "fmla z28.h, z8.h, z3.h[6]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "fmla z21.h, z9.h, z1.h[6]\n" - "fmla z25.h, z9.h, z2.h[6]\n" - "fmla z29.h, z9.h, z3.h[6]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "fmla z22.h, z10.h, z1.h[6]\n" - "fmla z26.h, z10.h, z2.h[6]\n" - "fmla z30.h, z10.h, z3.h[6]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "fmla z23.h, z11.h, z1.h[6]\n" - "fmla z27.h, z11.h, z2.h[6]\n" - "fmla z31.h, z11.h, z3.h[6]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[7]\n" - "fmla z20.h, z12.h, z1.h[7]\n" - "fmla z24.h, z12.h, z2.h[7]\n" - "fmla z28.h, z12.h, z3.h[7]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[7]\n" - "fmla z21.h, z13.h, z1.h[7]\n" - "fmla z25.h, z13.h, z2.h[7]\n" - "fmla z29.h, z13.h, z3.h[7]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[7]\n" - "fmla z22.h, z14.h, z1.h[7]\n" - "fmla z26.h, z14.h, z2.h[7]\n" - "fmla z30.h, z14.h, z3.h[7]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[7]\n" - "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n" - "fmla z23.h, z15.h, z1.h[7]\n" - "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n" - "fmla z27.h, z15.h, z2.h[7]\n" - "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" - "fmla z31.h, z15.h, z3.h[7]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n" - "fmla z20.h, z8.h, z5.h[0]\n" - "fmla z24.h, z8.h, z6.h[0]\n" - "fmla z28.h, z8.h, z7.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[0]\n" - "fmla z21.h, z9.h, z5.h[0]\n" - "fmla z25.h, z9.h, z6.h[0]\n" - "fmla z29.h, z9.h, z7.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[0]\n" - "fmla z22.h, z10.h, z5.h[0]\n" - "fmla z26.h, z10.h, z6.h[0]\n" - "fmla z30.h, z10.h, z7.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[0]\n" - "fmla z23.h, z11.h, z5.h[0]\n" - "fmla z27.h, z11.h, z6.h[0]\n" - "fmla z31.h, z11.h, z7.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[1]\n" - "fmla z20.h, z12.h, z5.h[1]\n" - "fmla z24.h, z12.h, z6.h[1]\n" - "fmla z28.h, z12.h, z7.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[1]\n" - "fmla z21.h, z13.h, z5.h[1]\n" - "fmla z25.h, z13.h, z6.h[1]\n" - "fmla z29.h, z13.h, z7.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[1]\n" - "fmla z22.h, z14.h, z5.h[1]\n" - "fmla z26.h, z14.h, z6.h[1]\n" - "fmla z30.h, z14.h, z7.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[1]\n" - "fmla z23.h, z15.h, z5.h[1]\n" - "fmla z27.h, z15.h, z6.h[1]\n" - "fmla z31.h, z15.h, z7.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, 
z8.h, z4.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z5.h[2]\n" - "fmla z24.h, z8.h, z6.h[2]\n" - "fmla z28.h, z8.h, z7.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[2]\n" - "fmla z21.h, z9.h, z5.h[2]\n" - "fmla z25.h, z9.h, z6.h[2]\n" - "fmla z29.h, z9.h, z7.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[2]\n" - "fmla z22.h, z10.h, z5.h[2]\n" - "fmla z26.h, z10.h, z6.h[2]\n" - "fmla z30.h, z10.h, z7.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[2]\n" - "fmla z23.h, z11.h, z5.h[2]\n" - "fmla z27.h, z11.h, z6.h[2]\n" - "fmla z31.h, z11.h, z7.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" - "fmla z20.h, z12.h, z5.h[3]\n" - "fmla z24.h, z12.h, z6.h[3]\n" - "fmla z28.h, z12.h, z7.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[3]\n" - "fmla z21.h, z13.h, z5.h[3]\n" - "fmla z25.h, z13.h, z6.h[3]\n" - "fmla z29.h, z13.h, z7.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[3]\n" - "fmla z22.h, z14.h, z5.h[3]\n" - "fmla z26.h, z14.h, z6.h[3]\n" - "fmla z30.h, z14.h, z7.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[3]\n" - "fmla z23.h, z15.h, z5.h[3]\n" - "fmla z27.h, z15.h, z6.h[3]\n" - "fmla z31.h, z15.h, z7.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[4]\n" - "fmla z20.h, z8.h, z5.h[4]\n" - "fmla z24.h, z8.h, z6.h[4]\n" - "fmla z28.h, z8.h, z7.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[4]\n" - "fmla z21.h, z9.h, z5.h[4]\n" - "fmla z25.h, z9.h, z6.h[4]\n" - "fmla z29.h, z9.h, z7.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[4]\n" - "fmla z22.h, z10.h, z5.h[4]\n" - "fmla z26.h, z10.h, z6.h[4]\n" - "fmla z30.h, z10.h, z7.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[4]\n" - "fmla z23.h, z11.h, z5.h[4]\n" - "fmla z27.h, z11.h, z6.h[4]\n" - "fmla z31.h, z11.h, z7.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" - "fmla z20.h, z12.h, z5.h[5]\n" - "fmla z24.h, z12.h, z6.h[5]\n" - "fmla z28.h, z12.h, z7.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[5]\n" - "fmla z21.h, z13.h, z5.h[5]\n" - "fmla z25.h, z13.h, z6.h[5]\n" - "fmla z29.h, z13.h, z7.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[5]\n" - "fmla z22.h, z14.h, z5.h[5]\n" - "fmla z26.h, z14.h, z6.h[5]\n" - "fmla z30.h, z14.h, z7.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[5]\n" - "fmla z23.h, z15.h, z5.h[5]\n" - "fmla z27.h, z15.h, z6.h[5]\n" - "fmla z31.h, z15.h, z7.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z5.h[6]\n" - "fmla z24.h, z8.h, z6.h[6]\n" - "fmla z28.h, z8.h, z7.h[6]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[6]\n" - "fmla z21.h, z9.h, z5.h[6]\n" - "fmla z25.h, z9.h, z6.h[6]\n" - "fmla z29.h, z9.h, z7.h[6]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[6]\n" - "fmla z22.h, z10.h, z5.h[6]\n" - "fmla z26.h, z10.h, z6.h[6]\n" - "fmla z30.h, z10.h, z7.h[6]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[6]\n" - "fmla z23.h, z11.h, z5.h[6]\n" - "fmla z27.h, z11.h, z6.h[6]\n" - 
"fmla z31.h, z11.h, z7.h[6]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[7]\n" - "fmla z20.h, z12.h, z5.h[7]\n" - "fmla z24.h, z12.h, z6.h[7]\n" - "fmla z28.h, z12.h, z7.h[7]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[7]\n" - "fmla z21.h, z13.h, z5.h[7]\n" - "fmla z25.h, z13.h, z6.h[7]\n" - "fmla z29.h, z13.h, z7.h[7]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[7]\n" - "fmla z22.h, z14.h, z5.h[7]\n" - "fmla z26.h, z14.h, z6.h[7]\n" - "fmla z30.h, z14.h, z7.h[7]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[7]\n" - "fmla z23.h, z15.h, z5.h[7]\n" - "fmla z27.h, z15.h, z6.h[7]\n" - "fmla z31.h, z15.h, z7.h[7]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" - "fmla z24.h, z8.h, z2.h[0]\n" - "ld1rqh z5.h, p7/z, [a_ptr1]\n" - "fmla z28.h, z8.h, z3.h[0]\n" - "ld1rqh z6.h, p7/z, [a_ptr2]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "ld1rqh z7.h, p7/z, [a_ptr3]\n" - "fmla z21.h, z9.h, z1.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z25.h, z9.h, z2.h[0]\n" - "fmla z29.h, z9.h, z3.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "fmla z22.h, z10.h, z1.h[0]\n" - "fmla z26.h, z10.h, z2.h[0]\n" - "fmla z30.h, z10.h, z3.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "fmla z23.h, z11.h, z1.h[0]\n" - "fmla z27.h, z11.h, z2.h[0]\n" - "fmla z31.h, z11.h, z3.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "fmla z20.h, z12.h, z1.h[1]\n" - "fmla z24.h, z12.h, z2.h[1]\n" - "fmla z28.h, z12.h, z3.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "fmla z21.h, z13.h, z1.h[1]\n" - "fmla z25.h, z13.h, z2.h[1]\n" - "fmla z29.h, z13.h, z3.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z22.h, z14.h, z1.h[1]\n" - "fmla z26.h, z14.h, z2.h[1]\n" - "fmla z30.h, z14.h, z3.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "fmla z23.h, z15.h, z1.h[1]\n" - "fmla z27.h, z15.h, z2.h[1]\n" - "fmla z31.h, z15.h, z3.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[2]\n" - "fmla z24.h, z8.h, z2.h[2]\n" - "fmla z28.h, z8.h, z3.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "fmla z21.h, z9.h, z1.h[2]\n" - "fmla z25.h, z9.h, z2.h[2]\n" - "fmla z29.h, z9.h, z3.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "fmla z22.h, z10.h, z1.h[2]\n" - "fmla z26.h, z10.h, z2.h[2]\n" - "fmla z30.h, z10.h, z3.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "fmla z23.h, z11.h, z1.h[2]\n" - "fmla z27.h, z11.h, z2.h[2]\n" - "fmla z31.h, z11.h, z3.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "fmla z20.h, z12.h, z1.h[3]\n" - "fmla z24.h, z12.h, z2.h[3]\n" - "fmla z28.h, z12.h, z3.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z21.h, z13.h, z1.h[3]\n" - "fmla z25.h, z13.h, z2.h[3]\n" - "fmla z29.h, z13.h, z3.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - 
"fmla z22.h, z14.h, z1.h[3]\n" - "fmla z26.h, z14.h, z2.h[3]\n" - "fmla z30.h, z14.h, z3.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "fmla z23.h, z15.h, z1.h[3]\n" - "fmla z27.h, z15.h, z2.h[3]\n" - "fmla z31.h, z15.h, z3.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "fmla z20.h, z8.h, z1.h[4]\n" - "fmla z24.h, z8.h, z2.h[4]\n" - "fmla z28.h, z8.h, z3.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z21.h, z9.h, z1.h[4]\n" - "fmla z25.h, z9.h, z2.h[4]\n" - "fmla z29.h, z9.h, z3.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z22.h, z10.h, z1.h[4]\n" - "fmla z26.h, z10.h, z2.h[4]\n" - "fmla z30.h, z10.h, z3.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "fmla z23.h, z11.h, z1.h[4]\n" - "fmla z27.h, z11.h, z2.h[4]\n" - "fmla z31.h, z11.h, z3.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "fmla z20.h, z12.h, z1.h[5]\n" - "fmla z24.h, z12.h, z2.h[5]\n" - "fmla z28.h, z12.h, z3.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z21.h, z13.h, z1.h[5]\n" - "fmla z25.h, z13.h, z2.h[5]\n" - "fmla z29.h, z13.h, z3.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z22.h, z14.h, z1.h[5]\n" - "fmla z26.h, z14.h, z2.h[5]\n" - "fmla z30.h, z14.h, z3.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "fmla z23.h, z15.h, z1.h[5]\n" - "fmla z27.h, z15.h, z2.h[5]\n" - "fmla z31.h, z15.h, z3.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[6]\n" - "fmla z24.h, z8.h, z2.h[6]\n" - "fmla z28.h, z8.h, z3.h[6]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "fmla z21.h, z9.h, z1.h[6]\n" - "fmla z25.h, z9.h, z2.h[6]\n" - "fmla z29.h, z9.h, z3.h[6]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "fmla z22.h, z10.h, z1.h[6]\n" - "fmla z26.h, z10.h, z2.h[6]\n" - "fmla z30.h, z10.h, z3.h[6]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "fmla z23.h, z11.h, z1.h[6]\n" - "fmla z27.h, z11.h, z2.h[6]\n" - "fmla z31.h, z11.h, z3.h[6]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[7]\n" - "fmla z20.h, z12.h, z1.h[7]\n" - "fmla z24.h, z12.h, z2.h[7]\n" - "fmla z28.h, z12.h, z3.h[7]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[7]\n" - "fmla z21.h, z13.h, z1.h[7]\n" - "fmla z25.h, z13.h, z2.h[7]\n" - "fmla z29.h, z13.h, z3.h[7]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[7]\n" - "fmla z22.h, z14.h, z1.h[7]\n" - "fmla z26.h, z14.h, z2.h[7]\n" - "fmla z30.h, z14.h, z3.h[7]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[7]\n" - "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z23.h, z15.h, z1.h[7]\n" - "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" - "fmla z27.h, z15.h, z2.h[7]\n" - "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n" - "fmla z31.h, z15.h, z3.h[7]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n" - "fmla z20.h, z8.h, z5.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "fmla z24.h, z8.h, z6.h[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - 
"fmla z28.h, z8.h, z7.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[0]\n" - "addvl a_ptr2, a_ptr2, #2\n" - "fmla z21.h, z9.h, z5.h[0]\n" - "addvl a_ptr3, a_ptr3, #2\n" - "fmla z25.h, z9.h, z6.h[0]\n" - "fmla z29.h, z9.h, z7.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[0]\n" - "fmla z22.h, z10.h, z5.h[0]\n" - "fmla z26.h, z10.h, z6.h[0]\n" - "fmla z30.h, z10.h, z7.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[0]\n" - "fmla z23.h, z11.h, z5.h[0]\n" - "fmla z27.h, z11.h, z6.h[0]\n" - "fmla z31.h, z11.h, z7.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[1]\n" - "fmla z20.h, z12.h, z5.h[1]\n" - "fmla z24.h, z12.h, z6.h[1]\n" - "fmla z28.h, z12.h, z7.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[1]\n" - "fmla z21.h, z13.h, z5.h[1]\n" - "fmla z25.h, z13.h, z6.h[1]\n" - "fmla z29.h, z13.h, z7.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[1]\n" - "fmla z22.h, z14.h, z5.h[1]\n" - "fmla z26.h, z14.h, z6.h[1]\n" - "fmla z30.h, z14.h, z7.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[1]\n" - "fmla z23.h, z15.h, z5.h[1]\n" - "fmla z27.h, z15.h, z6.h[1]\n" - "fmla z31.h, z15.h, z7.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z5.h[2]\n" - "fmla z24.h, z8.h, z6.h[2]\n" - "fmla z28.h, z8.h, z7.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z17.h, z9.h, z4.h[2]\n" - "fmla z21.h, z9.h, z5.h[2]\n" - "fmla z25.h, z9.h, z6.h[2]\n" - "fmla z29.h, z9.h, z7.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[2]\n" - "fmla z22.h, z10.h, z5.h[2]\n" - "fmla z26.h, z10.h, z6.h[2]\n" - "fmla z30.h, z10.h, z7.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[2]\n" - "fmla z23.h, z11.h, z5.h[2]\n" - "fmla z27.h, z11.h, z6.h[2]\n" - "fmla z31.h, z11.h, z7.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" - "fmla z20.h, z12.h, z5.h[3]\n" - "fmla z24.h, z12.h, z6.h[3]\n" - "fmla z28.h, z12.h, z7.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[3]\n" - "fmla z21.h, z13.h, z5.h[3]\n" - "fmla z25.h, z13.h, z6.h[3]\n" - "fmla z29.h, z13.h, z7.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[3]\n" - "fmla z22.h, z14.h, z5.h[3]\n" - "fmla z26.h, z14.h, z6.h[3]\n" - "fmla z30.h, z14.h, z7.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[3]\n" - "fmla z23.h, z15.h, z5.h[3]\n" - "fmla z27.h, z15.h, z6.h[3]\n" - "fmla z31.h, z15.h, z7.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[4]\n" - "fmla z20.h, z8.h, z5.h[4]\n" - "fmla z24.h, z8.h, z6.h[4]\n" - "fmla z28.h, z8.h, z7.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z4.h[4]\n" - "fmla z21.h, z9.h, z5.h[4]\n" - "fmla z25.h, z9.h, z6.h[4]\n" - "fmla z29.h, z9.h, z7.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z4.h[4]\n" - "fmla z22.h, z10.h, z5.h[4]\n" - "fmla z26.h, z10.h, z6.h[4]\n" - "fmla z30.h, z10.h, z7.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z4.h[4]\n" - "fmla z23.h, z11.h, z5.h[4]\n" - "fmla z27.h, z11.h, z6.h[4]\n" - "fmla z31.h, z11.h, z7.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL 
VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" - "fmla z20.h, z12.h, z5.h[5]\n" - "fmla z24.h, z12.h, z6.h[5]\n" - "fmla z28.h, z12.h, z7.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z4.h[5]\n" - "fmla z21.h, z13.h, z5.h[5]\n" - "fmla z25.h, z13.h, z6.h[5]\n" - "fmla z29.h, z13.h, z7.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z4.h[5]\n" - "fmla z22.h, z14.h, z5.h[5]\n" - "fmla z26.h, z14.h, z6.h[5]\n" - "fmla z30.h, z14.h, z7.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z4.h[5]\n" - "fmla z23.h, z15.h, z5.h[5]\n" - "fmla z27.h, z15.h, z6.h[5]\n" - "fmla z31.h, z15.h, z7.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z20.h, z8.h, z5.h[6]\n" - "fmla z24.h, z8.h, z6.h[6]\n" - "fmla z28.h, z8.h, z7.h[6]\n" - "fmla z17.h, z9.h, z4.h[6]\n" - "fmla z21.h, z9.h, z5.h[6]\n" - "fmla z25.h, z9.h, z6.h[6]\n" - "fmla z29.h, z9.h, z7.h[6]\n" - "fmla z18.h, z10.h, z4.h[6]\n" - "fmla z22.h, z10.h, z5.h[6]\n" - "fmla z26.h, z10.h, z6.h[6]\n" - "fmla z30.h, z10.h, z7.h[6]\n" - "fmla z19.h, z11.h, z4.h[6]\n" - "fmla z23.h, z11.h, z5.h[6]\n" - "fmla z27.h, z11.h, z6.h[6]\n" - "fmla z31.h, z11.h, z7.h[6]\n" - "fmla z16.h, z12.h, z4.h[7]\n" - "fmla z20.h, z12.h, z5.h[7]\n" - "fmla z24.h, z12.h, z6.h[7]\n" - "fmla z28.h, z12.h, z7.h[7]\n" - "fmla z17.h, z13.h, z4.h[7]\n" - "fmla z21.h, z13.h, z5.h[7]\n" - "fmla z25.h, z13.h, z6.h[7]\n" - "fmla z29.h, z13.h, z7.h[7]\n" - "fmla z18.h, z14.h, z4.h[7]\n" - "fmla z22.h, z14.h, z5.h[7]\n" - "fmla z26.h, z14.h, z6.h[7]\n" - "fmla z30.h, z14.h, z7.h[7]\n" - "fmla z19.h, z15.h, z4.h[7]\n" - "fmla z23.h, z15.h, z5.h[7]\n" - "fmla z27.h, z15.h, z6.h[7]\n" - "fmla z31.h, z15.h, z7.h[7]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[0]\n" - "fmla z24.h, z8.h, z2.h[0]\n" - "fmla z28.h, z8.h, z3.h[0]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "fmla z21.h, z9.h, z1.h[0]\n" - "fmla z25.h, z9.h, z2.h[0]\n" - "fmla z29.h, z9.h, z3.h[0]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "fmla z22.h, z10.h, z1.h[0]\n" - "fmla z26.h, z10.h, z2.h[0]\n" - "fmla z30.h, z10.h, z3.h[0]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "fmla z23.h, z11.h, z1.h[0]\n" - "fmla z27.h, z11.h, z2.h[0]\n" - "fmla z31.h, z11.h, z3.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.h, z12.h, z1.h[1]\n" - "fmla z24.h, z12.h, z2.h[1]\n" - "fmla z28.h, z12.h, z3.h[1]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "fmla z21.h, z13.h, z1.h[1]\n" - "fmla z25.h, z13.h, z2.h[1]\n" - "fmla z29.h, z13.h, z3.h[1]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z22.h, z14.h, z1.h[1]\n" - "fmla z26.h, z14.h, z2.h[1]\n" - "fmla z30.h, z14.h, z3.h[1]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "fmla z23.h, z15.h, z1.h[1]\n" - "fmla z27.h, z15.h, z2.h[1]\n" - "fmla z31.h, z15.h, z3.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, 
[%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "fmla z20.h, z8.h, z1.h[2]\n" - "fmla z24.h, z8.h, z2.h[2]\n" - "fmla z28.h, z8.h, z3.h[2]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "fmla z21.h, z9.h, z1.h[2]\n" - "fmla z25.h, z9.h, z2.h[2]\n" - "fmla z29.h, z9.h, z3.h[2]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "fmla z22.h, z10.h, z1.h[2]\n" - "fmla z26.h, z10.h, z2.h[2]\n" - "fmla z30.h, z10.h, z3.h[2]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "fmla z23.h, z11.h, z1.h[2]\n" - "fmla z27.h, z11.h, z2.h[2]\n" - "fmla z31.h, z11.h, z3.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z12.h, z1.h[3]\n" - "fmla z24.h, z12.h, z2.h[3]\n" - "fmla z28.h, z12.h, z3.h[3]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z21.h, z13.h, z1.h[3]\n" - "fmla z25.h, z13.h, z2.h[3]\n" - "fmla z29.h, z13.h, z3.h[3]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla z22.h, z14.h, z1.h[3]\n" - "fmla z26.h, z14.h, z2.h[3]\n" - "fmla z30.h, z14.h, z3.h[3]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "fmla z23.h, z15.h, z1.h[3]\n" - "fmla z27.h, z15.h, z2.h[3]\n" - "fmla z31.h, z15.h, z3.h[3]\n" - "b.eq 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[4]\n" - "fmla z24.h, z8.h, z2.h[4]\n" - "fmla z28.h, z8.h, z3.h[4]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z21.h, z9.h, z1.h[4]\n" - "fmla z25.h, z9.h, z2.h[4]\n" - "fmla z29.h, z9.h, z3.h[4]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z22.h, z10.h, z1.h[4]\n" - "fmla z26.h, z10.h, z2.h[4]\n" - "fmla z30.h, z10.h, z3.h[4]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "fmla z23.h, z11.h, z1.h[4]\n" - "fmla z27.h, z11.h, z2.h[4]\n" - "fmla z31.h, z11.h, z3.h[4]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.h, z12.h, z1.h[5]\n" - "fmla z24.h, z12.h, z2.h[5]\n" - "fmla z28.h, z12.h, z3.h[5]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z21.h, z13.h, z1.h[5]\n" - "fmla z25.h, z13.h, z2.h[5]\n" - "fmla z29.h, z13.h, z3.h[5]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z22.h, z14.h, z1.h[5]\n" - "fmla z26.h, z14.h, z2.h[5]\n" - "fmla z30.h, z14.h, z3.h[5]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "fmla z23.h, z15.h, z1.h[5]\n" - "fmla z27.h, z15.h, z2.h[5]\n" - "fmla z31.h, z15.h, z3.h[5]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "fmla z20.h, z8.h, z1.h[6]\n" - "fmla z24.h, z8.h, z2.h[6]\n" - "fmla z28.h, z8.h, z3.h[6]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "fmla z21.h, z9.h, z1.h[6]\n" - "fmla z25.h, z9.h, z2.h[6]\n" - "fmla z29.h, z9.h, z3.h[6]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "fmla z22.h, z10.h, z1.h[6]\n" - "fmla z26.h, z10.h, z2.h[6]\n" - "fmla z30.h, z10.h, z3.h[6]\n" - "fmla z19.h, z11.h, 
z0.h[6]\n" - "fmla z23.h, z11.h, z1.h[6]\n" - "fmla z27.h, z11.h, z2.h[6]\n" - "fmla z31.h, z11.h, z3.h[6]\n" - "b 5f\n" - "4:\n" - "fmla z16.h, z8.h, z0.h[0]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z8.h, z1.h[0]\n" - "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n" - "fmla z24.h, z8.h, z2.h[0]\n" - "ld1rqh z5.h, p6/z, [a_ptr1]\n" - "fmla z28.h, z8.h, z3.h[0]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[0]\n" - "ld1rqh z6.h, p6/z, [a_ptr2]\n" - "fmla z21.h, z9.h, z1.h[0]\n" - "ld1rqh z7.h, p6/z, [a_ptr3]\n" - "fmla z25.h, z9.h, z2.h[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "fmla z29.h, z9.h, z3.h[0]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" - "fmla z22.h, z10.h, z1.h[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" - "fmla z26.h, z10.h, z2.h[0]\n" - "addvl a_ptr3, a_ptr3, #1\n" - "fmla z30.h, z10.h, z3.h[0]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[0]\n" - "fmla z23.h, z11.h, z1.h[0]\n" - "fmla z27.h, z11.h, z2.h[0]\n" - "fmla z31.h, z11.h, z3.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[1]\n" - "fmla z20.h, z12.h, z1.h[1]\n" - "fmla z24.h, z12.h, z2.h[1]\n" - "fmla z28.h, z12.h, z3.h[1]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[1]\n" - "fmla z21.h, z13.h, z1.h[1]\n" - "fmla z25.h, z13.h, z2.h[1]\n" - "fmla z29.h, z13.h, z3.h[1]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[1]\n" - "fmla z22.h, z14.h, z1.h[1]\n" - "fmla z26.h, z14.h, z2.h[1]\n" - "fmla z30.h, z14.h, z3.h[1]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[1]\n" - "fmla z23.h, z15.h, z1.h[1]\n" - "fmla z27.h, z15.h, z2.h[1]\n" - "fmla z31.h, z15.h, z3.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.h, z8.h, z1.h[2]\n" - "fmla z24.h, z8.h, z2.h[2]\n" - "fmla z28.h, z8.h, z3.h[2]\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z17.h, z9.h, z0.h[2]\n" - "fmla z21.h, z9.h, z1.h[2]\n" - "fmla z25.h, z9.h, z2.h[2]\n" - "fmla z29.h, z9.h, z3.h[2]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[2]\n" - "fmla z22.h, z10.h, z1.h[2]\n" - "fmla z26.h, z10.h, z2.h[2]\n" - "fmla z30.h, z10.h, z3.h[2]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[2]\n" - "fmla z23.h, z11.h, z1.h[2]\n" - "fmla z27.h, z11.h, z2.h[2]\n" - "fmla z31.h, z11.h, z3.h[2]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[3]\n" - "fmla z20.h, z12.h, z1.h[3]\n" - "fmla z24.h, z12.h, z2.h[3]\n" - "fmla z28.h, z12.h, z3.h[3]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[3]\n" - "fmla z21.h, z13.h, z1.h[3]\n" - "fmla z25.h, z13.h, z2.h[3]\n" - "fmla z29.h, z13.h, z3.h[3]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[3]\n" - "fmla z22.h, z14.h, z1.h[3]\n" - "fmla z26.h, z14.h, z2.h[3]\n" - "fmla z30.h, z14.h, z3.h[3]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[3]\n" - "fmla z23.h, z15.h, z1.h[3]\n" - "fmla z27.h, z15.h, z2.h[3]\n" - "fmla z31.h, z15.h, z3.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[4]\n" - "fmla z20.h, z8.h, z1.h[4]\n" - "fmla z24.h, z8.h, z2.h[4]\n" - "fmla z28.h, z8.h, z3.h[4]\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "fmla z17.h, z9.h, z0.h[4]\n" - "fmla z21.h, 
z9.h, z1.h[4]\n" - "fmla z25.h, z9.h, z2.h[4]\n" - "fmla z29.h, z9.h, z3.h[4]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.h, z10.h, z0.h[4]\n" - "fmla z22.h, z10.h, z1.h[4]\n" - "fmla z26.h, z10.h, z2.h[4]\n" - "fmla z30.h, z10.h, z3.h[4]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.h, z11.h, z0.h[4]\n" - "fmla z23.h, z11.h, z1.h[4]\n" - "fmla z27.h, z11.h, z2.h[4]\n" - "fmla z31.h, z11.h, z3.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.h, z12.h, z0.h[5]\n" - "fmla z20.h, z12.h, z1.h[5]\n" - "fmla z24.h, z12.h, z2.h[5]\n" - "fmla z28.h, z12.h, z3.h[5]\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.h, z13.h, z0.h[5]\n" - "fmla z21.h, z13.h, z1.h[5]\n" - "fmla z25.h, z13.h, z2.h[5]\n" - "fmla z29.h, z13.h, z3.h[5]\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.h, z14.h, z0.h[5]\n" - "fmla z22.h, z14.h, z1.h[5]\n" - "fmla z26.h, z14.h, z2.h[5]\n" - "fmla z30.h, z14.h, z3.h[5]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.h, z15.h, z0.h[5]\n" - "fmla z23.h, z15.h, z1.h[5]\n" - "fmla z27.h, z15.h, z2.h[5]\n" - "fmla z31.h, z15.h, z3.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.h, z8.h, z0.h[6]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z20.h, z8.h, z1.h[6]\n" - "fmla z24.h, z8.h, z2.h[6]\n" - "fmla z28.h, z8.h, z3.h[6]\n" - "fmla z17.h, z9.h, z0.h[6]\n" - "fmla z21.h, z9.h, z1.h[6]\n" - "fmla z25.h, z9.h, z2.h[6]\n" - "fmla z29.h, z9.h, z3.h[6]\n" - "fmla z18.h, z10.h, z0.h[6]\n" - "fmla z22.h, z10.h, z1.h[6]\n" - "fmla z26.h, z10.h, z2.h[6]\n" - "fmla z30.h, z10.h, z3.h[6]\n" - "fmla z19.h, z11.h, z0.h[6]\n" - "fmla z23.h, z11.h, z1.h[6]\n" - "fmla z27.h, z11.h, z2.h[6]\n" - "fmla z31.h, z11.h, z3.h[6]\n" - "fmla z16.h, z12.h, z0.h[7]\n" - "fmla z20.h, z12.h, z1.h[7]\n" - "fmla z24.h, z12.h, z2.h[7]\n" - "fmla z28.h, z12.h, z3.h[7]\n" - "fmla z17.h, z13.h, z0.h[7]\n" - "fmla z21.h, z13.h, z1.h[7]\n" - "fmla z25.h, z13.h, z2.h[7]\n" - "fmla z29.h, z13.h, z3.h[7]\n" - "fmla z18.h, z14.h, z0.h[7]\n" - "fmla z22.h, z14.h, z1.h[7]\n" - "fmla z26.h, z14.h, z2.h[7]\n" - "fmla z30.h, z14.h, z3.h[7]\n" - "fmla z19.h, z15.h, z0.h[7]\n" - "fmla z23.h, z15.h, z1.h[7]\n" - "fmla z27.h, z15.h, z2.h[7]\n" - "fmla z31.h, z15.h, z3.h[7]\n" - "cbz %[blocks], 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[0]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.h, z8.h, z5.h[0]\n" - "fmla z24.h, z8.h, z6.h[0]\n" - "fmla z28.h, z8.h, z7.h[0]\n" - "fmla z17.h, z9.h, z4.h[0]\n" - "fmla z21.h, z9.h, z5.h[0]\n" - "fmla z25.h, z9.h, z6.h[0]\n" - "fmla z29.h, z9.h, z7.h[0]\n" - "fmla z18.h, z10.h, z4.h[0]\n" - "fmla z22.h, z10.h, z5.h[0]\n" - "fmla z26.h, z10.h, z6.h[0]\n" - "fmla z30.h, z10.h, z7.h[0]\n" - "fmla z19.h, z11.h, z4.h[0]\n" - "fmla z23.h, z11.h, z5.h[0]\n" - "fmla z27.h, z11.h, z6.h[0]\n" - "fmla z31.h, z11.h, z7.h[0]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[1]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.h, z12.h, z5.h[1]\n" - "fmla z24.h, z12.h, z6.h[1]\n" - "fmla z28.h, z12.h, z7.h[1]\n" - "fmla z17.h, z13.h, z4.h[1]\n" - "fmla z21.h, z13.h, z5.h[1]\n" - "fmla z25.h, z13.h, z6.h[1]\n" - "fmla z29.h, z13.h, 
z7.h[1]\n" - "fmla z18.h, z14.h, z4.h[1]\n" - "fmla z22.h, z14.h, z5.h[1]\n" - "fmla z26.h, z14.h, z6.h[1]\n" - "fmla z30.h, z14.h, z7.h[1]\n" - "fmla z19.h, z15.h, z4.h[1]\n" - "fmla z23.h, z15.h, z5.h[1]\n" - "fmla z27.h, z15.h, z6.h[1]\n" - "fmla z31.h, z15.h, z7.h[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[2]\n" - "fmla z20.h, z8.h, z5.h[2]\n" - "fmla z24.h, z8.h, z6.h[2]\n" - "fmla z28.h, z8.h, z7.h[2]\n" - "fmla z17.h, z9.h, z4.h[2]\n" - "fmla z21.h, z9.h, z5.h[2]\n" - "fmla z25.h, z9.h, z6.h[2]\n" - "fmla z29.h, z9.h, z7.h[2]\n" - "fmla z18.h, z10.h, z4.h[2]\n" - "fmla z22.h, z10.h, z5.h[2]\n" - "fmla z26.h, z10.h, z6.h[2]\n" - "fmla z30.h, z10.h, z7.h[2]\n" - "fmla z19.h, z11.h, z4.h[2]\n" - "fmla z23.h, z11.h, z5.h[2]\n" - "fmla z27.h, z11.h, z6.h[2]\n" - "fmla z31.h, z11.h, z7.h[2]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[3]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.h, z12.h, z5.h[3]\n" - "fmla z24.h, z12.h, z6.h[3]\n" - "fmla z28.h, z12.h, z7.h[3]\n" - "fmla z17.h, z13.h, z4.h[3]\n" - "fmla z21.h, z13.h, z5.h[3]\n" - "fmla z25.h, z13.h, z6.h[3]\n" - "fmla z29.h, z13.h, z7.h[3]\n" - "fmla z18.h, z14.h, z4.h[3]\n" - "fmla z22.h, z14.h, z5.h[3]\n" - "fmla z26.h, z14.h, z6.h[3]\n" - "fmla z30.h, z14.h, z7.h[3]\n" - "fmla z19.h, z15.h, z4.h[3]\n" - "fmla z23.h, z15.h, z5.h[3]\n" - "fmla z27.h, z15.h, z6.h[3]\n" - "fmla z31.h, z15.h, z7.h[3]\n" - "b.eq 5f\n" - "ld1h z8.h, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.h, z8.h, z4.h[4]\n" - "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.h, z8.h, z5.h[4]\n" - "fmla z24.h, z8.h, z6.h[4]\n" - "fmla z28.h, z8.h, z7.h[4]\n" - "fmla z17.h, z9.h, z4.h[4]\n" - "fmla z21.h, z9.h, z5.h[4]\n" - "fmla z25.h, z9.h, z6.h[4]\n" - "fmla z29.h, z9.h, z7.h[4]\n" - "fmla z18.h, z10.h, z4.h[4]\n" - "fmla z22.h, z10.h, z5.h[4]\n" - "fmla z26.h, z10.h, z6.h[4]\n" - "fmla z30.h, z10.h, z7.h[4]\n" - "fmla z19.h, z11.h, z4.h[4]\n" - "fmla z23.h, z11.h, z5.h[4]\n" - "fmla z27.h, z11.h, z6.h[4]\n" - "fmla z31.h, z11.h, z7.h[4]\n" - "b.eq 5f\n" - "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.h, z12.h, z4.h[5]\n" - "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.h, z12.h, z5.h[5]\n" - "fmla z24.h, z12.h, z6.h[5]\n" - "fmla z28.h, z12.h, z7.h[5]\n" - "fmla z17.h, z13.h, z4.h[5]\n" - "fmla z21.h, z13.h, z5.h[5]\n" - "fmla z25.h, z13.h, z6.h[5]\n" - "fmla z29.h, z13.h, z7.h[5]\n" - "fmla z18.h, z14.h, z4.h[5]\n" - "fmla z22.h, z14.h, z5.h[5]\n" - "fmla z26.h, z14.h, z6.h[5]\n" - "fmla z30.h, z14.h, z7.h[5]\n" - "fmla z19.h, z15.h, z4.h[5]\n" - "fmla z23.h, z15.h, z5.h[5]\n" - "fmla z27.h, z15.h, z6.h[5]\n" - "fmla z31.h, z15.h, z7.h[5]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1h z10.h, p7/z, 
[%[b_ptr0], #-6, MUL VL]\n"
-                "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
-                "fmla z16.h, z8.h, z4.h[6]\n"
-                "fmla z20.h, z8.h, z5.h[6]\n"
-                "fmla z24.h, z8.h, z6.h[6]\n"
-                "fmla z28.h, z8.h, z7.h[6]\n"
-                "fmla z17.h, z9.h, z4.h[6]\n"
-                "fmla z21.h, z9.h, z5.h[6]\n"
-                "fmla z25.h, z9.h, z6.h[6]\n"
-                "fmla z29.h, z9.h, z7.h[6]\n"
-                "fmla z18.h, z10.h, z4.h[6]\n"
-                "fmla z22.h, z10.h, z5.h[6]\n"
-                "fmla z26.h, z10.h, z6.h[6]\n"
-                "fmla z30.h, z10.h, z7.h[6]\n"
-                "fmla z19.h, z11.h, z4.h[6]\n"
-                "fmla z23.h, z11.h, z5.h[6]\n"
-                "fmla z27.h, z11.h, z6.h[6]\n"
-                "fmla z31.h, z11.h, z7.h[6]\n"
-                "5:\n"
-                "ld1rh z14.h, p7/z, [%[minptr]]\n"
-                "ld1rh z15.h, p7/z, [%[maxptr]]\n"
-                "fmax z16.h, p7/m, z16.h, z14.h\n"
-                "fmax z17.h, p7/m, z17.h, z14.h\n"
-                "fmax z18.h, p7/m, z18.h, z14.h\n"
-                "fmax z19.h, p7/m, z19.h, z14.h\n"
-                "fmin z16.h, p7/m, z16.h, z15.h\n"
-                "fmin z17.h, p7/m, z17.h, z15.h\n"
-                "fmin z18.h, p7/m, z18.h, z15.h\n"
-                "fmin z19.h, p7/m, z19.h, z15.h\n"
-                "st1h z16.h, p0, [%[c_ptr0]]\n"
-                "fmax z20.h, p7/m, z20.h, z14.h\n"
-                "fmax z21.h, p7/m, z21.h, z14.h\n"
-                "fmax z22.h, p7/m, z22.h, z14.h\n"
-                "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
-                "fmax z23.h, p7/m, z23.h, z14.h\n"
-                "fmin z20.h, p7/m, z20.h, z15.h\n"
-                "fmin z21.h, p7/m, z21.h, z15.h\n"
-                "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
-                "fmin z22.h, p7/m, z22.h, z15.h\n"
-                "fmin z23.h, p7/m, z23.h, z15.h\n"
-                "fmax z24.h, p7/m, z24.h, z14.h\n"
-                "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
-                "fmax z25.h, p7/m, z25.h, z14.h\n"
-                "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                "fmax z26.h, p7/m, z26.h, z14.h\n"
-                "st1h z20.h, p0, [c_ptr1]\n"
-                "fmin z24.h, p7/m, z24.h, z15.h\n"
-                "fmin z25.h, p7/m, z25.h, z15.h\n"
-                "fmax z27.h, p7/m, z27.h, z14.h\n"
-                "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
-                "fmin z26.h, p7/m, z26.h, z15.h\n"
-                "fmax z28.h, p7/m, z28.h, z14.h\n"
-                "fmax z29.h, p7/m, z29.h, z14.h\n"
-                "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
-                "fmin z27.h, p7/m, z27.h, z15.h\n"
-                "fmax z30.h, p7/m, z30.h, z14.h\n"
-                "fmin z28.h, p7/m, z28.h, z15.h\n"
-                "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
-                "fmin z29.h, p7/m, z29.h, z15.h\n"
-                "fmax z31.h, p7/m, z31.h, z14.h\n"
-                "fmin z30.h, p7/m, z30.h, z15.h\n"
-                "st1h z24.h, p0, [c_ptr2]\n"
-                "fmin z31.h, p7/m, z31.h, z15.h\n"
-                "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
-                "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
-                "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
-                "st1h z28.h, p0, [c_ptr3]\n"
-                "st1h z29.h, p1, [c_ptr3, #1, MUL VL]\n"
-                "st1h z30.h, p2, [c_ptr3, #2, MUL VL]\n"
-                "st1h z31.h, p3, [c_ptr3, #3, MUL VL]\n"
-                ".unreq a_ptr1\n"
-                ".unreq a_ptr2\n"
-                ".unreq a_ptr3\n"
-                ".unreq c_ptr1\n"
-                ".unreq c_ptr2\n"
-                ".unreq c_ptr3\n"
-                : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
-                : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-            );
-            break;
-        }
-
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
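Note on the epilogue removed above: every row-count variant of the old kernel funnels through the `5:` label, which broadcasts the activation bounds from `minptr`/`maxptr` (the `ld1rh` loads into z14/z15) and clamps each accumulator with a predicated fmax/fmin pair before the partial-vector `st1h` stores. A scalar sketch of that clamp, with a hypothetical helper name that is not code from this patch:

    // Clamp one accumulator lane the way the fmax/fmin pair does.
    // lo = *minptr (z14), hi = *maxptr (z15); illustrative only.
    static inline __fp16 clamp_lane(__fp16 v, __fp16 lo, __fp16 hi)
    {
        v = (v < lo) ? lo : v;   // fmax zN.h, p7/m, zN.h, z14.h
        v = (v > hi) ? hi : v;   // fmin zN.h, p7/m, zN.h, z15.h
        return v;
    }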
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
similarity index 65%
rename from src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp
rename to src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
index ebef413848..0260050f29 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,42 +10,48 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
  */
 #pragma once
-
 #ifdef __ARM_FEATURE_SVE
-
 #include "../std_transforms_sve.hpp"
 
+#define ARGLIST \
+    unsigned int, const unsigned int *, \
+    IndirectInputArg<__fp16>, \
+    size_t, size_t, \
+    const __fp16 *, \
+    IndirectOutputArg<__fp16>, \
+    const __fp16 *, Activation, bool
+
 namespace arm_gemm
 {
 // Actual kernel implementations
-void sve_hybrid_fp16_mla_4VLx4(const __fp16 *, int, const __fp16 *, __fp16 *, int, int, int, int, const __fp16 *, Activation, bool);
+void sve_hybrid_fp16_mla_6x4VL( ARGLIST );
 
-class hybrid_fp16_mla_4VLx4
+class cls_sve_hybrid_fp16_mla_6x4VL
 {
 public:
     typedef __fp16 operand_type;
     typedef __fp16 result_type;
 
-    typedef void (*kern_type)(const __fp16 *, int, const __fp16 *, __fp16 *, int, int, int, int, const __fp16 *, Activation, bool);
+    typedef void (*kern_type)( ARGLIST );
 
     /* Kernel blocking parameters */
     static constexpr unsigned int out_height()
     {
-        return 4;
+        return 6;
    }
 
     static unsigned int out_width()
@@ -63,27 +69,17 @@ class hybrid_fp16_mla_4VLx4
         return true;
     }
 
-    static constexpr bool supports_bias()
-    {
-        return true;
-    }
-
-    static constexpr bool supports_activation()
-    {
-        return true;
-    }
-
-    StdTransformsSVE<operand_type, result_type, 4, 4, 1> transforms = {};
+    StdTransformsSVE<operand_type, result_type, 6, 4, 1> transforms = {};
 
     // Default to the generic kernel
-    kern_type kernel=sve_hybrid_fp16_mla_4VLx4;
+    kern_type kernel=sve_hybrid_fp16_mla_6x4VL;
 
-    hybrid_fp16_mla_4VLx4(const CPUInfo *)
+    cls_sve_hybrid_fp16_mla_6x4VL(const CPUInfo *)
     {
-
     }
 };
 
 } // namespace arm_gemm
 
+#undef ARGLIST
 #endif // __ARM_FEATURE_SVE
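The renamed header above replaces the old positional argument list with the ARGLIST indirect interface and raises the block height from 4 to 6 rows. The new generic.cpp that follows implements the same hybrid GEMM scheme: dispatch on M to a per-height block, then accumulate rows of C in z registers with fmla against broadcast A elements. A scalar model of one height-1 inner block, under my own naming (the real kernel operates on whole SVE vectors, so `width` stands for four vector lengths of columns):

    // Scalar model of one 8-deep block of the height-1 multiply loop:
    // acc[j] (z8..z11) += a[k] (lane z0.h[k]) * B panel element (z6/z7 loads).
    static void height1_block_model(const __fp16 *a, const __fp16 *b,
                                    __fp16 *acc, int width)
    {
        for (int k = 0; k < 8; k++) {            // unrolled over z0.h[0..7]
            for (int j = 0; j < width; j++) {    // width = 4 vector lengths
                acc[j] += a[k] * b[k * width + j];
            }
        }
    }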
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef __ARM_FEATURE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> + +namespace arm_gemm { + +void sve_hybrid_fp16_mla_6x4VL ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<__fp16> A_arg, + size_t M, size_t N, const __fp16 *B_ptr, IndirectOutputArg<__fp16> output_arg, + const __fp16 *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + __fp16 maxval = static_cast<__fp16>(std::numeric_limits<float>::infinity()); + __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const __fp16 *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast<__fp16>(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + "ptrue p5.b\n" + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 71f\n" + "cmp %x[M], #0x4\n" + "bgt 57f\n" + "beq 43f\n" + "cmp %x[M], #0x2\n" + "bgt 29f\n" + "beq 15f\n" + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[bias]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #1\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x13, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "mov x19, #0x0\n" + "whilelt p4.h, x19,
x16\n" + "inch x19\n" + "whilelt p3.h, x19, x16\n" + "inch x19\n" + "whilelt p2.h, x19, x16\n" + "inch x19\n" + "whilelt p1.h, x19, x16\n" + "cbz x14, 4f\n" + "ld1h { z8.h }, p5/Z, [x14]\n" + "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n" + "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n" + "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "b 6f\n" + "4:" // Height 1: no bias + "tbz %x[flags], #0, 5f\n" + "ld1h { z8.h }, p4/Z, [x13]\n" + "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n" + "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n" + "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n" + "b 6f\n" + "5:" // Height 1: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "6:" // Height 1: setup done + "mov x12, #0x0\n" + "7:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 8f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "cbnz x12, 9f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "b 9f\n" + "8:" // Height 1: setup direct input + "mov x10, %x[input_ptr]\n" + "9:" // Height 1: input setup done + "cmp x11, #0x8\n" + "ble 11f\n" + "10:" // Height 1: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + "fmla z8.h, z6.h, z0.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + "fmla z9.h, z7.h, z0.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "cmp x11, #0x8\n" + "fmla z10.h, z6.h, z0.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla z11.h, z7.h, z0.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.h, z6.h, z0.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z11.h, z7.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.h, z6.h, z0.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.h, z6.h, 
z0.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[7]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[7]\n" + "fmla z11.h, z7.h, z0.h[7]\n" + "bgt 10b\n" + "11:" // Height 1: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + "fmla z8.h, z6.h, z0.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + "fmla z9.h, z7.h, z0.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[0]\n" + "fmla z11.h, z7.h, z0.h[0]\n" + "ble 12f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z9.h, z7.h, z0.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[1]\n" + "addvl x15, x15, #4\n" + "fmla z11.h, z7.h, z0.h[1]\n" + "ble 12f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z9.h, z7.h, z0.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[2]\n" + "addvl x15, x15, #4\n" + "fmla z11.h, z7.h, z0.h[2]\n" + "ble 12f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z9.h, z7.h, z0.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[3]\n" + "addvl x15, x15, #4\n" + "fmla z11.h, z7.h, z0.h[3]\n" + "ble 12f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z9.h, z7.h, z0.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[4]\n" + "addvl x15, x15, #4\n" + "fmla z11.h, z7.h, z0.h[4]\n" + "ble 12f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z9.h, z7.h, z0.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[5]\n" + "addvl x15, x15, #4\n" + "fmla z11.h, z7.h, z0.h[5]\n" + "ble 12f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z9.h, z7.h, z0.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[6]\n" + "addvl x15, x15, #4\n" + "fmla z11.h, z7.h, z0.h[6]\n" + "ble 12f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[7]\n" + "fmla z11.h, z7.h, z0.h[7]\n" + "12:" // Height 1: Multiply loop: multiply skip + 
"prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 7b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "tbz %x[flags], #1, 13f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rh { z1.h }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rh { z0.h }, p5/Z, [x19]\n" + "fmin z8.h, p5/M, z8.h, z0.h\n" + "fmin z9.h, p5/M, z9.h, z0.h\n" + "fmin z10.h, p5/M, z10.h, z0.h\n" + "fmin z11.h, p5/M, z11.h, z0.h\n" + "fmax z8.h, p5/M, z8.h, z1.h\n" + "fmax z9.h, p5/M, z9.h, z1.h\n" + "fmax z10.h, p5/M, z10.h, z1.h\n" + "fmax z11.h, p5/M, z11.h, z1.h\n" + "13:" // Height 1: No activation + "st1h { z8.h }, p4, [x13]\n" + "st1h { z9.h }, p3, [x13, #1, MUL VL]\n" + "st1h { z10.h }, p2, [x13, #2, MUL VL]\n" + "st1h { z11.h }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "14:" // Height 1: Writeback done + "mov x19, #0x0\n" + "inch x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 3b\n" + "b 86f\n" + "15:" // Height 2 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 16f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #1\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19, LSL #1\n" + "b 17f\n" + "16:" // Height 2: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #1\n" + "17:" // Height 2: Column loop + "mov x19, #0x0\n" + "whilelt p4.h, x19, x16\n" + "inch x19\n" + "whilelt p3.h, x19, x16\n" + "inch x19\n" + "whilelt p2.h, x19, x16\n" + "inch x19\n" + "whilelt p1.h, x19, x16\n" + "cbz x14, 18f\n" + "ld1h { z8.h }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n" + "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n" + "mov z13.d, z9.d\n" + "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "b 20f\n" + "18:" // Height 2: no bias + "tbz %x[flags], #0, 19f\n" + "ld1h { z8.h }, p4/Z, [x13]\n" + "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n" + "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n" + "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n" + "ld1h { z12.h }, p4/Z, [x9]\n" + "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n" + "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n" + "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n" + "b 20f\n" + "19:" // Height 2: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "20:" // Height 2: setup done + "mov x12, #0x0\n" + "21:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 22f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x12, 23f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "b 23f\n" + "22:" // Height 2: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "23:" // Height 2: input setup done + "cmp x11, #0x8\n" + "ble 25f\n" + "24:" // Height 2: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x8\n" + "ld1rqh { z0.h }, p0/Z, 
[x10]\n" + "fmla z8.h, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.h, z7.h, z0.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla z12.h, z6.h, z1.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "cmp x11, #0x8\n" + "fmla z13.h, z7.h, z1.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla z10.h, z6.h, z0.h[0]\n" + "fmla z14.h, z6.h, z1.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[0]\n" + "fmla z15.h, z7.h, z1.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[1]\n" + "fmla z12.h, z6.h, z1.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[1]\n" + "fmla z13.h, z7.h, z1.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.h, z6.h, z0.h[1]\n" + "fmla z14.h, z6.h, z1.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[1]\n" + "fmla z15.h, z7.h, z1.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[2]\n" + "fmla z12.h, z6.h, z1.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[2]\n" + "fmla z13.h, z7.h, z1.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[2]\n" + "fmla z14.h, z6.h, z1.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[2]\n" + "fmla z15.h, z7.h, z1.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[3]\n" + "fmla z12.h, z6.h, z1.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[3]\n" + "fmla z13.h, z7.h, z1.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[3]\n" + "fmla z14.h, z6.h, z1.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z11.h, z7.h, z0.h[3]\n" + "fmla z15.h, z7.h, z1.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[4]\n" + "fmla z12.h, z6.h, z1.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[4]\n" + "fmla z13.h, z7.h, z1.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[4]\n" + "fmla z14.h, z6.h, z1.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[4]\n" + "fmla z15.h, z7.h, z1.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[5]\n" + "fmla z12.h, z6.h, z1.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[5]\n" + "fmla z13.h, z7.h, z1.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.h, z6.h, z0.h[5]\n" + "fmla z14.h, z6.h, z1.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[5]\n" + "fmla z15.h, z7.h, z1.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[6]\n" + "fmla z12.h, z6.h, z1.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[6]\n" + "fmla z13.h, z7.h, z1.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[6]\n" + "fmla z14.h, z6.h, z1.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[6]\n" + "fmla z15.h, z7.h, z1.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[7]\n" + "fmla z12.h, z6.h, z1.h[7]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[7]\n" + "fmla z13.h, z7.h, z1.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[7]\n" + "fmla z14.h, z6.h, 
z1.h[7]\n" + "fmla z11.h, z7.h, z0.h[7]\n" + "fmla z15.h, z7.h, z1.h[7]\n" + "bgt 24b\n" + "25:" // Height 2: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + "fmla z8.h, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.h, z7.h, z0.h[0]\n" + "add x28, x28, #0x10\n" + "fmla z12.h, z6.h, z1.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z13.h, z7.h, z1.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[0]\n" + "fmla z14.h, z6.h, z1.h[0]\n" + "fmla z11.h, z7.h, z0.h[0]\n" + "fmla z15.h, z7.h, z1.h[0]\n" + "ble 26f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[1]\n" + "fmla z13.h, z7.h, z1.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[1]\n" + "fmla z14.h, z6.h, z1.h[1]\n" + "fmla z11.h, z7.h, z0.h[1]\n" + "fmla z15.h, z7.h, z1.h[1]\n" + "ble 26f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[2]\n" + "fmla z13.h, z7.h, z1.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[2]\n" + "fmla z14.h, z6.h, z1.h[2]\n" + "fmla z11.h, z7.h, z0.h[2]\n" + "fmla z15.h, z7.h, z1.h[2]\n" + "ble 26f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[3]\n" + "fmla z13.h, z7.h, z1.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[3]\n" + "fmla z14.h, z6.h, z1.h[3]\n" + "fmla z11.h, z7.h, z0.h[3]\n" + "fmla z15.h, z7.h, z1.h[3]\n" + "ble 26f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[4]\n" + "fmla z13.h, z7.h, z1.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[4]\n" + "fmla z14.h, z6.h, z1.h[4]\n" + "fmla z11.h, z7.h, z0.h[4]\n" + "fmla z15.h, z7.h, z1.h[4]\n" + "ble 26f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[5]\n" + "fmla z13.h, z7.h, z1.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[5]\n" + "fmla z14.h, z6.h, z1.h[5]\n" + "fmla z11.h, z7.h, z0.h[5]\n" + "fmla z15.h, z7.h, z1.h[5]\n" + "ble 26f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[6]\n" + "fmla z13.h, z7.h, z1.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, 
x15, #4\n" + "fmla z10.h, z6.h, z0.h[6]\n" + "fmla z14.h, z6.h, z1.h[6]\n" + "fmla z11.h, z7.h, z0.h[6]\n" + "fmla z15.h, z7.h, z1.h[6]\n" + "ble 26f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z12.h, z6.h, z1.h[7]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[7]\n" + "fmla z13.h, z7.h, z1.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[7]\n" + "fmla z14.h, z6.h, z1.h[7]\n" + "fmla z11.h, z7.h, z0.h[7]\n" + "fmla z15.h, z7.h, z1.h[7]\n" + "26:" // Height 2: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 21b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "tbz %x[flags], #1, 27f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rh { z1.h }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rh { z0.h }, p5/Z, [x19]\n" + "fmin z8.h, p5/M, z8.h, z0.h\n" + "fmin z9.h, p5/M, z9.h, z0.h\n" + "fmin z10.h, p5/M, z10.h, z0.h\n" + "fmin z11.h, p5/M, z11.h, z0.h\n" + "fmin z12.h, p5/M, z12.h, z0.h\n" + "fmax z8.h, p5/M, z8.h, z1.h\n" + "fmax z9.h, p5/M, z9.h, z1.h\n" + "fmax z10.h, p5/M, z10.h, z1.h\n" + "fmax z11.h, p5/M, z11.h, z1.h\n" + "fmax z12.h, p5/M, z12.h, z1.h\n" + "fmin z13.h, p5/M, z13.h, z0.h\n" + "fmin z14.h, p5/M, z14.h, z0.h\n" + "fmin z15.h, p5/M, z15.h, z0.h\n" + "fmax z13.h, p5/M, z13.h, z1.h\n" + "fmax z14.h, p5/M, z14.h, z1.h\n" + "fmax z15.h, p5/M, z15.h, z1.h\n" + "27:" // Height 2: No activation + "st1h { z8.h }, p4, [x13]\n" + "st1h { z9.h }, p3, [x13, #1, MUL VL]\n" + "st1h { z10.h }, p2, [x13, #2, MUL VL]\n" + "st1h { z11.h }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1h { z12.h }, p4, [x9]\n" + "st1h { z13.h }, p3, [x9, #1, MUL VL]\n" + "st1h { z14.h }, p2, [x9, #2, MUL VL]\n" + "st1h { z15.h }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "28:" // Height 2: Writeback done + "mov x19, #0x0\n" + "inch x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 17b\n" + "b 86f\n" + "29:" // Height 3 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 30f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #1\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #1\n" + "add x27, x27, x19, LSL #1\n" + "b 31f\n" + "30:" // Height 3: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #1\n" + "add x27, x9, x19, LSL #1\n" + "31:" // Height 3: Column loop + "mov x19, #0x0\n" + "whilelt p4.h, x19, x16\n" + "inch x19\n" + "whilelt p3.h, x19, x16\n" + "inch x19\n" + "whilelt p2.h, x19, x16\n" + "inch x19\n" + "whilelt p1.h, x19, x16\n" + "cbz x14, 32f\n" + "ld1h { z8.h }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n" + "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n" + "mov z13.d, z9.d\n" + "addvl x14, x14, #4\n" + "mov z17.d, z9.d\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "b 34f\n" + "32:" // Height 3: no bias + "tbz %x[flags], #0, 33f\n" + "ld1h { z8.h }, p4/Z, [x13]\n" + "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n" + "ld1h { z10.h }, p2/Z, [x13, 
#2, MUL VL]\n" + "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n" + "ld1h { z12.h }, p4/Z, [x9]\n" + "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n" + "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n" + "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n" + "ld1h { z16.h }, p4/Z, [x27]\n" + "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n" + "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n" + "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n" + "b 34f\n" + "33:" // Height 3: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "34:" // Height 3: setup done + "mov x12, #0x0\n" + "35:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 36f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x12, 37f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "b 37f\n" + "36:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "37:" // Height 3: input setup done + "cmp x11, #0x8\n" + "ble 39f\n" + "38:" // Height 3: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + "fmla z8.h, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.h, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.h, z6.h, z1.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla z16.h, z6.h, z2.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "cmp x11, #0x8\n" + "fmla z13.h, z7.h, z1.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla z17.h, z7.h, z2.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla z10.h, z6.h, z0.h[0]\n" + "fmla z14.h, z6.h, z1.h[0]\n" + "fmla z18.h, z6.h, z2.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[0]\n" + "fmla z15.h, z7.h, z1.h[0]\n" + "fmla z19.h, z7.h, z2.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[1]\n" + "fmla z12.h, z6.h, z1.h[1]\n" + "fmla z16.h, z6.h, z2.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[1]\n" + "fmla z13.h, z7.h, z1.h[1]\n" + "fmla z17.h, z7.h, z2.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.h, z6.h, z0.h[1]\n" + "fmla z14.h, z6.h, z1.h[1]\n" + "fmla z18.h, z6.h, z2.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[1]\n" + "fmla z15.h, z7.h, z1.h[1]\n" + "fmla z19.h, z7.h, z2.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[2]\n" + "fmla z12.h, z6.h, z1.h[2]\n" + "fmla z16.h, z6.h, z2.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[2]\n" + "fmla z13.h, z7.h, z1.h[2]\n" + "fmla z17.h, z7.h, z2.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[2]\n" + "fmla z14.h, z6.h, z1.h[2]\n" + "fmla z18.h, z6.h, z2.h[2]\n" + 
"ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[2]\n" + "fmla z15.h, z7.h, z1.h[2]\n" + "fmla z19.h, z7.h, z2.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[3]\n" + "fmla z12.h, z6.h, z1.h[3]\n" + "fmla z16.h, z6.h, z2.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[3]\n" + "fmla z13.h, z7.h, z1.h[3]\n" + "fmla z17.h, z7.h, z2.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[3]\n" + "fmla z14.h, z6.h, z1.h[3]\n" + "fmla z18.h, z6.h, z2.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z11.h, z7.h, z0.h[3]\n" + "fmla z15.h, z7.h, z1.h[3]\n" + "fmla z19.h, z7.h, z2.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[4]\n" + "fmla z12.h, z6.h, z1.h[4]\n" + "fmla z16.h, z6.h, z2.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[4]\n" + "fmla z13.h, z7.h, z1.h[4]\n" + "fmla z17.h, z7.h, z2.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[4]\n" + "fmla z14.h, z6.h, z1.h[4]\n" + "fmla z18.h, z6.h, z2.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[4]\n" + "fmla z15.h, z7.h, z1.h[4]\n" + "fmla z19.h, z7.h, z2.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[5]\n" + "fmla z12.h, z6.h, z1.h[5]\n" + "fmla z16.h, z6.h, z2.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[5]\n" + "fmla z13.h, z7.h, z1.h[5]\n" + "fmla z17.h, z7.h, z2.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.h, z6.h, z0.h[5]\n" + "fmla z14.h, z6.h, z1.h[5]\n" + "fmla z18.h, z6.h, z2.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[5]\n" + "fmla z15.h, z7.h, z1.h[5]\n" + "fmla z19.h, z7.h, z2.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[6]\n" + "fmla z12.h, z6.h, z1.h[6]\n" + "fmla z16.h, z6.h, z2.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[6]\n" + "fmla z13.h, z7.h, z1.h[6]\n" + "fmla z17.h, z7.h, z2.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[6]\n" + "fmla z14.h, z6.h, z1.h[6]\n" + "fmla z18.h, z6.h, z2.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[6]\n" + "fmla z15.h, z7.h, z1.h[6]\n" + "fmla z19.h, z7.h, z2.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[7]\n" + "fmla z12.h, z6.h, z1.h[7]\n" + "fmla z16.h, z6.h, z2.h[7]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[7]\n" + "fmla z13.h, z7.h, z1.h[7]\n" + "fmla z17.h, z7.h, z2.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[7]\n" + "fmla z14.h, z6.h, z1.h[7]\n" + "fmla z18.h, z6.h, z2.h[7]\n" + "fmla z11.h, z7.h, z0.h[7]\n" + "fmla z15.h, z7.h, z1.h[7]\n" + "fmla z19.h, z7.h, z2.h[7]\n" + "bgt 38b\n" + "39:" // Height 3: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + "fmla z8.h, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.h, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.h, z6.h, z1.h[0]\n" + "add x26, x26, #0x10\n" + "fmla z13.h, z7.h, z1.h[0]\n" + "fmla z16.h, z6.h, z2.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z17.h, z7.h, z2.h[0]\n" + 
"ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[0]\n" + "fmla z14.h, z6.h, z1.h[0]\n" + "fmla z18.h, z6.h, z2.h[0]\n" + "fmla z11.h, z7.h, z0.h[0]\n" + "fmla z15.h, z7.h, z1.h[0]\n" + "fmla z19.h, z7.h, z2.h[0]\n" + "ble 40f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[1]\n" + "fmla z16.h, z6.h, z2.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[1]\n" + "fmla z13.h, z7.h, z1.h[1]\n" + "fmla z17.h, z7.h, z2.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[1]\n" + "fmla z14.h, z6.h, z1.h[1]\n" + "fmla z18.h, z6.h, z2.h[1]\n" + "fmla z11.h, z7.h, z0.h[1]\n" + "fmla z15.h, z7.h, z1.h[1]\n" + "fmla z19.h, z7.h, z2.h[1]\n" + "ble 40f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[2]\n" + "fmla z16.h, z6.h, z2.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[2]\n" + "fmla z13.h, z7.h, z1.h[2]\n" + "fmla z17.h, z7.h, z2.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[2]\n" + "fmla z14.h, z6.h, z1.h[2]\n" + "fmla z18.h, z6.h, z2.h[2]\n" + "fmla z11.h, z7.h, z0.h[2]\n" + "fmla z15.h, z7.h, z1.h[2]\n" + "fmla z19.h, z7.h, z2.h[2]\n" + "ble 40f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[3]\n" + "fmla z16.h, z6.h, z2.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[3]\n" + "fmla z13.h, z7.h, z1.h[3]\n" + "fmla z17.h, z7.h, z2.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[3]\n" + "fmla z14.h, z6.h, z1.h[3]\n" + "fmla z18.h, z6.h, z2.h[3]\n" + "fmla z11.h, z7.h, z0.h[3]\n" + "fmla z15.h, z7.h, z1.h[3]\n" + "fmla z19.h, z7.h, z2.h[3]\n" + "ble 40f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[4]\n" + "fmla z16.h, z6.h, z2.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[4]\n" + "fmla z13.h, z7.h, z1.h[4]\n" + "fmla z17.h, z7.h, z2.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[4]\n" + "fmla z14.h, z6.h, z1.h[4]\n" + "fmla z18.h, z6.h, z2.h[4]\n" + "fmla z11.h, z7.h, z0.h[4]\n" + "fmla z15.h, z7.h, z1.h[4]\n" + "fmla z19.h, z7.h, z2.h[4]\n" + "ble 40f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[5]\n" + "fmla z16.h, z6.h, z2.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[5]\n" + "fmla z13.h, z7.h, z1.h[5]\n" + "fmla z17.h, z7.h, z2.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[5]\n" + "fmla z14.h, z6.h, z1.h[5]\n" + "fmla z18.h, z6.h, z2.h[5]\n" + "fmla z11.h, z7.h, z0.h[5]\n" + "fmla z15.h, z7.h, z1.h[5]\n" + "fmla z19.h, z7.h, z2.h[5]\n" + "ble 40f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[6]\n" + "fmla z16.h, z6.h, z2.h[6]\n" + 
"ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[6]\n" + "fmla z13.h, z7.h, z1.h[6]\n" + "fmla z17.h, z7.h, z2.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[6]\n" + "fmla z14.h, z6.h, z1.h[6]\n" + "fmla z18.h, z6.h, z2.h[6]\n" + "fmla z11.h, z7.h, z0.h[6]\n" + "fmla z15.h, z7.h, z1.h[6]\n" + "fmla z19.h, z7.h, z2.h[6]\n" + "ble 40f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z12.h, z6.h, z1.h[7]\n" + "fmla z16.h, z6.h, z2.h[7]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[7]\n" + "fmla z13.h, z7.h, z1.h[7]\n" + "fmla z17.h, z7.h, z2.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[7]\n" + "fmla z14.h, z6.h, z1.h[7]\n" + "fmla z18.h, z6.h, z2.h[7]\n" + "fmla z11.h, z7.h, z0.h[7]\n" + "fmla z15.h, z7.h, z1.h[7]\n" + "fmla z19.h, z7.h, z2.h[7]\n" + "40:" // Height 3: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 35b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "tbz %x[flags], #1, 41f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rh { z1.h }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rh { z0.h }, p5/Z, [x19]\n" + "fmin z8.h, p5/M, z8.h, z0.h\n" + "fmin z9.h, p5/M, z9.h, z0.h\n" + "fmin z10.h, p5/M, z10.h, z0.h\n" + "fmin z11.h, p5/M, z11.h, z0.h\n" + "fmin z12.h, p5/M, z12.h, z0.h\n" + "fmax z8.h, p5/M, z8.h, z1.h\n" + "fmax z9.h, p5/M, z9.h, z1.h\n" + "fmax z10.h, p5/M, z10.h, z1.h\n" + "fmax z11.h, p5/M, z11.h, z1.h\n" + "fmax z12.h, p5/M, z12.h, z1.h\n" + "fmin z13.h, p5/M, z13.h, z0.h\n" + "fmin z14.h, p5/M, z14.h, z0.h\n" + "fmin z15.h, p5/M, z15.h, z0.h\n" + "fmin z16.h, p5/M, z16.h, z0.h\n" + "fmax z13.h, p5/M, z13.h, z1.h\n" + "fmax z14.h, p5/M, z14.h, z1.h\n" + "fmax z15.h, p5/M, z15.h, z1.h\n" + "fmax z16.h, p5/M, z16.h, z1.h\n" + "fmin z17.h, p5/M, z17.h, z0.h\n" + "fmin z18.h, p5/M, z18.h, z0.h\n" + "fmin z19.h, p5/M, z19.h, z0.h\n" + "fmax z17.h, p5/M, z17.h, z1.h\n" + "fmax z18.h, p5/M, z18.h, z1.h\n" + "fmax z19.h, p5/M, z19.h, z1.h\n" + "41:" // Height 3: No activation + "st1h { z8.h }, p4, [x13]\n" + "st1h { z9.h }, p3, [x13, #1, MUL VL]\n" + "st1h { z10.h }, p2, [x13, #2, MUL VL]\n" + "st1h { z11.h }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1h { z12.h }, p4, [x9]\n" + "st1h { z13.h }, p3, [x9, #1, MUL VL]\n" + "st1h { z14.h }, p2, [x9, #2, MUL VL]\n" + "st1h { z15.h }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1h { z16.h }, p4, [x27]\n" + "st1h { z17.h }, p3, [x27, #1, MUL VL]\n" + "st1h { z18.h }, p2, [x27, #2, MUL VL]\n" + "st1h { z19.h }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "42:" // Height 3: Writeback done + "mov x19, #0x0\n" + "inch x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 31b\n" + "b 86f\n" + "43:" // Height 4 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 44f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #1\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #1\n" + "ldr x25, [%x[output_ptr], 
#0x18]\n" + "add x27, x27, x19, LSL #1\n" + "add x25, x25, x19, LSL #1\n" + "b 45f\n" + "44:" // Height 4: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #1\n" + "add x27, x9, x19, LSL #1\n" + "add x25, x27, x19, LSL #1\n" + "45:" // Height 4: Column loop + "mov x19, #0x0\n" + "whilelt p4.h, x19, x16\n" + "inch x19\n" + "whilelt p3.h, x19, x16\n" + "inch x19\n" + "whilelt p2.h, x19, x16\n" + "inch x19\n" + "whilelt p1.h, x19, x16\n" + "cbz x14, 46f\n" + "ld1h { z8.h }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n" + "mov z20.d, z8.d\n" + "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "mov z13.d, z9.d\n" + "mov z17.d, z9.d\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "mov z21.d, z9.d\n" + "mov z22.d, z10.d\n" + "mov z23.d, z11.d\n" + "b 48f\n" + "46:" // Height 4: no bias + "tbz %x[flags], #0, 47f\n" + "ld1h { z8.h }, p4/Z, [x13]\n" + "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n" + "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n" + "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n" + "ld1h { z12.h }, p4/Z, [x9]\n" + "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n" + "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n" + "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n" + "ld1h { z16.h }, p4/Z, [x27]\n" + "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n" + "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n" + "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n" + "ld1h { z20.h }, p4/Z, [x25]\n" + "ld1h { z21.h }, p3/Z, [x25, #1, MUL VL]\n" + "ld1h { z22.h }, p2/Z, [x25, #2, MUL VL]\n" + "ld1h { z23.h }, p1/Z, [x25, #3, MUL VL]\n" + "b 48f\n" + "47:" // Height 4: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "48:" // Height 4: setup done + "mov x12, #0x0\n" + "49:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 50f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x12, 51f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "b 51f\n" + "50:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "51:" // Height 4: input setup done + "cmp x11, #0x8\n" + "ble 53f\n" + "52:" // Height 4: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + "fmla z8.h, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.h, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.h, z6.h, z1.h[0]\n" + "ld1rqh { z3.h }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z16.h, z6.h, z2.h[0]\n" + "prfm pldl1keep, 
[x10, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla z13.h, z7.h, z1.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x8\n" + "fmla z20.h, z6.h, z3.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z17.h, z7.h, z2.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla z21.h, z7.h, z3.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla z10.h, z6.h, z0.h[0]\n" + "fmla z14.h, z6.h, z1.h[0]\n" + "fmla z18.h, z6.h, z2.h[0]\n" + "fmla z22.h, z6.h, z3.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[0]\n" + "fmla z15.h, z7.h, z1.h[0]\n" + "fmla z19.h, z7.h, z2.h[0]\n" + "fmla z23.h, z7.h, z3.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[1]\n" + "fmla z12.h, z6.h, z1.h[1]\n" + "fmla z16.h, z6.h, z2.h[1]\n" + "fmla z20.h, z6.h, z3.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[1]\n" + "fmla z13.h, z7.h, z1.h[1]\n" + "fmla z17.h, z7.h, z2.h[1]\n" + "fmla z21.h, z7.h, z3.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.h, z6.h, z0.h[1]\n" + "fmla z14.h, z6.h, z1.h[1]\n" + "fmla z18.h, z6.h, z2.h[1]\n" + "fmla z22.h, z6.h, z3.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[1]\n" + "fmla z15.h, z7.h, z1.h[1]\n" + "fmla z19.h, z7.h, z2.h[1]\n" + "fmla z23.h, z7.h, z3.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[2]\n" + "fmla z12.h, z6.h, z1.h[2]\n" + "fmla z16.h, z6.h, z2.h[2]\n" + "fmla z20.h, z6.h, z3.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[2]\n" + "fmla z13.h, z7.h, z1.h[2]\n" + "fmla z17.h, z7.h, z2.h[2]\n" + "fmla z21.h, z7.h, z3.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[2]\n" + "fmla z14.h, z6.h, z1.h[2]\n" + "fmla z18.h, z6.h, z2.h[2]\n" + "fmla z22.h, z6.h, z3.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[2]\n" + "fmla z15.h, z7.h, z1.h[2]\n" + "fmla z19.h, z7.h, z2.h[2]\n" + "fmla z23.h, z7.h, z3.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[3]\n" + "fmla z12.h, z6.h, z1.h[3]\n" + "fmla z16.h, z6.h, z2.h[3]\n" + "fmla z20.h, z6.h, z3.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[3]\n" + "fmla z13.h, z7.h, z1.h[3]\n" + "fmla z17.h, z7.h, z2.h[3]\n" + "fmla z21.h, z7.h, z3.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[3]\n" + "fmla z14.h, z6.h, z1.h[3]\n" + "fmla z18.h, z6.h, z2.h[3]\n" + "fmla z22.h, z6.h, z3.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z11.h, z7.h, z0.h[3]\n" + "fmla z15.h, z7.h, z1.h[3]\n" + "fmla z19.h, z7.h, z2.h[3]\n" + "fmla z23.h, z7.h, z3.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[4]\n" + "fmla z12.h, z6.h, z1.h[4]\n" + "fmla z16.h, z6.h, z2.h[4]\n" + "fmla z20.h, z6.h, z3.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[4]\n" + "fmla z13.h, z7.h, z1.h[4]\n" + "fmla z17.h, z7.h, z2.h[4]\n" + "fmla z21.h, z7.h, z3.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[4]\n" + "fmla z14.h, z6.h, z1.h[4]\n" + "fmla z18.h, z6.h, z2.h[4]\n" + "fmla z22.h, z6.h, z3.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[4]\n" + "fmla z15.h, z7.h, z1.h[4]\n" + "fmla z19.h, z7.h, z2.h[4]\n" + "fmla z23.h, z7.h, z3.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[5]\n" + "fmla 
z12.h, z6.h, z1.h[5]\n" + "fmla z16.h, z6.h, z2.h[5]\n" + "fmla z20.h, z6.h, z3.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[5]\n" + "fmla z13.h, z7.h, z1.h[5]\n" + "fmla z17.h, z7.h, z2.h[5]\n" + "fmla z21.h, z7.h, z3.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.h, z6.h, z0.h[5]\n" + "fmla z14.h, z6.h, z1.h[5]\n" + "fmla z18.h, z6.h, z2.h[5]\n" + "fmla z22.h, z6.h, z3.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[5]\n" + "fmla z15.h, z7.h, z1.h[5]\n" + "fmla z19.h, z7.h, z2.h[5]\n" + "fmla z23.h, z7.h, z3.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[6]\n" + "fmla z12.h, z6.h, z1.h[6]\n" + "fmla z16.h, z6.h, z2.h[6]\n" + "fmla z20.h, z6.h, z3.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[6]\n" + "fmla z13.h, z7.h, z1.h[6]\n" + "fmla z17.h, z7.h, z2.h[6]\n" + "fmla z21.h, z7.h, z3.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[6]\n" + "fmla z14.h, z6.h, z1.h[6]\n" + "fmla z18.h, z6.h, z2.h[6]\n" + "fmla z22.h, z6.h, z3.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[6]\n" + "fmla z15.h, z7.h, z1.h[6]\n" + "fmla z19.h, z7.h, z2.h[6]\n" + "fmla z23.h, z7.h, z3.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[7]\n" + "fmla z12.h, z6.h, z1.h[7]\n" + "fmla z16.h, z6.h, z2.h[7]\n" + "fmla z20.h, z6.h, z3.h[7]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[7]\n" + "fmla z13.h, z7.h, z1.h[7]\n" + "fmla z17.h, z7.h, z2.h[7]\n" + "fmla z21.h, z7.h, z3.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[7]\n" + "fmla z14.h, z6.h, z1.h[7]\n" + "fmla z18.h, z6.h, z2.h[7]\n" + "fmla z22.h, z6.h, z3.h[7]\n" + "fmla z11.h, z7.h, z0.h[7]\n" + "fmla z15.h, z7.h, z1.h[7]\n" + "fmla z19.h, z7.h, z2.h[7]\n" + "fmla z23.h, z7.h, z3.h[7]\n" + "bgt 52b\n" + "53:" // Height 4: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + "fmla z8.h, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.h, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.h, z6.h, z1.h[0]\n" + "ld1rqh { z3.h }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z16.h, z6.h, z2.h[0]\n" + "add x24, x24, #0x10\n" + "fmla z13.h, z7.h, z1.h[0]\n" + "fmla z17.h, z7.h, z2.h[0]\n" + "fmla z20.h, z6.h, z3.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z21.h, z7.h, z3.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[0]\n" + "fmla z14.h, z6.h, z1.h[0]\n" + "fmla z18.h, z6.h, z2.h[0]\n" + "fmla z22.h, z6.h, z3.h[0]\n" + "fmla z11.h, z7.h, z0.h[0]\n" + "fmla z15.h, z7.h, z1.h[0]\n" + "fmla z19.h, z7.h, z2.h[0]\n" + "fmla z23.h, z7.h, z3.h[0]\n" + "ble 54f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[1]\n" + "fmla z16.h, z6.h, z2.h[1]\n" + "fmla z20.h, z6.h, z3.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[1]\n" + "fmla z13.h, z7.h, z1.h[1]\n" + "fmla z17.h, z7.h, z2.h[1]\n" + "fmla z21.h, z7.h, z3.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, 
z6.h, z0.h[1]\n" + "fmla z14.h, z6.h, z1.h[1]\n" + "fmla z18.h, z6.h, z2.h[1]\n" + "fmla z22.h, z6.h, z3.h[1]\n" + "fmla z11.h, z7.h, z0.h[1]\n" + "fmla z15.h, z7.h, z1.h[1]\n" + "fmla z19.h, z7.h, z2.h[1]\n" + "fmla z23.h, z7.h, z3.h[1]\n" + "ble 54f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[2]\n" + "fmla z16.h, z6.h, z2.h[2]\n" + "fmla z20.h, z6.h, z3.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[2]\n" + "fmla z13.h, z7.h, z1.h[2]\n" + "fmla z17.h, z7.h, z2.h[2]\n" + "fmla z21.h, z7.h, z3.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[2]\n" + "fmla z14.h, z6.h, z1.h[2]\n" + "fmla z18.h, z6.h, z2.h[2]\n" + "fmla z22.h, z6.h, z3.h[2]\n" + "fmla z11.h, z7.h, z0.h[2]\n" + "fmla z15.h, z7.h, z1.h[2]\n" + "fmla z19.h, z7.h, z2.h[2]\n" + "fmla z23.h, z7.h, z3.h[2]\n" + "ble 54f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[3]\n" + "fmla z16.h, z6.h, z2.h[3]\n" + "fmla z20.h, z6.h, z3.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[3]\n" + "fmla z13.h, z7.h, z1.h[3]\n" + "fmla z17.h, z7.h, z2.h[3]\n" + "fmla z21.h, z7.h, z3.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[3]\n" + "fmla z14.h, z6.h, z1.h[3]\n" + "fmla z18.h, z6.h, z2.h[3]\n" + "fmla z22.h, z6.h, z3.h[3]\n" + "fmla z11.h, z7.h, z0.h[3]\n" + "fmla z15.h, z7.h, z1.h[3]\n" + "fmla z19.h, z7.h, z2.h[3]\n" + "fmla z23.h, z7.h, z3.h[3]\n" + "ble 54f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[4]\n" + "fmla z16.h, z6.h, z2.h[4]\n" + "fmla z20.h, z6.h, z3.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[4]\n" + "fmla z13.h, z7.h, z1.h[4]\n" + "fmla z17.h, z7.h, z2.h[4]\n" + "fmla z21.h, z7.h, z3.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[4]\n" + "fmla z14.h, z6.h, z1.h[4]\n" + "fmla z18.h, z6.h, z2.h[4]\n" + "fmla z22.h, z6.h, z3.h[4]\n" + "fmla z11.h, z7.h, z0.h[4]\n" + "fmla z15.h, z7.h, z1.h[4]\n" + "fmla z19.h, z7.h, z2.h[4]\n" + "fmla z23.h, z7.h, z3.h[4]\n" + "ble 54f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[5]\n" + "fmla z16.h, z6.h, z2.h[5]\n" + "fmla z20.h, z6.h, z3.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[5]\n" + "fmla z13.h, z7.h, z1.h[5]\n" + "fmla z17.h, z7.h, z2.h[5]\n" + "fmla z21.h, z7.h, z3.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[5]\n" + "fmla z14.h, z6.h, z1.h[5]\n" + "fmla z18.h, z6.h, z2.h[5]\n" + "fmla z22.h, z6.h, z3.h[5]\n" + "fmla z11.h, z7.h, z0.h[5]\n" + "fmla z15.h, z7.h, z1.h[5]\n" + "fmla z19.h, z7.h, z2.h[5]\n" + "fmla z23.h, z7.h, z3.h[5]\n" + "ble 54f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[6]\n" + "fmla z16.h, z6.h, z2.h[6]\n" + "fmla z20.h, z6.h, z3.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[6]\n" + "fmla z13.h, z7.h, 
z1.h[6]\n" + "fmla z17.h, z7.h, z2.h[6]\n" + "fmla z21.h, z7.h, z3.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[6]\n" + "fmla z14.h, z6.h, z1.h[6]\n" + "fmla z18.h, z6.h, z2.h[6]\n" + "fmla z22.h, z6.h, z3.h[6]\n" + "fmla z11.h, z7.h, z0.h[6]\n" + "fmla z15.h, z7.h, z1.h[6]\n" + "fmla z19.h, z7.h, z2.h[6]\n" + "fmla z23.h, z7.h, z3.h[6]\n" + "ble 54f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z12.h, z6.h, z1.h[7]\n" + "fmla z16.h, z6.h, z2.h[7]\n" + "fmla z20.h, z6.h, z3.h[7]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[7]\n" + "fmla z13.h, z7.h, z1.h[7]\n" + "fmla z17.h, z7.h, z2.h[7]\n" + "fmla z21.h, z7.h, z3.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[7]\n" + "fmla z14.h, z6.h, z1.h[7]\n" + "fmla z18.h, z6.h, z2.h[7]\n" + "fmla z22.h, z6.h, z3.h[7]\n" + "fmla z11.h, z7.h, z0.h[7]\n" + "fmla z15.h, z7.h, z1.h[7]\n" + "fmla z19.h, z7.h, z2.h[7]\n" + "fmla z23.h, z7.h, z3.h[7]\n" + "54:" // Height 4: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 49b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbz %x[flags], #1, 55f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rh { z1.h }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rh { z0.h }, p5/Z, [x19]\n" + "fmin z8.h, p5/M, z8.h, z0.h\n" + "fmin z9.h, p5/M, z9.h, z0.h\n" + "fmin z10.h, p5/M, z10.h, z0.h\n" + "fmin z11.h, p5/M, z11.h, z0.h\n" + "fmin z12.h, p5/M, z12.h, z0.h\n" + "fmax z8.h, p5/M, z8.h, z1.h\n" + "fmax z9.h, p5/M, z9.h, z1.h\n" + "fmax z10.h, p5/M, z10.h, z1.h\n" + "fmax z11.h, p5/M, z11.h, z1.h\n" + "fmax z12.h, p5/M, z12.h, z1.h\n" + "fmin z13.h, p5/M, z13.h, z0.h\n" + "fmin z14.h, p5/M, z14.h, z0.h\n" + "fmin z15.h, p5/M, z15.h, z0.h\n" + "fmin z16.h, p5/M, z16.h, z0.h\n" + "fmax z13.h, p5/M, z13.h, z1.h\n" + "fmax z14.h, p5/M, z14.h, z1.h\n" + "fmax z15.h, p5/M, z15.h, z1.h\n" + "fmax z16.h, p5/M, z16.h, z1.h\n" + "fmin z17.h, p5/M, z17.h, z0.h\n" + "fmin z18.h, p5/M, z18.h, z0.h\n" + "fmin z19.h, p5/M, z19.h, z0.h\n" + "fmin z20.h, p5/M, z20.h, z0.h\n" + "fmax z17.h, p5/M, z17.h, z1.h\n" + "fmax z18.h, p5/M, z18.h, z1.h\n" + "fmax z19.h, p5/M, z19.h, z1.h\n" + "fmax z20.h, p5/M, z20.h, z1.h\n" + "fmin z21.h, p5/M, z21.h, z0.h\n" + "fmin z22.h, p5/M, z22.h, z0.h\n" + "fmin z23.h, p5/M, z23.h, z0.h\n" + "fmax z21.h, p5/M, z21.h, z1.h\n" + "fmax z22.h, p5/M, z22.h, z1.h\n" + "fmax z23.h, p5/M, z23.h, z1.h\n" + "55:" // Height 4: No activation + "st1h { z8.h }, p4, [x13]\n" + "st1h { z9.h }, p3, [x13, #1, MUL VL]\n" + "st1h { z10.h }, p2, [x13, #2, MUL VL]\n" + "st1h { z11.h }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1h { z12.h }, p4, [x9]\n" + "st1h { z13.h }, p3, [x9, #1, MUL VL]\n" + "st1h { z14.h }, p2, [x9, #2, MUL VL]\n" + "st1h { z15.h }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1h { z16.h }, p4, [x27]\n" + "st1h { z17.h }, p3, [x27, #1, MUL VL]\n" + "st1h { z18.h }, p2, [x27, #2, MUL VL]\n" + "st1h { z19.h }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1h { z20.h }, p4, [x25]\n" + "st1h { z21.h }, p3, [x25, #1, MUL VL]\n" + 
"st1h { z22.h }, p2, [x25, #2, MUL VL]\n" + "st1h { z23.h }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "56:" // Height 4: Writeback done + "mov x19, #0x0\n" + "inch x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 45b\n" + "b 86f\n" + "57:" // Height 5 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 58f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #1\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #1\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #1\n" + "add x25, x25, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" + "b 59f\n" + "58:" // Height 5: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #1\n" + "add x27, x9, x19, LSL #1\n" + "add x25, x27, x19, LSL #1\n" + "add x23, x25, x19, LSL #1\n" + "59:" // Height 5: Column loop + "mov x19, #0x0\n" + "whilelt p4.h, x19, x16\n" + "inch x19\n" + "whilelt p3.h, x19, x16\n" + "inch x19\n" + "whilelt p2.h, x19, x16\n" + "inch x19\n" + "whilelt p1.h, x19, x16\n" + "cbz x14, 60f\n" + "ld1h { z8.h }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n" + "mov z20.d, z8.d\n" + "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "mov z13.d, z9.d\n" + "mov z17.d, z9.d\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "mov z21.d, z9.d\n" + "mov z22.d, z10.d\n" + "mov z23.d, z11.d\n" + "mov z24.d, z8.d\n" + "mov z25.d, z9.d\n" + "mov z26.d, z10.d\n" + "mov z27.d, z11.d\n" + "b 62f\n" + "60:" // Height 5: no bias + "tbz %x[flags], #0, 61f\n" + "ld1h { z8.h }, p4/Z, [x13]\n" + "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n" + "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n" + "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n" + "ld1h { z12.h }, p4/Z, [x9]\n" + "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n" + "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n" + "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n" + "ld1h { z16.h }, p4/Z, [x27]\n" + "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n" + "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n" + "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n" + "ld1h { z20.h }, p4/Z, [x25]\n" + "ld1h { z21.h }, p3/Z, [x25, #1, MUL VL]\n" + "ld1h { z22.h }, p2/Z, [x25, #2, MUL VL]\n" + "ld1h { z23.h }, p1/Z, [x25, #3, MUL VL]\n" + "ld1h { z24.h }, p4/Z, [x23]\n" + "ld1h { z25.h }, p3/Z, [x23, #1, MUL VL]\n" + "ld1h { z26.h }, p2/Z, [x23, #2, MUL VL]\n" + "ld1h { z27.h }, p1/Z, [x23, #3, MUL VL]\n" + "b 62f\n" + "61:" // Height 5: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "62:" // Height 5: setup done + "mov x12, #0x0\n" + "63:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 64f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, 
x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x12, 65f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "b 65f\n" + "64:" // Height 5: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "add x22, x24, x19, LSL #1\n" + "65:" // Height 5: input setup done + "cmp x11, #0x8\n" + "ble 67f\n" + "66:" // Height 5: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + "fmla z8.h, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.h, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.h, z6.h, z1.h[0]\n" + "ld1rqh { z3.h }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z16.h, z6.h, z2.h[0]\n" + "ld1rqh { z4.h }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "fmla z13.h, z7.h, z1.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x22, x22, #0x10\n" + "fmla z20.h, z6.h, z3.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x8\n" + "fmla z24.h, z6.h, z4.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z17.h, z7.h, z2.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla z21.h, z7.h, z3.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla z25.h, z7.h, z4.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla z14.h, z6.h, z1.h[0]\n" + "fmla z18.h, z6.h, z2.h[0]\n" + "fmla z22.h, z6.h, z3.h[0]\n" + "fmla z26.h, z6.h, z4.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[0]\n" + "fmla z15.h, z7.h, z1.h[0]\n" + "fmla z19.h, z7.h, z2.h[0]\n" + "fmla z23.h, z7.h, z3.h[0]\n" + "fmla z27.h, z7.h, z4.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[1]\n" + "fmla z12.h, z6.h, z1.h[1]\n" + "fmla z16.h, z6.h, z2.h[1]\n" + "fmla z20.h, z6.h, z3.h[1]\n" + "fmla z24.h, z6.h, z4.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[1]\n" + "fmla z13.h, z7.h, z1.h[1]\n" + "fmla z17.h, z7.h, z2.h[1]\n" + "fmla z21.h, z7.h, z3.h[1]\n" + "fmla z25.h, z7.h, z4.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.h, z6.h, z0.h[1]\n" + "fmla z14.h, z6.h, z1.h[1]\n" + "fmla z18.h, z6.h, z2.h[1]\n" + "fmla z22.h, z6.h, z3.h[1]\n" + "fmla z26.h, z6.h, z4.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[1]\n" + "fmla z15.h, z7.h, z1.h[1]\n" + "fmla z19.h, z7.h, z2.h[1]\n" + "fmla z23.h, z7.h, z3.h[1]\n" + "fmla z27.h, z7.h, z4.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[2]\n" + "fmla z12.h, z6.h, z1.h[2]\n" + "fmla z16.h, z6.h, z2.h[2]\n" + "fmla z20.h, z6.h, z3.h[2]\n" + "fmla z24.h, z6.h, z4.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[2]\n" + "fmla z13.h, z7.h, z1.h[2]\n" + "fmla z17.h, z7.h, z2.h[2]\n" + "fmla z21.h, z7.h, z3.h[2]\n" + "fmla z25.h, z7.h, z4.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[2]\n" + "fmla z14.h, z6.h, z1.h[2]\n" + "fmla z18.h, z6.h, z2.h[2]\n" + "fmla z22.h, z6.h, 
z3.h[2]\n" + "fmla z26.h, z6.h, z4.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[2]\n" + "fmla z15.h, z7.h, z1.h[2]\n" + "fmla z19.h, z7.h, z2.h[2]\n" + "fmla z23.h, z7.h, z3.h[2]\n" + "fmla z27.h, z7.h, z4.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[3]\n" + "fmla z12.h, z6.h, z1.h[3]\n" + "fmla z16.h, z6.h, z2.h[3]\n" + "fmla z20.h, z6.h, z3.h[3]\n" + "fmla z24.h, z6.h, z4.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[3]\n" + "fmla z13.h, z7.h, z1.h[3]\n" + "fmla z17.h, z7.h, z2.h[3]\n" + "fmla z21.h, z7.h, z3.h[3]\n" + "fmla z25.h, z7.h, z4.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[3]\n" + "fmla z14.h, z6.h, z1.h[3]\n" + "fmla z18.h, z6.h, z2.h[3]\n" + "fmla z22.h, z6.h, z3.h[3]\n" + "fmla z26.h, z6.h, z4.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z11.h, z7.h, z0.h[3]\n" + "fmla z15.h, z7.h, z1.h[3]\n" + "fmla z19.h, z7.h, z2.h[3]\n" + "fmla z23.h, z7.h, z3.h[3]\n" + "fmla z27.h, z7.h, z4.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[4]\n" + "fmla z12.h, z6.h, z1.h[4]\n" + "fmla z16.h, z6.h, z2.h[4]\n" + "fmla z20.h, z6.h, z3.h[4]\n" + "fmla z24.h, z6.h, z4.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[4]\n" + "fmla z13.h, z7.h, z1.h[4]\n" + "fmla z17.h, z7.h, z2.h[4]\n" + "fmla z21.h, z7.h, z3.h[4]\n" + "fmla z25.h, z7.h, z4.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[4]\n" + "fmla z14.h, z6.h, z1.h[4]\n" + "fmla z18.h, z6.h, z2.h[4]\n" + "fmla z22.h, z6.h, z3.h[4]\n" + "fmla z26.h, z6.h, z4.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[4]\n" + "fmla z15.h, z7.h, z1.h[4]\n" + "fmla z19.h, z7.h, z2.h[4]\n" + "fmla z23.h, z7.h, z3.h[4]\n" + "fmla z27.h, z7.h, z4.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[5]\n" + "fmla z12.h, z6.h, z1.h[5]\n" + "fmla z16.h, z6.h, z2.h[5]\n" + "fmla z20.h, z6.h, z3.h[5]\n" + "fmla z24.h, z6.h, z4.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[5]\n" + "fmla z13.h, z7.h, z1.h[5]\n" + "fmla z17.h, z7.h, z2.h[5]\n" + "fmla z21.h, z7.h, z3.h[5]\n" + "fmla z25.h, z7.h, z4.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.h, z6.h, z0.h[5]\n" + "fmla z14.h, z6.h, z1.h[5]\n" + "fmla z18.h, z6.h, z2.h[5]\n" + "fmla z22.h, z6.h, z3.h[5]\n" + "fmla z26.h, z6.h, z4.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[5]\n" + "fmla z15.h, z7.h, z1.h[5]\n" + "fmla z19.h, z7.h, z2.h[5]\n" + "fmla z23.h, z7.h, z3.h[5]\n" + "fmla z27.h, z7.h, z4.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[6]\n" + "fmla z12.h, z6.h, z1.h[6]\n" + "fmla z16.h, z6.h, z2.h[6]\n" + "fmla z20.h, z6.h, z3.h[6]\n" + "fmla z24.h, z6.h, z4.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[6]\n" + "fmla z13.h, z7.h, z1.h[6]\n" + "fmla z17.h, z7.h, z2.h[6]\n" + "fmla z21.h, z7.h, z3.h[6]\n" + "fmla z25.h, z7.h, z4.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[6]\n" + "fmla z14.h, z6.h, z1.h[6]\n" + "fmla z18.h, z6.h, z2.h[6]\n" + "fmla z22.h, z6.h, z3.h[6]\n" + "fmla z26.h, z6.h, z4.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[6]\n" + "fmla z15.h, z7.h, z1.h[6]\n" + "fmla z19.h, z7.h, z2.h[6]\n" + "fmla z23.h, z7.h, z3.h[6]\n" + "fmla z27.h, z7.h, z4.h[6]\n" + "ld1h { 
z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[7]\n" + "fmla z12.h, z6.h, z1.h[7]\n" + "fmla z16.h, z6.h, z2.h[7]\n" + "fmla z20.h, z6.h, z3.h[7]\n" + "fmla z24.h, z6.h, z4.h[7]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[7]\n" + "fmla z13.h, z7.h, z1.h[7]\n" + "fmla z17.h, z7.h, z2.h[7]\n" + "fmla z21.h, z7.h, z3.h[7]\n" + "fmla z25.h, z7.h, z4.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[7]\n" + "fmla z14.h, z6.h, z1.h[7]\n" + "fmla z18.h, z6.h, z2.h[7]\n" + "fmla z22.h, z6.h, z3.h[7]\n" + "fmla z26.h, z6.h, z4.h[7]\n" + "fmla z11.h, z7.h, z0.h[7]\n" + "fmla z15.h, z7.h, z1.h[7]\n" + "fmla z19.h, z7.h, z2.h[7]\n" + "fmla z23.h, z7.h, z3.h[7]\n" + "fmla z27.h, z7.h, z4.h[7]\n" + "bgt 66b\n" + "67:" // Height 5: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + "fmla z8.h, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.h, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.h, z6.h, z1.h[0]\n" + "ld1rqh { z3.h }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z16.h, z6.h, z2.h[0]\n" + "ld1rqh { z4.h }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "fmla z13.h, z7.h, z1.h[0]\n" + "add x22, x22, #0x10\n" + "fmla z17.h, z7.h, z2.h[0]\n" + "fmla z20.h, z6.h, z3.h[0]\n" + "fmla z24.h, z6.h, z4.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z21.h, z7.h, z3.h[0]\n" + "fmla z25.h, z7.h, z4.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[0]\n" + "fmla z14.h, z6.h, z1.h[0]\n" + "fmla z18.h, z6.h, z2.h[0]\n" + "fmla z22.h, z6.h, z3.h[0]\n" + "fmla z26.h, z6.h, z4.h[0]\n" + "fmla z11.h, z7.h, z0.h[0]\n" + "fmla z15.h, z7.h, z1.h[0]\n" + "fmla z19.h, z7.h, z2.h[0]\n" + "fmla z23.h, z7.h, z3.h[0]\n" + "fmla z27.h, z7.h, z4.h[0]\n" + "ble 68f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[1]\n" + "fmla z16.h, z6.h, z2.h[1]\n" + "fmla z20.h, z6.h, z3.h[1]\n" + "fmla z24.h, z6.h, z4.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[1]\n" + "fmla z13.h, z7.h, z1.h[1]\n" + "fmla z17.h, z7.h, z2.h[1]\n" + "fmla z21.h, z7.h, z3.h[1]\n" + "fmla z25.h, z7.h, z4.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[1]\n" + "fmla z14.h, z6.h, z1.h[1]\n" + "fmla z18.h, z6.h, z2.h[1]\n" + "fmla z22.h, z6.h, z3.h[1]\n" + "fmla z26.h, z6.h, z4.h[1]\n" + "fmla z11.h, z7.h, z0.h[1]\n" + "fmla z15.h, z7.h, z1.h[1]\n" + "fmla z19.h, z7.h, z2.h[1]\n" + "fmla z23.h, z7.h, z3.h[1]\n" + "fmla z27.h, z7.h, z4.h[1]\n" + "ble 68f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[2]\n" + "fmla z16.h, z6.h, z2.h[2]\n" + "fmla z20.h, z6.h, z3.h[2]\n" + "fmla z24.h, z6.h, z4.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[2]\n" + "fmla z13.h, z7.h, z1.h[2]\n" + "fmla z17.h, z7.h, z2.h[2]\n" + "fmla z21.h, z7.h, z3.h[2]\n" + "fmla z25.h, z7.h, z4.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[2]\n" + "fmla z14.h, z6.h, z1.h[2]\n" + "fmla z18.h, z6.h, z2.h[2]\n" + "fmla 
z22.h, z6.h, z3.h[2]\n" + "fmla z26.h, z6.h, z4.h[2]\n" + "fmla z11.h, z7.h, z0.h[2]\n" + "fmla z15.h, z7.h, z1.h[2]\n" + "fmla z19.h, z7.h, z2.h[2]\n" + "fmla z23.h, z7.h, z3.h[2]\n" + "fmla z27.h, z7.h, z4.h[2]\n" + "ble 68f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[3]\n" + "fmla z16.h, z6.h, z2.h[3]\n" + "fmla z20.h, z6.h, z3.h[3]\n" + "fmla z24.h, z6.h, z4.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[3]\n" + "fmla z13.h, z7.h, z1.h[3]\n" + "fmla z17.h, z7.h, z2.h[3]\n" + "fmla z21.h, z7.h, z3.h[3]\n" + "fmla z25.h, z7.h, z4.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[3]\n" + "fmla z14.h, z6.h, z1.h[3]\n" + "fmla z18.h, z6.h, z2.h[3]\n" + "fmla z22.h, z6.h, z3.h[3]\n" + "fmla z26.h, z6.h, z4.h[3]\n" + "fmla z11.h, z7.h, z0.h[3]\n" + "fmla z15.h, z7.h, z1.h[3]\n" + "fmla z19.h, z7.h, z2.h[3]\n" + "fmla z23.h, z7.h, z3.h[3]\n" + "fmla z27.h, z7.h, z4.h[3]\n" + "ble 68f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[4]\n" + "fmla z16.h, z6.h, z2.h[4]\n" + "fmla z20.h, z6.h, z3.h[4]\n" + "fmla z24.h, z6.h, z4.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[4]\n" + "fmla z13.h, z7.h, z1.h[4]\n" + "fmla z17.h, z7.h, z2.h[4]\n" + "fmla z21.h, z7.h, z3.h[4]\n" + "fmla z25.h, z7.h, z4.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[4]\n" + "fmla z14.h, z6.h, z1.h[4]\n" + "fmla z18.h, z6.h, z2.h[4]\n" + "fmla z22.h, z6.h, z3.h[4]\n" + "fmla z26.h, z6.h, z4.h[4]\n" + "fmla z11.h, z7.h, z0.h[4]\n" + "fmla z15.h, z7.h, z1.h[4]\n" + "fmla z19.h, z7.h, z2.h[4]\n" + "fmla z23.h, z7.h, z3.h[4]\n" + "fmla z27.h, z7.h, z4.h[4]\n" + "ble 68f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[5]\n" + "fmla z16.h, z6.h, z2.h[5]\n" + "fmla z20.h, z6.h, z3.h[5]\n" + "fmla z24.h, z6.h, z4.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[5]\n" + "fmla z13.h, z7.h, z1.h[5]\n" + "fmla z17.h, z7.h, z2.h[5]\n" + "fmla z21.h, z7.h, z3.h[5]\n" + "fmla z25.h, z7.h, z4.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[5]\n" + "fmla z14.h, z6.h, z1.h[5]\n" + "fmla z18.h, z6.h, z2.h[5]\n" + "fmla z22.h, z6.h, z3.h[5]\n" + "fmla z26.h, z6.h, z4.h[5]\n" + "fmla z11.h, z7.h, z0.h[5]\n" + "fmla z15.h, z7.h, z1.h[5]\n" + "fmla z19.h, z7.h, z2.h[5]\n" + "fmla z23.h, z7.h, z3.h[5]\n" + "fmla z27.h, z7.h, z4.h[5]\n" + "ble 68f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[6]\n" + "fmla z16.h, z6.h, z2.h[6]\n" + "fmla z20.h, z6.h, z3.h[6]\n" + "fmla z24.h, z6.h, z4.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[6]\n" + "fmla z13.h, z7.h, z1.h[6]\n" + "fmla z17.h, z7.h, z2.h[6]\n" + "fmla z21.h, z7.h, z3.h[6]\n" + "fmla z25.h, z7.h, z4.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[6]\n" + "fmla z14.h, z6.h, z1.h[6]\n" + "fmla z18.h, z6.h, z2.h[6]\n" + "fmla z22.h, z6.h, z3.h[6]\n" + "fmla z26.h, z6.h, z4.h[6]\n" + "fmla z11.h, 
z7.h, z0.h[6]\n" + "fmla z15.h, z7.h, z1.h[6]\n" + "fmla z19.h, z7.h, z2.h[6]\n" + "fmla z23.h, z7.h, z3.h[6]\n" + "fmla z27.h, z7.h, z4.h[6]\n" + "ble 68f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z12.h, z6.h, z1.h[7]\n" + "fmla z16.h, z6.h, z2.h[7]\n" + "fmla z20.h, z6.h, z3.h[7]\n" + "fmla z24.h, z6.h, z4.h[7]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[7]\n" + "fmla z13.h, z7.h, z1.h[7]\n" + "fmla z17.h, z7.h, z2.h[7]\n" + "fmla z21.h, z7.h, z3.h[7]\n" + "fmla z25.h, z7.h, z4.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[7]\n" + "fmla z14.h, z6.h, z1.h[7]\n" + "fmla z18.h, z6.h, z2.h[7]\n" + "fmla z22.h, z6.h, z3.h[7]\n" + "fmla z26.h, z6.h, z4.h[7]\n" + "fmla z11.h, z7.h, z0.h[7]\n" + "fmla z15.h, z7.h, z1.h[7]\n" + "fmla z19.h, z7.h, z2.h[7]\n" + "fmla z23.h, z7.h, z3.h[7]\n" + "fmla z27.h, z7.h, z4.h[7]\n" + "68:" // Height 5: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 63b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 69f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rh { z1.h }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rh { z0.h }, p5/Z, [x19]\n" + "fmin z8.h, p5/M, z8.h, z0.h\n" + "fmin z9.h, p5/M, z9.h, z0.h\n" + "fmin z10.h, p5/M, z10.h, z0.h\n" + "fmin z11.h, p5/M, z11.h, z0.h\n" + "fmin z12.h, p5/M, z12.h, z0.h\n" + "fmax z8.h, p5/M, z8.h, z1.h\n" + "fmax z9.h, p5/M, z9.h, z1.h\n" + "fmax z10.h, p5/M, z10.h, z1.h\n" + "fmax z11.h, p5/M, z11.h, z1.h\n" + "fmax z12.h, p5/M, z12.h, z1.h\n" + "fmin z13.h, p5/M, z13.h, z0.h\n" + "fmin z14.h, p5/M, z14.h, z0.h\n" + "fmin z15.h, p5/M, z15.h, z0.h\n" + "fmin z16.h, p5/M, z16.h, z0.h\n" + "fmax z13.h, p5/M, z13.h, z1.h\n" + "fmax z14.h, p5/M, z14.h, z1.h\n" + "fmax z15.h, p5/M, z15.h, z1.h\n" + "fmax z16.h, p5/M, z16.h, z1.h\n" + "fmin z17.h, p5/M, z17.h, z0.h\n" + "fmin z18.h, p5/M, z18.h, z0.h\n" + "fmin z19.h, p5/M, z19.h, z0.h\n" + "fmin z20.h, p5/M, z20.h, z0.h\n" + "fmax z17.h, p5/M, z17.h, z1.h\n" + "fmax z18.h, p5/M, z18.h, z1.h\n" + "fmax z19.h, p5/M, z19.h, z1.h\n" + "fmax z20.h, p5/M, z20.h, z1.h\n" + "fmin z21.h, p5/M, z21.h, z0.h\n" + "fmin z22.h, p5/M, z22.h, z0.h\n" + "fmin z23.h, p5/M, z23.h, z0.h\n" + "fmin z24.h, p5/M, z24.h, z0.h\n" + "fmax z21.h, p5/M, z21.h, z1.h\n" + "fmax z22.h, p5/M, z22.h, z1.h\n" + "fmax z23.h, p5/M, z23.h, z1.h\n" + "fmax z24.h, p5/M, z24.h, z1.h\n" + "fmin z25.h, p5/M, z25.h, z0.h\n" + "fmin z26.h, p5/M, z26.h, z0.h\n" + "fmin z27.h, p5/M, z27.h, z0.h\n" + "fmax z25.h, p5/M, z25.h, z1.h\n" + "fmax z26.h, p5/M, z26.h, z1.h\n" + "fmax z27.h, p5/M, z27.h, z1.h\n" + "69:" // Height 5: No activation + "st1h { z8.h }, p4, [x13]\n" + "st1h { z9.h }, p3, [x13, #1, MUL VL]\n" + "st1h { z10.h }, p2, [x13, #2, MUL VL]\n" + "st1h { z11.h }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1h { z12.h }, p4, [x9]\n" + "st1h { z13.h }, p3, [x9, #1, MUL VL]\n" + "st1h { z14.h }, p2, [x9, #2, MUL VL]\n" + "st1h { z15.h }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1h { z16.h }, p4, [x27]\n" + 
"st1h { z17.h }, p3, [x27, #1, MUL VL]\n" + "st1h { z18.h }, p2, [x27, #2, MUL VL]\n" + "st1h { z19.h }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1h { z20.h }, p4, [x25]\n" + "st1h { z21.h }, p3, [x25, #1, MUL VL]\n" + "st1h { z22.h }, p2, [x25, #2, MUL VL]\n" + "st1h { z23.h }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "st1h { z24.h }, p4, [x23]\n" + "st1h { z25.h }, p3, [x23, #1, MUL VL]\n" + "st1h { z26.h }, p2, [x23, #2, MUL VL]\n" + "st1h { z27.h }, p1, [x23, #3, MUL VL]\n" + "addvl x23, x23, #4\n" + "70:" // Height 5: Writeback done + "mov x19, #0x0\n" + "inch x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 59b\n" + "b 86f\n" + "71:" // Height 6 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 72f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #1\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #1\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #1\n" + "ldr x21, [%x[output_ptr], #0x28]\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "add x25, x25, x19, LSL #1\n" + "add x23, x23, x19, LSL #1\n" + "add x21, x21, x19, LSL #1\n" + "b 73f\n" + "72:" // Height 6: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #1\n" + "add x27, x9, x19, LSL #1\n" + "add x25, x27, x19, LSL #1\n" + "add x23, x25, x19, LSL #1\n" + "add x21, x23, x19, LSL #1\n" + "add %x[output_ptr], x21, x19, LSL #1\n" + "73:" // Height 6: Column loop + "mov x19, #0x0\n" + "whilelt p4.h, x19, x16\n" + "inch x19\n" + "whilelt p3.h, x19, x16\n" + "inch x19\n" + "whilelt p2.h, x19, x16\n" + "inch x19\n" + "whilelt p1.h, x19, x16\n" + "cbz x14, 74f\n" + "ld1h { z8.h }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n" + "mov z20.d, z8.d\n" + "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "mov z13.d, z9.d\n" + "mov z17.d, z9.d\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "mov z21.d, z9.d\n" + "mov z22.d, z10.d\n" + "mov z23.d, z11.d\n" + "mov z24.d, z8.d\n" + "mov z25.d, z9.d\n" + "mov z26.d, z10.d\n" + "mov z27.d, z11.d\n" + "mov z28.d, z8.d\n" + "mov z29.d, z9.d\n" + "mov z30.d, z10.d\n" + "mov z31.d, z11.d\n" + "b 76f\n" + "74:" // Height 6: no bias + "tbz %x[flags], #0, 75f\n" + "ld1h { z8.h }, p4/Z, [x13]\n" + "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n" + "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n" + "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n" + "ld1h { z12.h }, p4/Z, [x9]\n" + "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n" + "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n" + "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n" + "ld1h { z16.h }, p4/Z, [x27]\n" + "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n" + "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n" + "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n" + "ld1h { z20.h }, p4/Z, [x25]\n" + "ld1h { z21.h }, p3/Z, [x25, #1, MUL VL]\n" + "ld1h { z22.h }, p2/Z, [x25, #2, MUL VL]\n" + "ld1h { z23.h }, p1/Z, [x25, #3, MUL VL]\n" + "ld1h { z24.h }, p4/Z, [x23]\n" + "ld1h { z25.h }, p3/Z, [x23, #1, MUL VL]\n" + "ld1h { z26.h }, p2/Z, [x23, #2, MUL VL]\n" + "ld1h { z27.h }, p1/Z, [x23, #3, MUL VL]\n" + "ld1h { z28.h }, p4/Z, [x21]\n" + "ld1h { z29.h }, p3/Z, [x21, #1, MUL VL]\n" + "ld1h { z30.h }, p2/Z, 
[x21, #2, MUL VL]\n" + "ld1h { z31.h }, p1/Z, [x21, #3, MUL VL]\n" + "b 76f\n" + "75:" // Height 6: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "mov z31.b, #0x0\n" + "76:" // Height 6: setup done + "mov x12, #0x0\n" + "77:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 78f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x12, 79f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #1\n" + "add x28, x28, x19, LSL #1\n" + "add x26, x26, x19, LSL #1\n" + "add x24, x24, x19, LSL #1\n" + "add x22, x22, x19, LSL #1\n" + "add x20, x20, x19, LSL #1\n" + "b 79f\n" + "78:" // Height 6: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #1\n" + "add x26, x28, x19, LSL #1\n" + "add x24, x26, x19, LSL #1\n" + "add x22, x24, x19, LSL #1\n" + "add x20, x22, x19, LSL #1\n" + "79:" // Height 6: input setup done + "cmp x11, #0x8\n" + "ble 81f\n" + "80:" // Height 6: Multiply loop: Main loop head + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x8\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + "fmla z8.h, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.h, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.h, z6.h, z1.h[0]\n" + "ld1rqh { z3.h }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z16.h, z6.h, z2.h[0]\n" + "ld1rqh { z4.h }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "fmla z13.h, z7.h, z1.h[0]\n" + "ld1rqh { z5.h }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "fmla z20.h, z6.h, z3.h[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x20, x20, #0x10\n" + "fmla z24.h, z6.h, z4.h[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x8\n" + "fmla z28.h, z6.h, z5.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z17.h, z7.h, z2.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla z21.h, z7.h, z3.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla z25.h, z7.h, z4.h[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla z29.h, z7.h, z5.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "fmla z14.h, z6.h, z1.h[0]\n" + "fmla z18.h, z6.h, z2.h[0]\n" + "fmla z22.h, z6.h, z3.h[0]\n" + "fmla z26.h, z6.h, z4.h[0]\n" + "fmla z30.h, z6.h, z5.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[0]\n" + "fmla z15.h, z7.h, z1.h[0]\n" + "fmla z19.h, z7.h, z2.h[0]\n" + "fmla z23.h, z7.h, z3.h[0]\n" + "fmla z27.h, z7.h, z4.h[0]\n" + "fmla z31.h, z7.h, z5.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[1]\n" + "fmla z12.h, z6.h, z1.h[1]\n" + "fmla z16.h, z6.h, z2.h[1]\n" 
+ "fmla z20.h, z6.h, z3.h[1]\n" + "fmla z24.h, z6.h, z4.h[1]\n" + "fmla z28.h, z6.h, z5.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[1]\n" + "fmla z13.h, z7.h, z1.h[1]\n" + "fmla z17.h, z7.h, z2.h[1]\n" + "fmla z21.h, z7.h, z3.h[1]\n" + "fmla z25.h, z7.h, z4.h[1]\n" + "fmla z29.h, z7.h, z5.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.h, z6.h, z0.h[1]\n" + "fmla z14.h, z6.h, z1.h[1]\n" + "fmla z18.h, z6.h, z2.h[1]\n" + "fmla z22.h, z6.h, z3.h[1]\n" + "fmla z26.h, z6.h, z4.h[1]\n" + "fmla z30.h, z6.h, z5.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[1]\n" + "fmla z15.h, z7.h, z1.h[1]\n" + "fmla z19.h, z7.h, z2.h[1]\n" + "fmla z23.h, z7.h, z3.h[1]\n" + "fmla z27.h, z7.h, z4.h[1]\n" + "fmla z31.h, z7.h, z5.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[2]\n" + "fmla z12.h, z6.h, z1.h[2]\n" + "fmla z16.h, z6.h, z2.h[2]\n" + "fmla z20.h, z6.h, z3.h[2]\n" + "fmla z24.h, z6.h, z4.h[2]\n" + "fmla z28.h, z6.h, z5.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[2]\n" + "fmla z13.h, z7.h, z1.h[2]\n" + "fmla z17.h, z7.h, z2.h[2]\n" + "fmla z21.h, z7.h, z3.h[2]\n" + "fmla z25.h, z7.h, z4.h[2]\n" + "fmla z29.h, z7.h, z5.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[2]\n" + "fmla z14.h, z6.h, z1.h[2]\n" + "fmla z18.h, z6.h, z2.h[2]\n" + "fmla z22.h, z6.h, z3.h[2]\n" + "fmla z26.h, z6.h, z4.h[2]\n" + "fmla z30.h, z6.h, z5.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[2]\n" + "fmla z15.h, z7.h, z1.h[2]\n" + "fmla z19.h, z7.h, z2.h[2]\n" + "fmla z23.h, z7.h, z3.h[2]\n" + "fmla z27.h, z7.h, z4.h[2]\n" + "fmla z31.h, z7.h, z5.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[3]\n" + "fmla z12.h, z6.h, z1.h[3]\n" + "fmla z16.h, z6.h, z2.h[3]\n" + "fmla z20.h, z6.h, z3.h[3]\n" + "fmla z24.h, z6.h, z4.h[3]\n" + "fmla z28.h, z6.h, z5.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[3]\n" + "fmla z13.h, z7.h, z1.h[3]\n" + "fmla z17.h, z7.h, z2.h[3]\n" + "fmla z21.h, z7.h, z3.h[3]\n" + "fmla z25.h, z7.h, z4.h[3]\n" + "fmla z29.h, z7.h, z5.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[3]\n" + "fmla z14.h, z6.h, z1.h[3]\n" + "fmla z18.h, z6.h, z2.h[3]\n" + "fmla z22.h, z6.h, z3.h[3]\n" + "fmla z26.h, z6.h, z4.h[3]\n" + "fmla z30.h, z6.h, z5.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z11.h, z7.h, z0.h[3]\n" + "fmla z15.h, z7.h, z1.h[3]\n" + "fmla z19.h, z7.h, z2.h[3]\n" + "fmla z23.h, z7.h, z3.h[3]\n" + "fmla z27.h, z7.h, z4.h[3]\n" + "fmla z31.h, z7.h, z5.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[4]\n" + "fmla z12.h, z6.h, z1.h[4]\n" + "fmla z16.h, z6.h, z2.h[4]\n" + "fmla z20.h, z6.h, z3.h[4]\n" + "fmla z24.h, z6.h, z4.h[4]\n" + "fmla z28.h, z6.h, z5.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[4]\n" + "fmla z13.h, z7.h, z1.h[4]\n" + "fmla z17.h, z7.h, z2.h[4]\n" + "fmla z21.h, z7.h, z3.h[4]\n" + "fmla z25.h, z7.h, z4.h[4]\n" + "fmla z29.h, z7.h, z5.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[4]\n" + "fmla z14.h, z6.h, z1.h[4]\n" + "fmla z18.h, z6.h, z2.h[4]\n" + "fmla z22.h, z6.h, z3.h[4]\n" + "fmla z26.h, z6.h, z4.h[4]\n" + "fmla z30.h, z6.h, z5.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[4]\n" + "fmla z15.h, z7.h, z1.h[4]\n" + "fmla z19.h, 
z7.h, z2.h[4]\n" + "fmla z23.h, z7.h, z3.h[4]\n" + "fmla z27.h, z7.h, z4.h[4]\n" + "fmla z31.h, z7.h, z5.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[5]\n" + "fmla z12.h, z6.h, z1.h[5]\n" + "fmla z16.h, z6.h, z2.h[5]\n" + "fmla z20.h, z6.h, z3.h[5]\n" + "fmla z24.h, z6.h, z4.h[5]\n" + "fmla z28.h, z6.h, z5.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[5]\n" + "fmla z13.h, z7.h, z1.h[5]\n" + "fmla z17.h, z7.h, z2.h[5]\n" + "fmla z21.h, z7.h, z3.h[5]\n" + "fmla z25.h, z7.h, z4.h[5]\n" + "fmla z29.h, z7.h, z5.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.h, z6.h, z0.h[5]\n" + "fmla z14.h, z6.h, z1.h[5]\n" + "fmla z18.h, z6.h, z2.h[5]\n" + "fmla z22.h, z6.h, z3.h[5]\n" + "fmla z26.h, z6.h, z4.h[5]\n" + "fmla z30.h, z6.h, z5.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[5]\n" + "fmla z15.h, z7.h, z1.h[5]\n" + "fmla z19.h, z7.h, z2.h[5]\n" + "fmla z23.h, z7.h, z3.h[5]\n" + "fmla z27.h, z7.h, z4.h[5]\n" + "fmla z31.h, z7.h, z5.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[6]\n" + "fmla z12.h, z6.h, z1.h[6]\n" + "fmla z16.h, z6.h, z2.h[6]\n" + "fmla z20.h, z6.h, z3.h[6]\n" + "fmla z24.h, z6.h, z4.h[6]\n" + "fmla z28.h, z6.h, z5.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[6]\n" + "fmla z13.h, z7.h, z1.h[6]\n" + "fmla z17.h, z7.h, z2.h[6]\n" + "fmla z21.h, z7.h, z3.h[6]\n" + "fmla z25.h, z7.h, z4.h[6]\n" + "fmla z29.h, z7.h, z5.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[6]\n" + "fmla z14.h, z6.h, z1.h[6]\n" + "fmla z18.h, z6.h, z2.h[6]\n" + "fmla z22.h, z6.h, z3.h[6]\n" + "fmla z26.h, z6.h, z4.h[6]\n" + "fmla z30.h, z6.h, z5.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.h, z7.h, z0.h[6]\n" + "fmla z15.h, z7.h, z1.h[6]\n" + "fmla z19.h, z7.h, z2.h[6]\n" + "fmla z23.h, z7.h, z3.h[6]\n" + "fmla z27.h, z7.h, z4.h[6]\n" + "fmla z31.h, z7.h, z5.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.h, z6.h, z0.h[7]\n" + "fmla z12.h, z6.h, z1.h[7]\n" + "fmla z16.h, z6.h, z2.h[7]\n" + "fmla z20.h, z6.h, z3.h[7]\n" + "fmla z24.h, z6.h, z4.h[7]\n" + "fmla z28.h, z6.h, z5.h[7]\n" + "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[7]\n" + "fmla z13.h, z7.h, z1.h[7]\n" + "fmla z17.h, z7.h, z2.h[7]\n" + "fmla z21.h, z7.h, z3.h[7]\n" + "fmla z25.h, z7.h, z4.h[7]\n" + "fmla z29.h, z7.h, z5.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.h, z6.h, z0.h[7]\n" + "fmla z14.h, z6.h, z1.h[7]\n" + "fmla z18.h, z6.h, z2.h[7]\n" + "fmla z22.h, z6.h, z3.h[7]\n" + "fmla z26.h, z6.h, z4.h[7]\n" + "fmla z30.h, z6.h, z5.h[7]\n" + "fmla z11.h, z7.h, z0.h[7]\n" + "fmla z15.h, z7.h, z1.h[7]\n" + "fmla z19.h, z7.h, z2.h[7]\n" + "fmla z23.h, z7.h, z3.h[7]\n" + "fmla z27.h, z7.h, z4.h[7]\n" + "fmla z31.h, z7.h, z5.h[7]\n" + "bgt 80b\n" + "81:" // Height 6: Multiply loop: Single iteration only + "ld1h { z6.h }, p5/Z, [x15]\n" + "whilelt p0.h, XZR, x11\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "ld1rqh { z0.h }, p0/Z, [x10]\n" + "fmla z8.h, z6.h, z0.h[0]\n" + "ld1rqh { z1.h }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.h, z7.h, z0.h[0]\n" + "ld1rqh { z2.h }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.h, z6.h, z1.h[0]\n" + "ld1rqh { z3.h }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z16.h, z6.h, z2.h[0]\n" + "ld1rqh { z4.h }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + 
"fmla z13.h, z7.h, z1.h[0]\n" + "ld1rqh { z5.h }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "fmla z20.h, z6.h, z3.h[0]\n" + "add x20, x20, #0x10\n" + "fmla z17.h, z7.h, z2.h[0]\n" + "fmla z24.h, z6.h, z4.h[0]\n" + "fmla z28.h, z6.h, z5.h[0]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z21.h, z7.h, z3.h[0]\n" + "fmla z25.h, z7.h, z4.h[0]\n" + "fmla z29.h, z7.h, z5.h[0]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[0]\n" + "fmla z14.h, z6.h, z1.h[0]\n" + "fmla z18.h, z6.h, z2.h[0]\n" + "fmla z22.h, z6.h, z3.h[0]\n" + "fmla z26.h, z6.h, z4.h[0]\n" + "fmla z30.h, z6.h, z5.h[0]\n" + "fmla z11.h, z7.h, z0.h[0]\n" + "fmla z15.h, z7.h, z1.h[0]\n" + "fmla z19.h, z7.h, z2.h[0]\n" + "fmla z23.h, z7.h, z3.h[0]\n" + "fmla z27.h, z7.h, z4.h[0]\n" + "fmla z31.h, z7.h, z5.h[0]\n" + "ble 82f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[1]\n" + "fmla z16.h, z6.h, z2.h[1]\n" + "fmla z20.h, z6.h, z3.h[1]\n" + "fmla z24.h, z6.h, z4.h[1]\n" + "fmla z28.h, z6.h, z5.h[1]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[1]\n" + "fmla z13.h, z7.h, z1.h[1]\n" + "fmla z17.h, z7.h, z2.h[1]\n" + "fmla z21.h, z7.h, z3.h[1]\n" + "fmla z25.h, z7.h, z4.h[1]\n" + "fmla z29.h, z7.h, z5.h[1]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[1]\n" + "fmla z14.h, z6.h, z1.h[1]\n" + "fmla z18.h, z6.h, z2.h[1]\n" + "fmla z22.h, z6.h, z3.h[1]\n" + "fmla z26.h, z6.h, z4.h[1]\n" + "fmla z30.h, z6.h, z5.h[1]\n" + "fmla z11.h, z7.h, z0.h[1]\n" + "fmla z15.h, z7.h, z1.h[1]\n" + "fmla z19.h, z7.h, z2.h[1]\n" + "fmla z23.h, z7.h, z3.h[1]\n" + "fmla z27.h, z7.h, z4.h[1]\n" + "fmla z31.h, z7.h, z5.h[1]\n" + "ble 82f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[2]\n" + "fmla z16.h, z6.h, z2.h[2]\n" + "fmla z20.h, z6.h, z3.h[2]\n" + "fmla z24.h, z6.h, z4.h[2]\n" + "fmla z28.h, z6.h, z5.h[2]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[2]\n" + "fmla z13.h, z7.h, z1.h[2]\n" + "fmla z17.h, z7.h, z2.h[2]\n" + "fmla z21.h, z7.h, z3.h[2]\n" + "fmla z25.h, z7.h, z4.h[2]\n" + "fmla z29.h, z7.h, z5.h[2]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[2]\n" + "fmla z14.h, z6.h, z1.h[2]\n" + "fmla z18.h, z6.h, z2.h[2]\n" + "fmla z22.h, z6.h, z3.h[2]\n" + "fmla z26.h, z6.h, z4.h[2]\n" + "fmla z30.h, z6.h, z5.h[2]\n" + "fmla z11.h, z7.h, z0.h[2]\n" + "fmla z15.h, z7.h, z1.h[2]\n" + "fmla z19.h, z7.h, z2.h[2]\n" + "fmla z23.h, z7.h, z3.h[2]\n" + "fmla z27.h, z7.h, z4.h[2]\n" + "fmla z31.h, z7.h, z5.h[2]\n" + "ble 82f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[3]\n" + "fmla z16.h, z6.h, z2.h[3]\n" + "fmla z20.h, z6.h, z3.h[3]\n" + "fmla z24.h, z6.h, z4.h[3]\n" + "fmla z28.h, z6.h, z5.h[3]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[3]\n" + "fmla z13.h, z7.h, z1.h[3]\n" + "fmla z17.h, z7.h, z2.h[3]\n" + "fmla z21.h, z7.h, z3.h[3]\n" + "fmla z25.h, z7.h, z4.h[3]\n" + "fmla z29.h, z7.h, z5.h[3]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[3]\n" + "fmla z14.h, z6.h, z1.h[3]\n" + "fmla z18.h, z6.h, z2.h[3]\n" + 
"fmla z22.h, z6.h, z3.h[3]\n" + "fmla z26.h, z6.h, z4.h[3]\n" + "fmla z30.h, z6.h, z5.h[3]\n" + "fmla z11.h, z7.h, z0.h[3]\n" + "fmla z15.h, z7.h, z1.h[3]\n" + "fmla z19.h, z7.h, z2.h[3]\n" + "fmla z23.h, z7.h, z3.h[3]\n" + "fmla z27.h, z7.h, z4.h[3]\n" + "fmla z31.h, z7.h, z5.h[3]\n" + "ble 82f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[4]\n" + "fmla z16.h, z6.h, z2.h[4]\n" + "fmla z20.h, z6.h, z3.h[4]\n" + "fmla z24.h, z6.h, z4.h[4]\n" + "fmla z28.h, z6.h, z5.h[4]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[4]\n" + "fmla z13.h, z7.h, z1.h[4]\n" + "fmla z17.h, z7.h, z2.h[4]\n" + "fmla z21.h, z7.h, z3.h[4]\n" + "fmla z25.h, z7.h, z4.h[4]\n" + "fmla z29.h, z7.h, z5.h[4]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[4]\n" + "fmla z14.h, z6.h, z1.h[4]\n" + "fmla z18.h, z6.h, z2.h[4]\n" + "fmla z22.h, z6.h, z3.h[4]\n" + "fmla z26.h, z6.h, z4.h[4]\n" + "fmla z30.h, z6.h, z5.h[4]\n" + "fmla z11.h, z7.h, z0.h[4]\n" + "fmla z15.h, z7.h, z1.h[4]\n" + "fmla z19.h, z7.h, z2.h[4]\n" + "fmla z23.h, z7.h, z3.h[4]\n" + "fmla z27.h, z7.h, z4.h[4]\n" + "fmla z31.h, z7.h, z5.h[4]\n" + "ble 82f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[5]\n" + "fmla z16.h, z6.h, z2.h[5]\n" + "fmla z20.h, z6.h, z3.h[5]\n" + "fmla z24.h, z6.h, z4.h[5]\n" + "fmla z28.h, z6.h, z5.h[5]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[5]\n" + "fmla z13.h, z7.h, z1.h[5]\n" + "fmla z17.h, z7.h, z2.h[5]\n" + "fmla z21.h, z7.h, z3.h[5]\n" + "fmla z25.h, z7.h, z4.h[5]\n" + "fmla z29.h, z7.h, z5.h[5]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[5]\n" + "fmla z14.h, z6.h, z1.h[5]\n" + "fmla z18.h, z6.h, z2.h[5]\n" + "fmla z22.h, z6.h, z3.h[5]\n" + "fmla z26.h, z6.h, z4.h[5]\n" + "fmla z30.h, z6.h, z5.h[5]\n" + "fmla z11.h, z7.h, z0.h[5]\n" + "fmla z15.h, z7.h, z1.h[5]\n" + "fmla z19.h, z7.h, z2.h[5]\n" + "fmla z23.h, z7.h, z3.h[5]\n" + "fmla z27.h, z7.h, z4.h[5]\n" + "fmla z31.h, z7.h, z5.h[5]\n" + "ble 82f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.h, z6.h, z1.h[6]\n" + "fmla z16.h, z6.h, z2.h[6]\n" + "fmla z20.h, z6.h, z3.h[6]\n" + "fmla z24.h, z6.h, z4.h[6]\n" + "fmla z28.h, z6.h, z5.h[6]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[6]\n" + "fmla z13.h, z7.h, z1.h[6]\n" + "fmla z17.h, z7.h, z2.h[6]\n" + "fmla z21.h, z7.h, z3.h[6]\n" + "fmla z25.h, z7.h, z4.h[6]\n" + "fmla z29.h, z7.h, z5.h[6]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[6]\n" + "fmla z14.h, z6.h, z1.h[6]\n" + "fmla z18.h, z6.h, z2.h[6]\n" + "fmla z22.h, z6.h, z3.h[6]\n" + "fmla z26.h, z6.h, z4.h[6]\n" + "fmla z30.h, z6.h, z5.h[6]\n" + "fmla z11.h, z7.h, z0.h[6]\n" + "fmla z15.h, z7.h, z1.h[6]\n" + "fmla z19.h, z7.h, z2.h[6]\n" + "fmla z23.h, z7.h, z3.h[6]\n" + "fmla z27.h, z7.h, z4.h[6]\n" + "fmla z31.h, z7.h, z5.h[6]\n" + "ble 82f\n" + "ld1h { z6.h }, p5/Z, [x15]\n" + "fmla z8.h, z6.h, z0.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z12.h, z6.h, z1.h[7]\n" + "fmla z16.h, z6.h, z2.h[7]\n" + "fmla z20.h, z6.h, z3.h[7]\n" + "fmla z24.h, z6.h, z4.h[7]\n" + "fmla z28.h, z6.h, 
z5.h[7]\n" + "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.h, z7.h, z0.h[7]\n" + "fmla z13.h, z7.h, z1.h[7]\n" + "fmla z17.h, z7.h, z2.h[7]\n" + "fmla z21.h, z7.h, z3.h[7]\n" + "fmla z25.h, z7.h, z4.h[7]\n" + "fmla z29.h, z7.h, z5.h[7]\n" + "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.h, z6.h, z0.h[7]\n" + "fmla z14.h, z6.h, z1.h[7]\n" + "fmla z18.h, z6.h, z2.h[7]\n" + "fmla z22.h, z6.h, z3.h[7]\n" + "fmla z26.h, z6.h, z4.h[7]\n" + "fmla z30.h, z6.h, z5.h[7]\n" + "fmla z11.h, z7.h, z0.h[7]\n" + "fmla z15.h, z7.h, z1.h[7]\n" + "fmla z19.h, z7.h, z2.h[7]\n" + "fmla z23.h, z7.h, z3.h[7]\n" + "fmla z27.h, z7.h, z4.h[7]\n" + "fmla z31.h, z7.h, z5.h[7]\n" + "82:" // Height 6: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 77b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 83f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rh { z1.h }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rh { z0.h }, p5/Z, [x19]\n" + "fmin z8.h, p5/M, z8.h, z0.h\n" + "fmin z9.h, p5/M, z9.h, z0.h\n" + "fmin z10.h, p5/M, z10.h, z0.h\n" + "fmin z11.h, p5/M, z11.h, z0.h\n" + "fmin z12.h, p5/M, z12.h, z0.h\n" + "fmax z8.h, p5/M, z8.h, z1.h\n" + "fmax z9.h, p5/M, z9.h, z1.h\n" + "fmax z10.h, p5/M, z10.h, z1.h\n" + "fmax z11.h, p5/M, z11.h, z1.h\n" + "fmax z12.h, p5/M, z12.h, z1.h\n" + "fmin z13.h, p5/M, z13.h, z0.h\n" + "fmin z14.h, p5/M, z14.h, z0.h\n" + "fmin z15.h, p5/M, z15.h, z0.h\n" + "fmin z16.h, p5/M, z16.h, z0.h\n" + "fmax z13.h, p5/M, z13.h, z1.h\n" + "fmax z14.h, p5/M, z14.h, z1.h\n" + "fmax z15.h, p5/M, z15.h, z1.h\n" + "fmax z16.h, p5/M, z16.h, z1.h\n" + "fmin z17.h, p5/M, z17.h, z0.h\n" + "fmin z18.h, p5/M, z18.h, z0.h\n" + "fmin z19.h, p5/M, z19.h, z0.h\n" + "fmin z20.h, p5/M, z20.h, z0.h\n" + "fmax z17.h, p5/M, z17.h, z1.h\n" + "fmax z18.h, p5/M, z18.h, z1.h\n" + "fmax z19.h, p5/M, z19.h, z1.h\n" + "fmax z20.h, p5/M, z20.h, z1.h\n" + "fmin z21.h, p5/M, z21.h, z0.h\n" + "fmin z22.h, p5/M, z22.h, z0.h\n" + "fmin z23.h, p5/M, z23.h, z0.h\n" + "fmin z24.h, p5/M, z24.h, z0.h\n" + "fmax z21.h, p5/M, z21.h, z1.h\n" + "fmax z22.h, p5/M, z22.h, z1.h\n" + "fmax z23.h, p5/M, z23.h, z1.h\n" + "fmax z24.h, p5/M, z24.h, z1.h\n" + "fmin z25.h, p5/M, z25.h, z0.h\n" + "fmin z26.h, p5/M, z26.h, z0.h\n" + "fmin z27.h, p5/M, z27.h, z0.h\n" + "fmin z28.h, p5/M, z28.h, z0.h\n" + "fmax z25.h, p5/M, z25.h, z1.h\n" + "fmax z26.h, p5/M, z26.h, z1.h\n" + "fmax z27.h, p5/M, z27.h, z1.h\n" + "fmax z28.h, p5/M, z28.h, z1.h\n" + "fmin z29.h, p5/M, z29.h, z0.h\n" + "fmin z30.h, p5/M, z30.h, z0.h\n" + "fmin z31.h, p5/M, z31.h, z0.h\n" + "fmax z29.h, p5/M, z29.h, z1.h\n" + "fmax z30.h, p5/M, z30.h, z1.h\n" + "fmax z31.h, p5/M, z31.h, z1.h\n" + "83:" // Height 6: No activation + "st1h { z8.h }, p4, [x13]\n" + "st1h { z9.h }, p3, [x13, #1, MUL VL]\n" + "st1h { z10.h }, p2, [x13, #2, MUL VL]\n" + "st1h { z11.h }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1h { z12.h }, p4, [x9]\n" + "st1h { z13.h }, p3, [x9, #1, MUL VL]\n" + "st1h { z14.h }, p2, [x9, #2, MUL VL]\n" + "st1h { z15.h }, p1, [x9, #3, MUL 
VL]\n" + "addvl x9, x9, #4\n" + "st1h { z16.h }, p4, [x27]\n" + "st1h { z17.h }, p3, [x27, #1, MUL VL]\n" + "st1h { z18.h }, p2, [x27, #2, MUL VL]\n" + "st1h { z19.h }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1h { z20.h }, p4, [x25]\n" + "st1h { z21.h }, p3, [x25, #1, MUL VL]\n" + "st1h { z22.h }, p2, [x25, #2, MUL VL]\n" + "st1h { z23.h }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "st1h { z24.h }, p4, [x23]\n" + "st1h { z25.h }, p3, [x23, #1, MUL VL]\n" + "st1h { z26.h }, p2, [x23, #2, MUL VL]\n" + "st1h { z27.h }, p1, [x23, #3, MUL VL]\n" + "addvl x23, x23, #4\n" + "st1h { z28.h }, p4, [x21]\n" + "st1h { z29.h }, p3, [x21, #1, MUL VL]\n" + "st1h { z30.h }, p2, [x21, #2, MUL VL]\n" + "st1h { z31.h }, p1, [x21, #3, MUL VL]\n" + "addvl x21, x21, #4\n" + "84:" // Height 6: Writeback done + "mov x19, #0x0\n" + "inch x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 73b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 86f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 85f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "85:" // Update direct input + "mov x19, #0xc\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "86:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp deleted file mode 100644 index ce3624340e..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp +++ /dev/null @@ -1,2118 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef __ARM_FEATURE_SVE - -#include <algorithm> - -#include "arm_gemm.hpp" - -#include "../../asmlib.hpp" - -#include "../../utils.hpp" - -namespace arm_gemm { - -void sve_hybrid_fp32_mla_4VLx4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) { - const int K_stride = K; - const long loops_count = ((K + 4) / 8) - 1; - K -= loops_count * 8; - const long regs_count = (K / 4) - 1; - K -= (regs_count + 1) * 4; - const long leftovers = K; - float nullbias[256]; - if (!accumulate && !bias) { - memset(nullbias, 0, (4 * get_vector_length<float>() * sizeof(float))); - } - float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); - float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); - const float * const minptr = &minval; - const float * const maxptr = &maxval; - - switch(act.type) - { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - maxval = static_cast<float>(act.param1); - /* fall through */ - case Activation::Type::ReLU: - minval = 0.0f; - break; - } - - int rows_to_compute; - - for (int y=0; y<M; y+=rows_to_compute) { - const float * const a_ptr0_base = A + (y * lda); - const unsigned long ldab = lda * sizeof(float); - - float *c_ptr0 = C + (y * ldc); - - rows_to_compute = M-y; - if (rows_to_compute > 4) { - if (rows_to_compute % 4) { - rows_to_compute = 4 - 1; - } else { - rows_to_compute = 4; - } - } - - for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) { - const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>())); - long loops = loops_count; - long regs = regs_count; - long temp = 0; - long blocks = leftovers; - const float *a_ptr0 = a_ptr0_base; - const float *b_ptr0 = B + (K_stride * x0); - const unsigned long ldcb = ldc * sizeof(float); - const float *biasptr = bias ?
bias+x0 : nullbias; - - switch(rows_to_compute) { - case 1: - __asm __volatile ( - "whilelt p6.s, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.s\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z16.s, p0/z, [%[biasptr]]\n" - "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z16.s, z12.s, z0.s[3]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla z17.s, z13.s, z0.s[3]\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[3]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[3]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.s, z9.s, z4.s[0]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[0]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[1]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" - "fmla z17.s, z13.s, z4.s[1]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z18.s, z14.s, z4.s[1]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL 
VL]\n" - "fmla z19.s, z15.s, z4.s[1]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[2]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z9.s, z4.s[2]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[2]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[2]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[3]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z17.s, z13.s, z4.s[3]\n" - "fmla z18.s, z14.s, z4.s[3]\n" - "fmla z19.s, z15.s, z4.s[3]\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z16.s, z12.s, z0.s[3]\n" - "fmla z17.s, z13.s, z0.s[3]\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[3]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[3]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.s, z9.s, z4.s[0]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[0]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[1]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z17.s, z13.s, z4.s[1]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z14.s, z4.s[1]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z15.s, z4.s[1]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[2]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z9.s, z4.s[2]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[2]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[2]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z17.s, z13.s, z4.s[3]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z18.s, z14.s, z4.s[3]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "fmla z19.s, z15.s, z4.s[3]\n" - "cbz %[blocks], 5f\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z9.s, p7/z, 
[%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "b 5f\n" - "4:\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "fmla z16.s, z12.s, z0.s[3]\n" - "fmla z17.s, z13.s, z0.s[3]\n" - "fmla z18.s, z14.s, z0.s[3]\n" - "fmla z19.s, z15.s, z0.s[3]\n" - "cbz %[blocks], 5f\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z17.s, z9.s, z4.s[0]\n" - "fmla z18.s, z10.s, z4.s[0]\n" - "fmla z19.s, z11.s, z4.s[0]\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z17.s, z13.s, z4.s[1]\n" - "fmla z18.s, z14.s, z4.s[1]\n" - "fmla z19.s, z15.s, z4.s[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[2]\n" - "fmla z17.s, z9.s, z4.s[2]\n" - "fmla z18.s, z10.s, z4.s[2]\n" - "fmla z19.s, z11.s, z4.s[2]\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin 
z19.s, p7/m, z19.s, z15.s\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "whilelt p6.s, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.s\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z16.s, p0/z, [%[biasptr]]\n" - "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "mov z21.d, z17.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z22.d, z18.d\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "mov z23.d, z19.d\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p0/z, [c_ptr1]\n" - "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z22.s, z10.s, z1.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla 
z23.s, z11.s, z1.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla z20.s, z12.s, z1.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla z21.s, z13.s, z1.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z22.s, z14.s, z1.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "fmla z23.s, z15.s, z1.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.s, z8.s, z1.s[2]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "fmla z21.s, z9.s, z1.s[2]\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z22.s, z10.s, z1.s[2]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z23.s, z11.s, z1.s[2]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[3]\n" - "fmla z20.s, z12.s, z1.s[3]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[3]\n" - "fmla z21.s, z13.s, z1.s[3]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[3]\n" - "fmla z22.s, z14.s, z1.s[3]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" - "fmla z23.s, z15.s, z1.s[3]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" - "fmla z20.s, z8.s, z5.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z17.s, z9.s, z4.s[0]\n" - "fmla z21.s, z9.s, z5.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[0]\n" - "fmla z22.s, z10.s, z5.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[0]\n" - "fmla z23.s, z11.s, z5.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[1]\n" - "fmla z20.s, z12.s, z5.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z4.s[1]\n" - "fmla z21.s, z13.s, z5.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z4.s[1]\n" - "fmla z22.s, z14.s, z5.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z4.s[1]\n" - "fmla z23.s, z15.s, z5.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.s, z8.s, z5.s[2]\n" - "fmla z17.s, z9.s, z4.s[2]\n" - "fmla z21.s, z9.s, z5.s[2]\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[2]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z22.s, z10.s, z5.s[2]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[2]\n" - "fmla z23.s, z11.s, z5.s[2]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[3]\n" - "fmla z20.s, z12.s, z5.s[3]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.s, z13.s, z4.s[3]\n" - "fmla z21.s, z13.s, z5.s[3]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.s, z14.s, z4.s[3]\n" - "fmla z22.s, z14.s, z5.s[3]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.s, z15.s, z4.s[3]\n" - "fmla z23.s, z15.s, z5.s[3]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL 
VL]\n" - "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z22.s, z10.s, z1.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "fmla z23.s, z11.s, z1.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "fmla z20.s, z12.s, z1.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "fmla z21.s, z13.s, z1.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z22.s, z14.s, z1.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "fmla z23.s, z15.s, z1.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.s, z8.s, z1.s[2]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "fmla z21.s, z9.s, z1.s[2]\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z22.s, z10.s, z1.s[2]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z23.s, z11.s, z1.s[2]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[3]\n" - "fmla z20.s, z12.s, z1.s[3]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[3]\n" - "fmla z21.s, z13.s, z1.s[3]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[3]\n" - "fmla z22.s, z14.s, z1.s[3]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z23.s, z15.s, z1.s[3]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - "fmla z20.s, z8.s, z5.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z17.s, z9.s, z4.s[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "fmla z21.s, z9.s, z5.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - "fmla z22.s, z10.s, z5.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[0]\n" - "fmla z23.s, z11.s, z5.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[1]\n" - "fmla z20.s, z12.s, z5.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z4.s[1]\n" - "fmla z21.s, z13.s, z5.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z4.s[1]\n" - "fmla z22.s, z14.s, z5.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z4.s[1]\n" - "fmla z23.s, z15.s, z5.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z20.s, z8.s, z5.s[2]\n" - "fmla z17.s, z9.s, z4.s[2]\n" - "fmla z21.s, z9.s, z5.s[2]\n" - "fmla z18.s, z10.s, z4.s[2]\n" - "fmla z22.s, z10.s, z5.s[2]\n" - "fmla z19.s, z11.s, z4.s[2]\n" - "fmla z23.s, z11.s, z5.s[2]\n" - "fmla z16.s, z12.s, z4.s[3]\n" - "fmla z20.s, z12.s, z5.s[3]\n" - "fmla z17.s, z13.s, z4.s[3]\n" - "fmla z21.s, z13.s, z5.s[3]\n" - "fmla z18.s, z14.s, z4.s[3]\n" - "fmla z22.s, z14.s, z5.s[3]\n" - "fmla z19.s, z15.s, z4.s[3]\n" - "fmla z23.s, z15.s, z5.s[3]\n" - "cbz %[blocks], 5f\n" - "ld1w z8.s, p7/z, 
[%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.s, z8.s, z1.s[0]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "fmla z21.s, z9.s, z1.s[0]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "fmla z22.s, z10.s, z1.s[0]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "fmla z23.s, z11.s, z1.s[0]\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.s, z12.s, z1.s[1]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "fmla z21.s, z13.s, z1.s[1]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z22.s, z14.s, z1.s[1]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "fmla z23.s, z15.s, z1.s[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "fmla z20.s, z8.s, z1.s[2]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "fmla z21.s, z9.s, z1.s[2]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "fmla z22.s, z10.s, z1.s[2]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z23.s, z11.s, z1.s[2]\n" - "b 5f\n" - "4:\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.s, z8.s, z1.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" - "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" - "fmla z22.s, z10.s, z1.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "fmla z23.s, z11.s, z1.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "addvl a_ptr1, a_ptr1, #1\n" - "fmla z20.s, z12.s, z1.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "fmla z21.s, z13.s, z1.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z22.s, z14.s, z1.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "fmla z23.s, z15.s, z1.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z20.s, z8.s, z1.s[2]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "fmla z21.s, z9.s, z1.s[2]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "fmla z22.s, z10.s, z1.s[2]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z23.s, z11.s, z1.s[2]\n" - "fmla z16.s, z12.s, z0.s[3]\n" - "fmla z20.s, z12.s, z1.s[3]\n" - "fmla z17.s, z13.s, z0.s[3]\n" - "fmla z21.s, z13.s, z1.s[3]\n" - "fmla z18.s, z14.s, z0.s[3]\n" - "fmla z22.s, z14.s, z1.s[3]\n" - "fmla z19.s, z15.s, z0.s[3]\n" - "fmla z23.s, z15.s, z1.s[3]\n" - "cbz %[blocks], 5f\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.s, z8.s, z5.s[0]\n" - "fmla z17.s, z9.s, z4.s[0]\n" - "fmla z21.s, z9.s, z5.s[0]\n" - "fmla z18.s, z10.s, z4.s[0]\n" - "fmla z22.s, 
z10.s, z5.s[0]\n" - "fmla z19.s, z11.s, z4.s[0]\n" - "fmla z23.s, z11.s, z5.s[0]\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.s, z12.s, z5.s[1]\n" - "fmla z17.s, z13.s, z4.s[1]\n" - "fmla z21.s, z13.s, z5.s[1]\n" - "fmla z18.s, z14.s, z4.s[1]\n" - "fmla z22.s, z14.s, z5.s[1]\n" - "fmla z19.s, z15.s, z4.s[1]\n" - "fmla z23.s, z15.s, z5.s[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[2]\n" - "fmla z20.s, z8.s, z5.s[2]\n" - "fmla z17.s, z9.s, z4.s[2]\n" - "fmla z21.s, z9.s, z5.s[2]\n" - "fmla z18.s, z10.s, z4.s[2]\n" - "fmla z22.s, z10.s, z5.s[2]\n" - "fmla z19.s, z11.s, z4.s[2]\n" - "fmla z23.s, z11.s, z5.s[2]\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "st1w z20.s, p0, [c_ptr1]\n" - "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" - "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" - "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "whilelt p6.s, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.s\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z16.s, p0/z, [%[biasptr]]\n" - "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1w z19.s, p3/z, 
[%[biasptr], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "mov z21.d, z17.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z22.d, z18.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z23.d, z19.d\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "mov z24.d, z16.d\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z25.d, z17.d\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z26.d, z18.d\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z27.d, z19.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p0/z, [c_ptr1]\n" - "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1w z24.s, p0/z, [c_ptr2]\n" - "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" - "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "fmla z24.s, z8.s, z2.s[0]\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z25.s, z9.s, z2.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla z22.s, z10.s, z1.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla z26.s, z10.s, z2.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla z23.s, z11.s, z1.s[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla z27.s, z11.s, z2.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "fmla z20.s, z12.s, z1.s[1]\n" - "fmla z24.s, z12.s, z2.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "fmla z21.s, z13.s, z1.s[1]\n" - "fmla z25.s, z13.s, z2.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z22.s, z14.s, z1.s[1]\n" - "fmla z26.s, z14.s, z2.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "fmla z23.s, z15.s, z1.s[1]\n" - "fmla z27.s, z15.s, z2.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.s, z8.s, z1.s[2]\n" - "fmla z24.s, z8.s, z2.s[2]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - 
"ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z21.s, z9.s, z1.s[2]\n" - "fmla z25.s, z9.s, z2.s[2]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "fmla z22.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z23.s, z11.s, z1.s[2]\n" - "fmla z27.s, z11.s, z2.s[2]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[3]\n" - "fmla z20.s, z12.s, z1.s[3]\n" - "fmla z24.s, z12.s, z2.s[3]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[3]\n" - "fmla z21.s, z13.s, z1.s[3]\n" - "fmla z25.s, z13.s, z2.s[3]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[3]\n" - "fmla z22.s, z14.s, z1.s[3]\n" - "fmla z26.s, z14.s, z2.s[3]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" - "fmla z23.s, z15.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" - "fmla z27.s, z15.s, z2.s[3]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" - "fmla z20.s, z8.s, z5.s[0]\n" - "fmla z24.s, z8.s, z6.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z17.s, z9.s, z4.s[0]\n" - "fmla z21.s, z9.s, z5.s[0]\n" - "fmla z25.s, z9.s, z6.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[0]\n" - "fmla z22.s, z10.s, z5.s[0]\n" - "fmla z26.s, z10.s, z6.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[0]\n" - "fmla z23.s, z11.s, z5.s[0]\n" - "fmla z27.s, z11.s, z6.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[1]\n" - "fmla z20.s, z12.s, z5.s[1]\n" - "fmla z24.s, z12.s, z6.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z4.s[1]\n" - "fmla z21.s, z13.s, z5.s[1]\n" - "fmla z25.s, z13.s, z6.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z4.s[1]\n" - "fmla z22.s, z14.s, z5.s[1]\n" - "fmla z26.s, z14.s, z6.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z4.s[1]\n" - "fmla z23.s, z15.s, z5.s[1]\n" - "fmla z27.s, z15.s, z6.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.s, z8.s, z5.s[2]\n" - "fmla z24.s, z8.s, z6.s[2]\n" - "fmla z17.s, z9.s, z4.s[2]\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z21.s, z9.s, z5.s[2]\n" - "fmla z25.s, z9.s, z6.s[2]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[2]\n" - "fmla z22.s, z10.s, z5.s[2]\n" - "fmla z26.s, z10.s, z6.s[2]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[2]\n" - "fmla z23.s, z11.s, z5.s[2]\n" - "fmla z27.s, z11.s, z6.s[2]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[3]\n" - "fmla z20.s, z12.s, z5.s[3]\n" - "fmla z24.s, z12.s, z6.s[3]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.s, z13.s, z4.s[3]\n" - "fmla z21.s, z13.s, z5.s[3]\n" - "fmla z25.s, z13.s, z6.s[3]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.s, z14.s, z4.s[3]\n" - "fmla z22.s, z14.s, z5.s[3]\n" - "fmla z26.s, z14.s, z6.s[3]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.s, z15.s, z4.s[3]\n" - "fmla z23.s, z15.s, z5.s[3]\n" - "fmla z27.s, z15.s, z6.s[3]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "fmla 
z16.s, z8.s, z0.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "fmla z24.s, z8.s, z2.s[0]\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z25.s, z9.s, z2.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "fmla z22.s, z10.s, z1.s[0]\n" - "fmla z26.s, z10.s, z2.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "fmla z23.s, z11.s, z1.s[0]\n" - "fmla z27.s, z11.s, z2.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "fmla z20.s, z12.s, z1.s[1]\n" - "fmla z24.s, z12.s, z2.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "fmla z21.s, z13.s, z1.s[1]\n" - "fmla z25.s, z13.s, z2.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z22.s, z14.s, z1.s[1]\n" - "fmla z26.s, z14.s, z2.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "fmla z23.s, z15.s, z1.s[1]\n" - "fmla z27.s, z15.s, z2.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.s, z8.s, z1.s[2]\n" - "fmla z24.s, z8.s, z2.s[2]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z21.s, z9.s, z1.s[2]\n" - "fmla z25.s, z9.s, z2.s[2]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "fmla z22.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z23.s, z11.s, z1.s[2]\n" - "fmla z27.s, z11.s, z2.s[2]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[3]\n" - "fmla z20.s, z12.s, z1.s[3]\n" - "fmla z24.s, z12.s, z2.s[3]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[3]\n" - "fmla z21.s, z13.s, z1.s[3]\n" - "fmla z25.s, z13.s, z2.s[3]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[3]\n" - "fmla z22.s, z14.s, z1.s[3]\n" - "fmla z26.s, z14.s, z2.s[3]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z23.s, z15.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - "fmla z27.s, z15.s, z2.s[3]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - "fmla z20.s, z8.s, z5.s[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "fmla z24.s, z8.s, z6.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z17.s, z9.s, z4.s[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - "fmla z21.s, z9.s, z5.s[0]\n" - "addvl a_ptr2, a_ptr2, #2\n" - "fmla z25.s, z9.s, z6.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[0]\n" - "fmla z22.s, z10.s, z5.s[0]\n" - "fmla z26.s, z10.s, z6.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[0]\n" - "fmla z23.s, z11.s, z5.s[0]\n" - "fmla z27.s, z11.s, z6.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[1]\n" - "fmla z20.s, z12.s, z5.s[1]\n" - "fmla z24.s, z12.s, z6.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z4.s[1]\n" - "fmla z21.s, z13.s, z5.s[1]\n" - "fmla z25.s, z13.s, z6.s[1]\n" 
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z4.s[1]\n" - "fmla z22.s, z14.s, z5.s[1]\n" - "fmla z26.s, z14.s, z6.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z4.s[1]\n" - "fmla z23.s, z15.s, z5.s[1]\n" - "fmla z27.s, z15.s, z6.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z20.s, z8.s, z5.s[2]\n" - "fmla z24.s, z8.s, z6.s[2]\n" - "fmla z17.s, z9.s, z4.s[2]\n" - "fmla z21.s, z9.s, z5.s[2]\n" - "fmla z25.s, z9.s, z6.s[2]\n" - "fmla z18.s, z10.s, z4.s[2]\n" - "fmla z22.s, z10.s, z5.s[2]\n" - "fmla z26.s, z10.s, z6.s[2]\n" - "fmla z19.s, z11.s, z4.s[2]\n" - "fmla z23.s, z11.s, z5.s[2]\n" - "fmla z27.s, z11.s, z6.s[2]\n" - "fmla z16.s, z12.s, z4.s[3]\n" - "fmla z20.s, z12.s, z5.s[3]\n" - "fmla z24.s, z12.s, z6.s[3]\n" - "fmla z17.s, z13.s, z4.s[3]\n" - "fmla z21.s, z13.s, z5.s[3]\n" - "fmla z25.s, z13.s, z6.s[3]\n" - "fmla z18.s, z14.s, z4.s[3]\n" - "fmla z22.s, z14.s, z5.s[3]\n" - "fmla z26.s, z14.s, z6.s[3]\n" - "fmla z19.s, z15.s, z4.s[3]\n" - "fmla z23.s, z15.s, z5.s[3]\n" - "fmla z27.s, z15.s, z6.s[3]\n" - "cbz %[blocks], 5f\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.s, z8.s, z1.s[0]\n" - "fmla z24.s, z8.s, z2.s[0]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "fmla z21.s, z9.s, z1.s[0]\n" - "fmla z25.s, z9.s, z2.s[0]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "fmla z22.s, z10.s, z1.s[0]\n" - "fmla z26.s, z10.s, z2.s[0]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "fmla z23.s, z11.s, z1.s[0]\n" - "fmla z27.s, z11.s, z2.s[0]\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.s, z12.s, z1.s[1]\n" - "fmla z24.s, z12.s, z2.s[1]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "fmla z21.s, z13.s, z1.s[1]\n" - "fmla z25.s, z13.s, z2.s[1]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z22.s, z14.s, z1.s[1]\n" - "fmla z26.s, z14.s, z2.s[1]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "fmla z23.s, z15.s, z1.s[1]\n" - "fmla z27.s, z15.s, z2.s[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "fmla z20.s, z8.s, z1.s[2]\n" - "fmla z24.s, z8.s, z2.s[2]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "fmla z21.s, z9.s, z1.s[2]\n" - "fmla z25.s, z9.s, z2.s[2]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "fmla z22.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z23.s, z11.s, z1.s[2]\n" - "fmla z27.s, z11.s, z2.s[2]\n" - "b 5f\n" - "4:\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" - "fmla z24.s, z8.s, z2.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" - "fmla z21.s, z9.s, z1.s[0]\n" - "ld1rqw z6.s, p6/z, [a_ptr2]\n" - "fmla z25.s, z9.s, z2.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - 
"addvl %[a_ptr0], %[a_ptr0], #1\n" - "fmla z22.s, z10.s, z1.s[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" - "fmla z26.s, z10.s, z2.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" - "fmla z23.s, z11.s, z1.s[0]\n" - "fmla z27.s, z11.s, z2.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "fmla z20.s, z12.s, z1.s[1]\n" - "fmla z24.s, z12.s, z2.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "fmla z21.s, z13.s, z1.s[1]\n" - "fmla z25.s, z13.s, z2.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z22.s, z14.s, z1.s[1]\n" - "fmla z26.s, z14.s, z2.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "fmla z23.s, z15.s, z1.s[1]\n" - "fmla z27.s, z15.s, z2.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z20.s, z8.s, z1.s[2]\n" - "fmla z24.s, z8.s, z2.s[2]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "fmla z21.s, z9.s, z1.s[2]\n" - "fmla z25.s, z9.s, z2.s[2]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "fmla z22.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z23.s, z11.s, z1.s[2]\n" - "fmla z27.s, z11.s, z2.s[2]\n" - "fmla z16.s, z12.s, z0.s[3]\n" - "fmla z20.s, z12.s, z1.s[3]\n" - "fmla z24.s, z12.s, z2.s[3]\n" - "fmla z17.s, z13.s, z0.s[3]\n" - "fmla z21.s, z13.s, z1.s[3]\n" - "fmla z25.s, z13.s, z2.s[3]\n" - "fmla z18.s, z14.s, z0.s[3]\n" - "fmla z22.s, z14.s, z1.s[3]\n" - "fmla z26.s, z14.s, z2.s[3]\n" - "fmla z19.s, z15.s, z0.s[3]\n" - "fmla z23.s, z15.s, z1.s[3]\n" - "fmla z27.s, z15.s, z2.s[3]\n" - "cbz %[blocks], 5f\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.s, z8.s, z5.s[0]\n" - "fmla z24.s, z8.s, z6.s[0]\n" - "fmla z17.s, z9.s, z4.s[0]\n" - "fmla z21.s, z9.s, z5.s[0]\n" - "fmla z25.s, z9.s, z6.s[0]\n" - "fmla z18.s, z10.s, z4.s[0]\n" - "fmla z22.s, z10.s, z5.s[0]\n" - "fmla z26.s, z10.s, z6.s[0]\n" - "fmla z19.s, z11.s, z4.s[0]\n" - "fmla z23.s, z11.s, z5.s[0]\n" - "fmla z27.s, z11.s, z6.s[0]\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.s, z12.s, z5.s[1]\n" - "fmla z24.s, z12.s, z6.s[1]\n" - "fmla z17.s, z13.s, z4.s[1]\n" - "fmla z21.s, z13.s, z5.s[1]\n" - "fmla z25.s, z13.s, z6.s[1]\n" - "fmla z18.s, z14.s, z4.s[1]\n" - "fmla z22.s, z14.s, z5.s[1]\n" - "fmla z26.s, z14.s, z6.s[1]\n" - "fmla z19.s, z15.s, z4.s[1]\n" - "fmla z23.s, z15.s, z5.s[1]\n" - "fmla z27.s, z15.s, z6.s[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[2]\n" - "fmla z20.s, z8.s, z5.s[2]\n" - "fmla z24.s, z8.s, z6.s[2]\n" - "fmla z17.s, z9.s, z4.s[2]\n" - "fmla z21.s, z9.s, z5.s[2]\n" - "fmla z25.s, z9.s, z6.s[2]\n" - "fmla z18.s, z10.s, z4.s[2]\n" - "fmla z22.s, z10.s, z5.s[2]\n" - "fmla 
z26.s, z10.s, z6.s[2]\n" - "fmla z19.s, z11.s, z4.s[2]\n" - "fmla z23.s, z11.s, z5.s[2]\n" - "fmla z27.s, z11.s, z6.s[2]\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "st1w z20.s, p0, [c_ptr1]\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" - "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" - "st1w z24.s, p0, [c_ptr2]\n" - "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" - "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" - "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "whilelt p6.s, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.s\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z16.s, p0/z, [%[biasptr]]\n" - "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "mov z21.d, z17.d\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "mov z22.d, z18.d\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "mov z23.d, z19.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "mov z24.d, z16.d\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "mov z25.d, z17.d\n" - "ld1w 
z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z26.d, z18.d\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z27.d, z19.d\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z28.d, z16.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z29.d, z17.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z30.d, z18.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "mov z31.d, z19.d\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "add a_ptr3, a_ptr3, #0x10\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p0/z, [c_ptr1]\n" - "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1w z24.s, p0/z, [c_ptr2]\n" - "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" - "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" - "ld1w z28.s, p0/z, [c_ptr3]\n" - "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n" - "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "fmla z24.s, z8.s, z2.s[0]\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - "fmla z28.s, z8.s, z3.s[0]\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" - "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z25.s, z9.s, z2.s[0]\n" - "subs %[loops], %[loops], #0x1\n" - "fmla z29.s, z9.s, z3.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "fmla z22.s, z10.s, z1.s[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "fmla z26.s, z10.s, z2.s[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "fmla z30.s, z10.s, z3.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "add a_ptr3, a_ptr3, #0x20\n" - "fmla z23.s, z11.s, z1.s[0]\n" - "fmla z27.s, z11.s, z2.s[0]\n" - "fmla z31.s, z11.s, z3.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "fmla z20.s, z12.s, z1.s[1]\n" - "fmla z24.s, z12.s, z2.s[1]\n" - "fmla z28.s, z12.s, z3.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "fmla z21.s, z13.s, z1.s[1]\n" - "fmla z25.s, z13.s, z2.s[1]\n" - "fmla z29.s, z13.s, z3.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z22.s, z14.s, z1.s[1]\n" - "fmla z26.s, z14.s, z2.s[1]\n" - "fmla z30.s, z14.s, z3.s[1]\n" - "ld1w z14.s, p7/z, 
[%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "fmla z23.s, z15.s, z1.s[1]\n" - "fmla z27.s, z15.s, z2.s[1]\n" - "fmla z31.s, z15.s, z3.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.s, z8.s, z1.s[2]\n" - "fmla z24.s, z8.s, z2.s[2]\n" - "fmla z28.s, z8.s, z3.s[2]\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "fmla z21.s, z9.s, z1.s[2]\n" - "fmla z25.s, z9.s, z2.s[2]\n" - "fmla z29.s, z9.s, z3.s[2]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "fmla z22.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "fmla z30.s, z10.s, z3.s[2]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z23.s, z11.s, z1.s[2]\n" - "fmla z27.s, z11.s, z2.s[2]\n" - "fmla z31.s, z11.s, z3.s[2]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[3]\n" - "fmla z20.s, z12.s, z1.s[3]\n" - "fmla z24.s, z12.s, z2.s[3]\n" - "fmla z28.s, z12.s, z3.s[3]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[3]\n" - "fmla z21.s, z13.s, z1.s[3]\n" - "fmla z25.s, z13.s, z2.s[3]\n" - "fmla z29.s, z13.s, z3.s[3]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[3]\n" - "fmla z22.s, z14.s, z1.s[3]\n" - "fmla z26.s, z14.s, z2.s[3]\n" - "fmla z30.s, z14.s, z3.s[3]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[3]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" - "fmla z23.s, z15.s, z1.s[3]\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" - "fmla z27.s, z15.s, z2.s[3]\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" - "fmla z31.s, z15.s, z3.s[3]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" - "fmla z20.s, z8.s, z5.s[0]\n" - "fmla z24.s, z8.s, z6.s[0]\n" - "fmla z28.s, z8.s, z7.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z17.s, z9.s, z4.s[0]\n" - "fmla z21.s, z9.s, z5.s[0]\n" - "fmla z25.s, z9.s, z6.s[0]\n" - "fmla z29.s, z9.s, z7.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[0]\n" - "fmla z22.s, z10.s, z5.s[0]\n" - "fmla z26.s, z10.s, z6.s[0]\n" - "fmla z30.s, z10.s, z7.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[0]\n" - "fmla z23.s, z11.s, z5.s[0]\n" - "fmla z27.s, z11.s, z6.s[0]\n" - "fmla z31.s, z11.s, z7.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[1]\n" - "fmla z20.s, z12.s, z5.s[1]\n" - "fmla z24.s, z12.s, z6.s[1]\n" - "fmla z28.s, z12.s, z7.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z4.s[1]\n" - "fmla z21.s, z13.s, z5.s[1]\n" - "fmla z25.s, z13.s, z6.s[1]\n" - "fmla z29.s, z13.s, z7.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z4.s[1]\n" - "fmla z22.s, z14.s, z5.s[1]\n" - "fmla z26.s, z14.s, z6.s[1]\n" - "fmla z30.s, z14.s, z7.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z4.s[1]\n" - "fmla z23.s, z15.s, z5.s[1]\n" - "fmla z27.s, z15.s, z6.s[1]\n" - "fmla z31.s, z15.s, z7.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.s, z8.s, z5.s[2]\n" - "fmla z24.s, z8.s, z6.s[2]\n" - "fmla z28.s, z8.s, z7.s[2]\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z17.s, z9.s, z4.s[2]\n" - "fmla z21.s, z9.s, z5.s[2]\n" - 
"fmla z25.s, z9.s, z6.s[2]\n" - "fmla z29.s, z9.s, z7.s[2]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[2]\n" - "fmla z22.s, z10.s, z5.s[2]\n" - "fmla z26.s, z10.s, z6.s[2]\n" - "fmla z30.s, z10.s, z7.s[2]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[2]\n" - "fmla z23.s, z11.s, z5.s[2]\n" - "fmla z27.s, z11.s, z6.s[2]\n" - "fmla z31.s, z11.s, z7.s[2]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[3]\n" - "fmla z20.s, z12.s, z5.s[3]\n" - "fmla z24.s, z12.s, z6.s[3]\n" - "fmla z28.s, z12.s, z7.s[3]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.s, z13.s, z4.s[3]\n" - "fmla z21.s, z13.s, z5.s[3]\n" - "fmla z25.s, z13.s, z6.s[3]\n" - "fmla z29.s, z13.s, z7.s[3]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.s, z14.s, z4.s[3]\n" - "fmla z22.s, z14.s, z5.s[3]\n" - "fmla z26.s, z14.s, z6.s[3]\n" - "fmla z30.s, z14.s, z7.s[3]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.s, z15.s, z4.s[3]\n" - "fmla z23.s, z15.s, z5.s[3]\n" - "fmla z27.s, z15.s, z6.s[3]\n" - "fmla z31.s, z15.s, z7.s[3]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "fmla z24.s, z8.s, z2.s[0]\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - "fmla z28.s, z8.s, z3.s[0]\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" - "fmla z21.s, z9.s, z1.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z25.s, z9.s, z2.s[0]\n" - "fmla z29.s, z9.s, z3.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "fmla z22.s, z10.s, z1.s[0]\n" - "fmla z26.s, z10.s, z2.s[0]\n" - "fmla z30.s, z10.s, z3.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "fmla z23.s, z11.s, z1.s[0]\n" - "fmla z27.s, z11.s, z2.s[0]\n" - "fmla z31.s, z11.s, z3.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "fmla z20.s, z12.s, z1.s[1]\n" - "fmla z24.s, z12.s, z2.s[1]\n" - "fmla z28.s, z12.s, z3.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "fmla z21.s, z13.s, z1.s[1]\n" - "fmla z25.s, z13.s, z2.s[1]\n" - "fmla z29.s, z13.s, z3.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z22.s, z14.s, z1.s[1]\n" - "fmla z26.s, z14.s, z2.s[1]\n" - "fmla z30.s, z14.s, z3.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "fmla z23.s, z15.s, z1.s[1]\n" - "fmla z27.s, z15.s, z2.s[1]\n" - "fmla z31.s, z15.s, z3.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "fmla z20.s, z8.s, z1.s[2]\n" - "fmla z24.s, z8.s, z2.s[2]\n" - "fmla z28.s, z8.s, z3.s[2]\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "fmla z21.s, z9.s, z1.s[2]\n" - "fmla z25.s, z9.s, z2.s[2]\n" - "fmla z29.s, z9.s, z3.s[2]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "fmla z22.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "fmla z30.s, z10.s, z3.s[2]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z23.s, z11.s, z1.s[2]\n" - "fmla z27.s, z11.s, z2.s[2]\n" - "fmla z31.s, z11.s, z3.s[2]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - 
"fmla z16.s, z12.s, z0.s[3]\n" - "fmla z20.s, z12.s, z1.s[3]\n" - "fmla z24.s, z12.s, z2.s[3]\n" - "fmla z28.s, z12.s, z3.s[3]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[3]\n" - "fmla z21.s, z13.s, z1.s[3]\n" - "fmla z25.s, z13.s, z2.s[3]\n" - "fmla z29.s, z13.s, z3.s[3]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[3]\n" - "fmla z22.s, z14.s, z1.s[3]\n" - "fmla z26.s, z14.s, z2.s[3]\n" - "fmla z30.s, z14.s, z3.s[3]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[3]\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - "fmla z23.s, z15.s, z1.s[3]\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - "fmla z27.s, z15.s, z2.s[3]\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - "fmla z31.s, z15.s, z3.s[3]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - "fmla z20.s, z8.s, z5.s[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "fmla z24.s, z8.s, z6.s[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - "fmla z28.s, z8.s, z7.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z17.s, z9.s, z4.s[0]\n" - "addvl a_ptr2, a_ptr2, #2\n" - "fmla z21.s, z9.s, z5.s[0]\n" - "addvl a_ptr3, a_ptr3, #2\n" - "fmla z25.s, z9.s, z6.s[0]\n" - "fmla z29.s, z9.s, z7.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z4.s[0]\n" - "fmla z22.s, z10.s, z5.s[0]\n" - "fmla z26.s, z10.s, z6.s[0]\n" - "fmla z30.s, z10.s, z7.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z4.s[0]\n" - "fmla z23.s, z11.s, z5.s[0]\n" - "fmla z27.s, z11.s, z6.s[0]\n" - "fmla z31.s, z11.s, z7.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[1]\n" - "fmla z20.s, z12.s, z5.s[1]\n" - "fmla z24.s, z12.s, z6.s[1]\n" - "fmla z28.s, z12.s, z7.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z4.s[1]\n" - "fmla z21.s, z13.s, z5.s[1]\n" - "fmla z25.s, z13.s, z6.s[1]\n" - "fmla z29.s, z13.s, z7.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z4.s[1]\n" - "fmla z22.s, z14.s, z5.s[1]\n" - "fmla z26.s, z14.s, z6.s[1]\n" - "fmla z30.s, z14.s, z7.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z4.s[1]\n" - "fmla z23.s, z15.s, z5.s[1]\n" - "fmla z27.s, z15.s, z6.s[1]\n" - "fmla z31.s, z15.s, z7.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z20.s, z8.s, z5.s[2]\n" - "fmla z24.s, z8.s, z6.s[2]\n" - "fmla z28.s, z8.s, z7.s[2]\n" - "fmla z17.s, z9.s, z4.s[2]\n" - "fmla z21.s, z9.s, z5.s[2]\n" - "fmla z25.s, z9.s, z6.s[2]\n" - "fmla z29.s, z9.s, z7.s[2]\n" - "fmla z18.s, z10.s, z4.s[2]\n" - "fmla z22.s, z10.s, z5.s[2]\n" - "fmla z26.s, z10.s, z6.s[2]\n" - "fmla z30.s, z10.s, z7.s[2]\n" - "fmla z19.s, z11.s, z4.s[2]\n" - "fmla z23.s, z11.s, z5.s[2]\n" - "fmla z27.s, z11.s, z6.s[2]\n" - "fmla z31.s, z11.s, z7.s[2]\n" - "fmla z16.s, z12.s, z4.s[3]\n" - "fmla z20.s, z12.s, z5.s[3]\n" - "fmla z24.s, z12.s, z6.s[3]\n" - "fmla z28.s, z12.s, z7.s[3]\n" - "fmla z17.s, z13.s, z4.s[3]\n" - "fmla z21.s, z13.s, z5.s[3]\n" - "fmla z25.s, z13.s, z6.s[3]\n" - "fmla z29.s, z13.s, z7.s[3]\n" - "fmla z18.s, z14.s, z4.s[3]\n" - "fmla z22.s, z14.s, z5.s[3]\n" - "fmla z26.s, z14.s, z6.s[3]\n" - "fmla z30.s, z14.s, z7.s[3]\n" - "fmla z19.s, z15.s, z4.s[3]\n" - "fmla z23.s, z15.s, z5.s[3]\n" - "fmla z27.s, z15.s, z6.s[3]\n" - "fmla z31.s, z15.s, z7.s[3]\n" - "cbz %[blocks], 5f\n" - "ld1w 
z8.s, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.s, z8.s, z1.s[0]\n" - "fmla z24.s, z8.s, z2.s[0]\n" - "fmla z28.s, z8.s, z3.s[0]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "fmla z21.s, z9.s, z1.s[0]\n" - "fmla z25.s, z9.s, z2.s[0]\n" - "fmla z29.s, z9.s, z3.s[0]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "fmla z22.s, z10.s, z1.s[0]\n" - "fmla z26.s, z10.s, z2.s[0]\n" - "fmla z30.s, z10.s, z3.s[0]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "fmla z23.s, z11.s, z1.s[0]\n" - "fmla z27.s, z11.s, z2.s[0]\n" - "fmla z31.s, z11.s, z3.s[0]\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.s, z12.s, z1.s[1]\n" - "fmla z24.s, z12.s, z2.s[1]\n" - "fmla z28.s, z12.s, z3.s[1]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "fmla z21.s, z13.s, z1.s[1]\n" - "fmla z25.s, z13.s, z2.s[1]\n" - "fmla z29.s, z13.s, z3.s[1]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z22.s, z14.s, z1.s[1]\n" - "fmla z26.s, z14.s, z2.s[1]\n" - "fmla z30.s, z14.s, z3.s[1]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "fmla z23.s, z15.s, z1.s[1]\n" - "fmla z27.s, z15.s, z2.s[1]\n" - "fmla z31.s, z15.s, z3.s[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "fmla z20.s, z8.s, z1.s[2]\n" - "fmla z24.s, z8.s, z2.s[2]\n" - "fmla z28.s, z8.s, z3.s[2]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "fmla z21.s, z9.s, z1.s[2]\n" - "fmla z25.s, z9.s, z2.s[2]\n" - "fmla z29.s, z9.s, z3.s[2]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "fmla z22.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "fmla z30.s, z10.s, z3.s[2]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z23.s, z11.s, z1.s[2]\n" - "fmla z27.s, z11.s, z2.s[2]\n" - "fmla z31.s, z11.s, z3.s[2]\n" - "b 5f\n" - "4:\n" - "fmla z16.s, z8.s, z0.s[0]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "fmla z20.s, z8.s, z1.s[0]\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" - "fmla z24.s, z8.s, z2.s[0]\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" - "fmla z28.s, z8.s, z3.s[0]\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "fmla z17.s, z9.s, z0.s[0]\n" - "ld1rqw z6.s, p6/z, [a_ptr2]\n" - "fmla z21.s, z9.s, z1.s[0]\n" - "ld1rqw z7.s, p6/z, [a_ptr3]\n" - "fmla z25.s, z9.s, z2.s[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "fmla z29.s, z9.s, z3.s[0]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "fmla z18.s, z10.s, z0.s[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" - "fmla z22.s, z10.s, z1.s[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" - "fmla z26.s, z10.s, z2.s[0]\n" - "addvl a_ptr3, a_ptr3, #1\n" - "fmla z30.s, z10.s, z3.s[0]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z19.s, z11.s, z0.s[0]\n" - "fmla z23.s, z11.s, z1.s[0]\n" - "fmla z27.s, z11.s, z2.s[0]\n" - "fmla z31.s, z11.s, z3.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z16.s, z12.s, z0.s[1]\n" - "fmla z20.s, z12.s, z1.s[1]\n" - "fmla z24.s, z12.s, z2.s[1]\n" - "fmla z28.s, z12.s, z3.s[1]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "fmla z17.s, z13.s, z0.s[1]\n" - "fmla z21.s, z13.s, z1.s[1]\n" - 
"fmla z25.s, z13.s, z2.s[1]\n" - "fmla z29.s, z13.s, z3.s[1]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "fmla z18.s, z14.s, z0.s[1]\n" - "fmla z22.s, z14.s, z1.s[1]\n" - "fmla z26.s, z14.s, z2.s[1]\n" - "fmla z30.s, z14.s, z3.s[1]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z19.s, z15.s, z0.s[1]\n" - "fmla z23.s, z15.s, z1.s[1]\n" - "fmla z27.s, z15.s, z2.s[1]\n" - "fmla z31.s, z15.s, z3.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z16.s, z8.s, z0.s[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "fmla z20.s, z8.s, z1.s[2]\n" - "fmla z24.s, z8.s, z2.s[2]\n" - "fmla z28.s, z8.s, z3.s[2]\n" - "fmla z17.s, z9.s, z0.s[2]\n" - "fmla z21.s, z9.s, z1.s[2]\n" - "fmla z25.s, z9.s, z2.s[2]\n" - "fmla z29.s, z9.s, z3.s[2]\n" - "fmla z18.s, z10.s, z0.s[2]\n" - "fmla z22.s, z10.s, z1.s[2]\n" - "fmla z26.s, z10.s, z2.s[2]\n" - "fmla z30.s, z10.s, z3.s[2]\n" - "fmla z19.s, z11.s, z0.s[2]\n" - "fmla z23.s, z11.s, z1.s[2]\n" - "fmla z27.s, z11.s, z2.s[2]\n" - "fmla z31.s, z11.s, z3.s[2]\n" - "fmla z16.s, z12.s, z0.s[3]\n" - "fmla z20.s, z12.s, z1.s[3]\n" - "fmla z24.s, z12.s, z2.s[3]\n" - "fmla z28.s, z12.s, z3.s[3]\n" - "fmla z17.s, z13.s, z0.s[3]\n" - "fmla z21.s, z13.s, z1.s[3]\n" - "fmla z25.s, z13.s, z2.s[3]\n" - "fmla z29.s, z13.s, z3.s[3]\n" - "fmla z18.s, z14.s, z0.s[3]\n" - "fmla z22.s, z14.s, z1.s[3]\n" - "fmla z26.s, z14.s, z2.s[3]\n" - "fmla z30.s, z14.s, z3.s[3]\n" - "fmla z19.s, z15.s, z0.s[3]\n" - "fmla z23.s, z15.s, z1.s[3]\n" - "fmla z27.s, z15.s, z2.s[3]\n" - "fmla z31.s, z15.s, z3.s[3]\n" - "cbz %[blocks], 5f\n" - "ld1w z8.s, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[0]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "fmla z20.s, z8.s, z5.s[0]\n" - "fmla z24.s, z8.s, z6.s[0]\n" - "fmla z28.s, z8.s, z7.s[0]\n" - "fmla z17.s, z9.s, z4.s[0]\n" - "fmla z21.s, z9.s, z5.s[0]\n" - "fmla z25.s, z9.s, z6.s[0]\n" - "fmla z29.s, z9.s, z7.s[0]\n" - "fmla z18.s, z10.s, z4.s[0]\n" - "fmla z22.s, z10.s, z5.s[0]\n" - "fmla z26.s, z10.s, z6.s[0]\n" - "fmla z30.s, z10.s, z7.s[0]\n" - "fmla z19.s, z11.s, z4.s[0]\n" - "fmla z23.s, z11.s, z5.s[0]\n" - "fmla z27.s, z11.s, z6.s[0]\n" - "fmla z31.s, z11.s, z7.s[0]\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "fmla z16.s, z12.s, z4.s[1]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "fmla z20.s, z12.s, z5.s[1]\n" - "fmla z24.s, z12.s, z6.s[1]\n" - "fmla z28.s, z12.s, z7.s[1]\n" - "fmla z17.s, z13.s, z4.s[1]\n" - "fmla z21.s, z13.s, z5.s[1]\n" - "fmla z25.s, z13.s, z6.s[1]\n" - "fmla z29.s, z13.s, z7.s[1]\n" - "fmla z18.s, z14.s, z4.s[1]\n" - "fmla z22.s, z14.s, z5.s[1]\n" - "fmla z26.s, z14.s, z6.s[1]\n" - "fmla z30.s, z14.s, z7.s[1]\n" - "fmla z19.s, z15.s, z4.s[1]\n" - "fmla z23.s, z15.s, z5.s[1]\n" - "fmla z27.s, z15.s, z6.s[1]\n" - "fmla z31.s, z15.s, z7.s[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "fmla z16.s, z8.s, z4.s[2]\n" - "fmla z20.s, z8.s, z5.s[2]\n" - "fmla z24.s, z8.s, z6.s[2]\n" - "fmla z28.s, z8.s, z7.s[2]\n" - "fmla z17.s, z9.s, z4.s[2]\n" - "fmla z21.s, z9.s, z5.s[2]\n" - "fmla z25.s, z9.s, 
z6.s[2]\n" - "fmla z29.s, z9.s, z7.s[2]\n" - "fmla z18.s, z10.s, z4.s[2]\n" - "fmla z22.s, z10.s, z5.s[2]\n" - "fmla z26.s, z10.s, z6.s[2]\n" - "fmla z30.s, z10.s, z7.s[2]\n" - "fmla z19.s, z11.s, z4.s[2]\n" - "fmla z23.s, z11.s, z5.s[2]\n" - "fmla z27.s, z11.s, z6.s[2]\n" - "fmla z31.s, z11.s, z7.s[2]\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "st1w z20.s, p0, [c_ptr1]\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "fmax z28.s, p7/m, z28.s, z14.s\n" - "fmax z29.s, p7/m, z29.s, z14.s\n" - "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "fmax z30.s, p7/m, z30.s, z14.s\n" - "fmin z28.s, p7/m, z28.s, z15.s\n" - "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" - "fmin z29.s, p7/m, z29.s, z15.s\n" - "fmax z31.s, p7/m, z31.s, z14.s\n" - "fmin z30.s, p7/m, z30.s, z15.s\n" - "st1w z24.s, p0, [c_ptr2]\n" - "fmin z31.s, p7/m, z31.s, z15.s\n" - "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" - "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" - "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" - "st1w z28.s, p0, [c_ptr3]\n" - "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n" - "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n" - "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - } - - } - } -} - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp new file mode 100644 index 0000000000..f0cc70b76e --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#pragma once +#ifdef __ARM_FEATURE_SVE + +#include "../std_transforms_sve.hpp" + +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg<float>, \ + size_t, size_t, \ + const float *, \ + IndirectOutputArg<float>, \ + const float *, Activation, bool + +namespace arm_gemm +{ + +// Actual kernel implementations +void sve_hybrid_fp32_mla_6x4VL( ARGLIST ); + +class cls_sve_hybrid_fp32_mla_6x4VL +{ +public: + typedef float operand_type; + typedef float result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 6; + } + + static unsigned int out_width() + { + return get_vector_length<float>() * 4; + } + + static constexpr unsigned int k_unroll() + { + return 1; + } + + static constexpr bool supports_accumulate() + { + return true; + } + + StdTransformsSVE<operand_type, result_type, 6, 4, 1> transforms = {}; + + // Default to the generic kernel + kern_type kernel=sve_hybrid_fp32_mla_6x4VL; + + cls_sve_hybrid_fp32_mla_6x4VL(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp new file mode 100644 index 0000000000..3a6422abd1 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp @@ -0,0 +1,2236 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef __ARM_FEATURE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> + +namespace arm_gemm { + +void sve_hybrid_fp32_mla_6x4VL ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg, + size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg, + const float *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); + float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const float *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast<float>(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + "ptrue p5.b\n" + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 71f\n" + "cmp %x[M], #0x4\n" + "bgt 57f\n" + "beq 43f\n" + "cmp %x[M], #0x2\n" + "bgt 29f\n" + "beq 15f\n" + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x14, %x[bias]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x13, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x16\n" + "incw x19\n" + "whilelt p3.s, x19, x16\n" + "incw x19\n" + "whilelt p2.s, x19, x16\n" + "incw x19\n" + "whilelt p1.s, x19, x16\n" + "cbz x14, 4f\n" + "ld1w { z8.s }, p5/Z, [x14]\n" + "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "b 6f\n" + "4:" // Height 1: no bias + "tbz %x[flags], #0, 5f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "b 6f\n" + "5:" // Height 1: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "6:" // Height 1: setup done + "mov x12, #0x0\n" + "7:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr],
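The flag word assembled above is what the new kernel's assembly probes with tbz: bit 0 requests accumulation into C, bit 1 enables the min/max activation clamp, bit 2 marks an indirect (pointer-array) output, and bit 3 an indirect input. Note the deliberate fall-through in the activation switch: BoundedReLU sets maxval and then reuses the ReLU case to set minval and the clamp bit. As named constants (the names are mine, not the library's):

    // Hedged reconstruction of the flag layout tested by tbz in the listing.
    constexpr unsigned long FLAG_ACCUMULATE      = 0x1; // tbz %x[flags], #0: load C vs zero accumulators
    constexpr unsigned long FLAG_ACTIVATION      = 0x2; // tbz %x[flags], #1: apply fmin/fmax clamp
    constexpr unsigned long FLAG_INDIRECT_OUTPUT = 0x4; // tbz %x[flags], #2: output rows via pointer array
    constexpr unsigned long FLAG_INDIRECT_INPUT  = 0x8; // tbz %x[flags], #3: input rows via pointer array
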
%[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 8f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "cbnz x12, 9f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "b 9f\n" + "8:" // Height 1: setup direct input + "mov x10, %x[input_ptr]\n" + "9:" // Height 1: input setup done + "cmp x11, #0x4\n" + "ble 11f\n" + "10:" // Height 1: Multiply loop: Main loop head + "ld1w { z6.s }, p5/Z, [x15]\n" + "whilelt p0.s, XZR, x11\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x10]\n" + "fmla z8.s, z6.s, z0.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + "fmla z9.s, z7.s, z0.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "cmp x11, #0x4\n" + "fmla z10.s, z6.s, z0.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla z11.s, z7.s, z0.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.s, z6.s, z0.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[3]\n" + "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[3]\n" + "fmla z11.s, z7.s, z0.s[3]\n" + "bgt 10b\n" + "11:" // Height 1: Multiply loop: Single iteration only + "ld1w { z6.s }, p5/Z, [x15]\n" + "whilelt p0.s, XZR, x11\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x10]\n" + "fmla z8.s, z6.s, z0.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + "fmla z9.s, z7.s, z0.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[0]\n" + "fmla z11.s, z7.s, z0.s[0]\n" + "ble 12f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z9.s, z7.s, z0.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[1]\n" + "addvl x15, x15, #4\n" + "fmla z11.s, z7.s, z0.s[1]\n" + "ble 12f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z9.s, z7.s, z0.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[2]\n" + "addvl x15, x15, #4\n" + "fmla z11.s, z7.s, z0.s[2]\n" + "ble 12f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[3]\n" + "fmla z11.s, z7.s, z0.s[3]\n" + "12:" // Height 1: Multiply loop: multiply skip + "prfm pldl1keep, 
[x10, #0x80]\n" + "add x12, x12, #0x1\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 7b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "tbz %x[flags], #1, 13f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "13:" // Height 1: No activation + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "14:" // Height 1: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 3b\n" + "b 86f\n" + "15:" // Height 2 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 16f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19, LSL #2\n" + "b 17f\n" + "16:" // Height 2: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "17:" // Height 2: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x16\n" + "incw x19\n" + "whilelt p3.s, x19, x16\n" + "incw x19\n" + "whilelt p2.s, x19, x16\n" + "incw x19\n" + "whilelt p1.s, x19, x16\n" + "cbz x14, 18f\n" + "ld1w { z8.s }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "mov z13.d, z9.d\n" + "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "b 20f\n" + "18:" // Height 2: no bias + "tbz %x[flags], #0, 19f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "b 20f\n" + "19:" // Height 2: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "20:" // Height 2: setup done + "mov x12, #0x0\n" + "21:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 22f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x12, 23f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "b 23f\n" + "22:" // Height 2: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #2\n" + "23:" // Height 2: input setup done + "cmp x11, #0x4\n" + "ble 25f\n" + "24:" // Height 2: Multiply loop: Main loop head + "ld1w { z6.s }, p5/Z, [x15]\n" + "whilelt p0.s, XZR, x11\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x10]\n" + "fmla 
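Each column-loop iteration builds four governing predicates (p4 down to p1) with whilelt against the remaining N, so a partial final tile needs no scalar fallback: lanes past N simply go inactive for the loads, FMAs, and stores alike. The same idiom in ACLE SVE intrinsics, a minimal sketch assuming arm_sve.h and an SVE-enabled compile (not the kernel's actual epilogue):

    #include <arm_sve.h>
    #include <cstdint>
    // Predicated clamp-and-store over n floats, mirroring the whilelt/st1w
    // tail handling in the listing.
    void clamp_row(float *c, uint64_t n, float lo, float hi) {
        for (uint64_t i = 0; i < n; i += svcntw()) {
            svbool_t p = svwhilelt_b32(i, n);       // lanes past n go inactive
            svfloat32_t v = svld1_f32(p, c + i);
            v = svmin_f32_m(p, v, svdup_f32(hi));   // fmin against the upper bound
            v = svmax_f32_m(p, v, svdup_f32(lo));   // fmax against the lower bound
            svst1_f32(p, c + i, v);
        }
    }
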
z8.s, z6.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.s, z7.s, z0.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "fmla z12.s, z6.s, z1.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "cmp x11, #0x4\n" + "fmla z13.s, z7.s, z1.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla z10.s, z6.s, z0.s[0]\n" + "fmla z14.s, z6.s, z1.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[0]\n" + "fmla z15.s, z7.s, z1.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[1]\n" + "fmla z12.s, z6.s, z1.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[1]\n" + "fmla z13.s, z7.s, z1.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.s, z6.s, z0.s[1]\n" + "fmla z14.s, z6.s, z1.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[1]\n" + "fmla z15.s, z7.s, z1.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[2]\n" + "fmla z12.s, z6.s, z1.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[2]\n" + "fmla z13.s, z7.s, z1.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[2]\n" + "fmla z14.s, z6.s, z1.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[2]\n" + "fmla z15.s, z7.s, z1.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[3]\n" + "fmla z12.s, z6.s, z1.s[3]\n" + "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[3]\n" + "fmla z13.s, z7.s, z1.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[3]\n" + "fmla z14.s, z6.s, z1.s[3]\n" + "fmla z11.s, z7.s, z0.s[3]\n" + "fmla z15.s, z7.s, z1.s[3]\n" + "bgt 24b\n" + "25:" // Height 2: Multiply loop: Single iteration only + "ld1w { z6.s }, p5/Z, [x15]\n" + "whilelt p0.s, XZR, x11\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x10]\n" + "fmla z8.s, z6.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.s, z7.s, z0.s[0]\n" + "add x28, x28, #0x10\n" + "fmla z12.s, z6.s, z1.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z13.s, z7.s, z1.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[0]\n" + "fmla z14.s, z6.s, z1.s[0]\n" + "fmla z11.s, z7.s, z0.s[0]\n" + "fmla z15.s, z7.s, z1.s[0]\n" + "ble 26f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.s, z6.s, z1.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[1]\n" + "fmla z13.s, z7.s, z1.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[1]\n" + "fmla z14.s, z6.s, z1.s[1]\n" + "fmla z11.s, z7.s, z0.s[1]\n" + "fmla z15.s, z7.s, z1.s[1]\n" + "ble 26f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.s, z6.s, z1.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[2]\n" + "fmla z13.s, z7.s, z1.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[2]\n" + "fmla z14.s, z6.s, z1.s[2]\n" + "fmla z11.s, z7.s, z0.s[2]\n" + "fmla z15.s, z7.s, z1.s[2]\n" + "ble 26f\n" + "ld1w { z6.s }, 
p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z12.s, z6.s, z1.s[3]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[3]\n" + "fmla z13.s, z7.s, z1.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[3]\n" + "fmla z14.s, z6.s, z1.s[3]\n" + "fmla z11.s, z7.s, z0.s[3]\n" + "fmla z15.s, z7.s, z1.s[3]\n" + "26:" // Height 2: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 21b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "tbz %x[flags], #1, 27f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "fmax z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z15.s, p5/M, z15.s, z1.s\n" + "27:" // Height 2: No activation + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "28:" // Height 2: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 17b\n" + "b 86f\n" + "29:" // Height 3 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 30f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "b 31f\n" + "30:" // Height 3: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "31:" // Height 3: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x16\n" + "incw x19\n" + "whilelt p3.s, x19, x16\n" + "incw x19\n" + "whilelt p2.s, x19, x16\n" + "incw x19\n" + "whilelt p1.s, x19, x16\n" + "cbz x14, 32f\n" + "ld1w { z8.s }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "mov z13.d, z9.d\n" + "addvl x14, x14, #4\n" + "mov z17.d, z9.d\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "b 34f\n" + "32:" // Height 3: no bias + "tbz %x[flags], #0, 33f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, 
#2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "b 34f\n" + "33:" // Height 3: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "34:" // Height 3: setup done + "mov x12, #0x0\n" + "35:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 36f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x12, 37f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "b 37f\n" + "36:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "37:" // Height 3: input setup done + "cmp x11, #0x4\n" + "ble 39f\n" + "38:" // Height 3: Multiply loop: Main loop head + "ld1w { z6.s }, p5/Z, [x15]\n" + "whilelt p0.s, XZR, x11\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x10]\n" + "fmla z8.s, z6.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.s, z7.s, z0.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.s, z6.s, z1.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x26, x26, #0x10\n" + "fmla z16.s, z6.s, z2.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "cmp x11, #0x4\n" + "fmla z13.s, z7.s, z1.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla z17.s, z7.s, z2.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla z10.s, z6.s, z0.s[0]\n" + "fmla z14.s, z6.s, z1.s[0]\n" + "fmla z18.s, z6.s, z2.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[0]\n" + "fmla z15.s, z7.s, z1.s[0]\n" + "fmla z19.s, z7.s, z2.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[1]\n" + "fmla z12.s, z6.s, z1.s[1]\n" + "fmla z16.s, z6.s, z2.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[1]\n" + "fmla z13.s, z7.s, z1.s[1]\n" + "fmla z17.s, z7.s, z2.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.s, z6.s, z0.s[1]\n" + "fmla z14.s, z6.s, z1.s[1]\n" + "fmla z18.s, z6.s, z2.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[1]\n" + "fmla z15.s, z7.s, z1.s[1]\n" + "fmla z19.s, z7.s, z2.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[2]\n" + "fmla z12.s, z6.s, z1.s[2]\n" + "fmla z16.s, z6.s, z2.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[2]\n" + "fmla z13.s, z7.s, z1.s[2]\n" + "fmla z17.s, z7.s, z2.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[2]\n" + "fmla z14.s, z6.s, z1.s[2]\n" + "fmla z18.s, z6.s, z2.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[2]\n" + "fmla z15.s, z7.s, z1.s[2]\n" + "fmla z19.s, z7.s, z2.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, 
#-3, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[3]\n" + "fmla z12.s, z6.s, z1.s[3]\n" + "fmla z16.s, z6.s, z2.s[3]\n" + "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[3]\n" + "fmla z13.s, z7.s, z1.s[3]\n" + "fmla z17.s, z7.s, z2.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[3]\n" + "fmla z14.s, z6.s, z1.s[3]\n" + "fmla z18.s, z6.s, z2.s[3]\n" + "fmla z11.s, z7.s, z0.s[3]\n" + "fmla z15.s, z7.s, z1.s[3]\n" + "fmla z19.s, z7.s, z2.s[3]\n" + "bgt 38b\n" + "39:" // Height 3: Multiply loop: Single iteration only + "ld1w { z6.s }, p5/Z, [x15]\n" + "whilelt p0.s, XZR, x11\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x10]\n" + "fmla z8.s, z6.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.s, z7.s, z0.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.s, z6.s, z1.s[0]\n" + "add x26, x26, #0x10\n" + "fmla z13.s, z7.s, z1.s[0]\n" + "fmla z16.s, z6.s, z2.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z17.s, z7.s, z2.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[0]\n" + "fmla z14.s, z6.s, z1.s[0]\n" + "fmla z18.s, z6.s, z2.s[0]\n" + "fmla z11.s, z7.s, z0.s[0]\n" + "fmla z15.s, z7.s, z1.s[0]\n" + "fmla z19.s, z7.s, z2.s[0]\n" + "ble 40f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.s, z6.s, z1.s[1]\n" + "fmla z16.s, z6.s, z2.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[1]\n" + "fmla z13.s, z7.s, z1.s[1]\n" + "fmla z17.s, z7.s, z2.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[1]\n" + "fmla z14.s, z6.s, z1.s[1]\n" + "fmla z18.s, z6.s, z2.s[1]\n" + "fmla z11.s, z7.s, z0.s[1]\n" + "fmla z15.s, z7.s, z1.s[1]\n" + "fmla z19.s, z7.s, z2.s[1]\n" + "ble 40f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.s, z6.s, z1.s[2]\n" + "fmla z16.s, z6.s, z2.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[2]\n" + "fmla z13.s, z7.s, z1.s[2]\n" + "fmla z17.s, z7.s, z2.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[2]\n" + "fmla z14.s, z6.s, z1.s[2]\n" + "fmla z18.s, z6.s, z2.s[2]\n" + "fmla z11.s, z7.s, z0.s[2]\n" + "fmla z15.s, z7.s, z1.s[2]\n" + "fmla z19.s, z7.s, z2.s[2]\n" + "ble 40f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z12.s, z6.s, z1.s[3]\n" + "fmla z16.s, z6.s, z2.s[3]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[3]\n" + "fmla z13.s, z7.s, z1.s[3]\n" + "fmla z17.s, z7.s, z2.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[3]\n" + "fmla z14.s, z6.s, z1.s[3]\n" + "fmla z18.s, z6.s, z2.s[3]\n" + "fmla z11.s, z7.s, z0.s[3]\n" + "fmla z15.s, z7.s, z1.s[3]\n" + "fmla z19.s, z7.s, z2.s[3]\n" + "40:" // Height 3: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 35b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, 
#0x0]\n" + "tbz %x[flags], #1, 41f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "fmax z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmin z16.s, p5/M, z16.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z15.s, p5/M, z15.s, z1.s\n" + "fmax z16.s, p5/M, z16.s, z1.s\n" + "fmin z17.s, p5/M, z17.s, z0.s\n" + "fmin z18.s, p5/M, z18.s, z0.s\n" + "fmin z19.s, p5/M, z19.s, z0.s\n" + "fmax z17.s, p5/M, z17.s, z1.s\n" + "fmax z18.s, p5/M, z18.s, z1.s\n" + "fmax z19.s, p5/M, z19.s, z1.s\n" + "41:" // Height 3: No activation + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "42:" // Height 3: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 31b\n" + "b 86f\n" + "43:" // Height 4 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 44f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "b 45f\n" + "44:" // Height 4: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "45:" // Height 4: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x16\n" + "incw x19\n" + "whilelt p3.s, x19, x16\n" + "incw x19\n" + "whilelt p2.s, x19, x16\n" + "incw x19\n" + "whilelt p1.s, x19, x16\n" + "cbz x14, 46f\n" + "ld1w { z8.s }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "mov z20.d, z8.d\n" + "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "mov z13.d, z9.d\n" + "mov z17.d, z9.d\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "mov z21.d, z9.d\n" + "mov z22.d, z10.d\n" + "mov z23.d, z11.d\n" + "b 48f\n" + "46:" // Height 4: no bias + "tbz %x[flags], #0, 47f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, 
MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" + "b 48f\n" + "47:" // Height 4: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "48:" // Height 4: setup done + "mov x12, #0x0\n" + "49:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 50f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x12, 51f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "b 51f\n" + "50:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "51:" // Height 4: input setup done + "cmp x11, #0x4\n" + "ble 53f\n" + "52:" // Height 4: Multiply loop: Main loop head + "ld1w { z6.s }, p5/Z, [x15]\n" + "whilelt p0.s, XZR, x11\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x10]\n" + "fmla z8.s, z6.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.s, z7.s, z0.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.s, z6.s, z1.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z16.s, z6.s, z2.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x24, x24, #0x10\n" + "fmla z13.s, z7.s, z1.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x4\n" + "fmla z20.s, z6.s, z3.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z17.s, z7.s, z2.s[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla z21.s, z7.s, z3.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla z10.s, z6.s, z0.s[0]\n" + "fmla z14.s, z6.s, z1.s[0]\n" + "fmla z18.s, z6.s, z2.s[0]\n" + "fmla z22.s, z6.s, z3.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[0]\n" + "fmla z15.s, z7.s, z1.s[0]\n" + "fmla z19.s, z7.s, z2.s[0]\n" + "fmla z23.s, z7.s, z3.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[1]\n" + "fmla z12.s, z6.s, z1.s[1]\n" + "fmla z16.s, z6.s, z2.s[1]\n" + "fmla z20.s, z6.s, z3.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[1]\n" + "fmla z13.s, z7.s, z1.s[1]\n" + "fmla z17.s, z7.s, z2.s[1]\n" + "fmla z21.s, z7.s, z3.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.s, z6.s, z0.s[1]\n" + "fmla z14.s, z6.s, z1.s[1]\n" + "fmla z18.s, z6.s, z2.s[1]\n" + "fmla z22.s, z6.s, z3.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[1]\n" + "fmla z15.s, z7.s, z1.s[1]\n" + "fmla 
z19.s, z7.s, z2.s[1]\n" + "fmla z23.s, z7.s, z3.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[2]\n" + "fmla z12.s, z6.s, z1.s[2]\n" + "fmla z16.s, z6.s, z2.s[2]\n" + "fmla z20.s, z6.s, z3.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[2]\n" + "fmla z13.s, z7.s, z1.s[2]\n" + "fmla z17.s, z7.s, z2.s[2]\n" + "fmla z21.s, z7.s, z3.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[2]\n" + "fmla z14.s, z6.s, z1.s[2]\n" + "fmla z18.s, z6.s, z2.s[2]\n" + "fmla z22.s, z6.s, z3.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[2]\n" + "fmla z15.s, z7.s, z1.s[2]\n" + "fmla z19.s, z7.s, z2.s[2]\n" + "fmla z23.s, z7.s, z3.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[3]\n" + "fmla z12.s, z6.s, z1.s[3]\n" + "fmla z16.s, z6.s, z2.s[3]\n" + "fmla z20.s, z6.s, z3.s[3]\n" + "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[3]\n" + "fmla z13.s, z7.s, z1.s[3]\n" + "fmla z17.s, z7.s, z2.s[3]\n" + "fmla z21.s, z7.s, z3.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[3]\n" + "fmla z14.s, z6.s, z1.s[3]\n" + "fmla z18.s, z6.s, z2.s[3]\n" + "fmla z22.s, z6.s, z3.s[3]\n" + "fmla z11.s, z7.s, z0.s[3]\n" + "fmla z15.s, z7.s, z1.s[3]\n" + "fmla z19.s, z7.s, z2.s[3]\n" + "fmla z23.s, z7.s, z3.s[3]\n" + "bgt 52b\n" + "53:" // Height 4: Multiply loop: Single iteration only + "ld1w { z6.s }, p5/Z, [x15]\n" + "whilelt p0.s, XZR, x11\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x10]\n" + "fmla z8.s, z6.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.s, z7.s, z0.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.s, z6.s, z1.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z16.s, z6.s, z2.s[0]\n" + "add x24, x24, #0x10\n" + "fmla z13.s, z7.s, z1.s[0]\n" + "fmla z17.s, z7.s, z2.s[0]\n" + "fmla z20.s, z6.s, z3.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z21.s, z7.s, z3.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[0]\n" + "fmla z14.s, z6.s, z1.s[0]\n" + "fmla z18.s, z6.s, z2.s[0]\n" + "fmla z22.s, z6.s, z3.s[0]\n" + "fmla z11.s, z7.s, z0.s[0]\n" + "fmla z15.s, z7.s, z1.s[0]\n" + "fmla z19.s, z7.s, z2.s[0]\n" + "fmla z23.s, z7.s, z3.s[0]\n" + "ble 54f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.s, z6.s, z1.s[1]\n" + "fmla z16.s, z6.s, z2.s[1]\n" + "fmla z20.s, z6.s, z3.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[1]\n" + "fmla z13.s, z7.s, z1.s[1]\n" + "fmla z17.s, z7.s, z2.s[1]\n" + "fmla z21.s, z7.s, z3.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[1]\n" + "fmla z14.s, z6.s, z1.s[1]\n" + "fmla z18.s, z6.s, z2.s[1]\n" + "fmla z22.s, z6.s, z3.s[1]\n" + "fmla z11.s, z7.s, z0.s[1]\n" + "fmla z15.s, z7.s, z1.s[1]\n" + "fmla z19.s, z7.s, z2.s[1]\n" + "fmla z23.s, z7.s, z3.s[1]\n" + "ble 54f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.s, z6.s, z1.s[2]\n" + "fmla z16.s, z6.s, z2.s[2]\n" + "fmla z20.s, z6.s, z3.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[2]\n" + "fmla z13.s, 
z7.s, z1.s[2]\n" + "fmla z17.s, z7.s, z2.s[2]\n" + "fmla z21.s, z7.s, z3.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[2]\n" + "fmla z14.s, z6.s, z1.s[2]\n" + "fmla z18.s, z6.s, z2.s[2]\n" + "fmla z22.s, z6.s, z3.s[2]\n" + "fmla z11.s, z7.s, z0.s[2]\n" + "fmla z15.s, z7.s, z1.s[2]\n" + "fmla z19.s, z7.s, z2.s[2]\n" + "fmla z23.s, z7.s, z3.s[2]\n" + "ble 54f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z12.s, z6.s, z1.s[3]\n" + "fmla z16.s, z6.s, z2.s[3]\n" + "fmla z20.s, z6.s, z3.s[3]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[3]\n" + "fmla z13.s, z7.s, z1.s[3]\n" + "fmla z17.s, z7.s, z2.s[3]\n" + "fmla z21.s, z7.s, z3.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[3]\n" + "fmla z14.s, z6.s, z1.s[3]\n" + "fmla z18.s, z6.s, z2.s[3]\n" + "fmla z22.s, z6.s, z3.s[3]\n" + "fmla z11.s, z7.s, z0.s[3]\n" + "fmla z15.s, z7.s, z1.s[3]\n" + "fmla z19.s, z7.s, z2.s[3]\n" + "fmla z23.s, z7.s, z3.s[3]\n" + "54:" // Height 4: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 49b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbz %x[flags], #1, 55f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "fmax z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmin z16.s, p5/M, z16.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z15.s, p5/M, z15.s, z1.s\n" + "fmax z16.s, p5/M, z16.s, z1.s\n" + "fmin z17.s, p5/M, z17.s, z0.s\n" + "fmin z18.s, p5/M, z18.s, z0.s\n" + "fmin z19.s, p5/M, z19.s, z0.s\n" + "fmin z20.s, p5/M, z20.s, z0.s\n" + "fmax z17.s, p5/M, z17.s, z1.s\n" + "fmax z18.s, p5/M, z18.s, z1.s\n" + "fmax z19.s, p5/M, z19.s, z1.s\n" + "fmax z20.s, p5/M, z20.s, z1.s\n" + "fmin z21.s, p5/M, z21.s, z0.s\n" + "fmin z22.s, p5/M, z22.s, z0.s\n" + "fmin z23.s, p5/M, z23.s, z0.s\n" + "fmax z21.s, p5/M, z21.s, z1.s\n" + "fmax z22.s, p5/M, z22.s, z1.s\n" + "fmax z23.s, p5/M, z23.s, z1.s\n" + "55:" // Height 4: No activation + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1w { z20.s }, p4, [x25]\n" + "st1w { z21.s }, p3, [x25, #1, MUL 
VL]\n" + "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "56:" // Height 4: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 45b\n" + "b 86f\n" + "57:" // Height 5 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 58f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 59f\n" + "58:" // Height 5: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "59:" // Height 5: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x16\n" + "incw x19\n" + "whilelt p3.s, x19, x16\n" + "incw x19\n" + "whilelt p2.s, x19, x16\n" + "incw x19\n" + "whilelt p1.s, x19, x16\n" + "cbz x14, 60f\n" + "ld1w { z8.s }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "mov z20.d, z8.d\n" + "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "mov z13.d, z9.d\n" + "mov z17.d, z9.d\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "mov z21.d, z9.d\n" + "mov z22.d, z10.d\n" + "mov z23.d, z11.d\n" + "mov z24.d, z8.d\n" + "mov z25.d, z9.d\n" + "mov z26.d, z10.d\n" + "mov z27.d, z11.d\n" + "b 62f\n" + "60:" // Height 5: no bias + "tbz %x[flags], #0, 61f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x23]\n" + "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n" + "b 62f\n" + "61:" // Height 5: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "62:" // Height 5: setup done + "mov x12, #0x0\n" + "63:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 64f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add 
x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x12, 65f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "b 65f\n" + "64:" // Height 5: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "add x22, x24, x19, LSL #2\n" + "65:" // Height 5: input setup done + "cmp x11, #0x4\n" + "ble 67f\n" + "66:" // Height 5: Multiply loop: Main loop head + "ld1w { z6.s }, p5/Z, [x15]\n" + "whilelt p0.s, XZR, x11\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x10]\n" + "fmla z8.s, z6.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.s, z7.s, z0.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.s, z6.s, z1.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z16.s, z6.s, z2.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "fmla z13.s, z7.s, z1.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x22, x22, #0x10\n" + "fmla z20.s, z6.s, z3.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x4\n" + "fmla z24.s, z6.s, z4.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z17.s, z7.s, z2.s[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla z21.s, z7.s, z3.s[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla z25.s, z7.s, z4.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla z14.s, z6.s, z1.s[0]\n" + "fmla z18.s, z6.s, z2.s[0]\n" + "fmla z22.s, z6.s, z3.s[0]\n" + "fmla z26.s, z6.s, z4.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[0]\n" + "fmla z15.s, z7.s, z1.s[0]\n" + "fmla z19.s, z7.s, z2.s[0]\n" + "fmla z23.s, z7.s, z3.s[0]\n" + "fmla z27.s, z7.s, z4.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[1]\n" + "fmla z12.s, z6.s, z1.s[1]\n" + "fmla z16.s, z6.s, z2.s[1]\n" + "fmla z20.s, z6.s, z3.s[1]\n" + "fmla z24.s, z6.s, z4.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[1]\n" + "fmla z13.s, z7.s, z1.s[1]\n" + "fmla z17.s, z7.s, z2.s[1]\n" + "fmla z21.s, z7.s, z3.s[1]\n" + "fmla z25.s, z7.s, z4.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.s, z6.s, z0.s[1]\n" + "fmla z14.s, z6.s, z1.s[1]\n" + "fmla z18.s, z6.s, z2.s[1]\n" + "fmla z22.s, z6.s, z3.s[1]\n" + "fmla z26.s, z6.s, z4.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[1]\n" + "fmla z15.s, z7.s, z1.s[1]\n" + "fmla z19.s, z7.s, z2.s[1]\n" + "fmla z23.s, z7.s, z3.s[1]\n" + "fmla z27.s, z7.s, z4.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[2]\n" + "fmla z12.s, z6.s, z1.s[2]\n" + "fmla z16.s, z6.s, z2.s[2]\n" + "fmla z20.s, z6.s, z3.s[2]\n" + "fmla z24.s, z6.s, z4.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[2]\n" + "fmla z13.s, z7.s, z1.s[2]\n" + "fmla z17.s, z7.s, z2.s[2]\n" + "fmla z21.s, z7.s, z3.s[2]\n" + "fmla z25.s, z7.s, z4.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[2]\n" + "fmla z14.s, z6.s, z1.s[2]\n" + "fmla z18.s, z6.s, z2.s[2]\n" + "fmla z22.s, 
z6.s, z3.s[2]\n" + "fmla z26.s, z6.s, z4.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[2]\n" + "fmla z15.s, z7.s, z1.s[2]\n" + "fmla z19.s, z7.s, z2.s[2]\n" + "fmla z23.s, z7.s, z3.s[2]\n" + "fmla z27.s, z7.s, z4.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[3]\n" + "fmla z12.s, z6.s, z1.s[3]\n" + "fmla z16.s, z6.s, z2.s[3]\n" + "fmla z20.s, z6.s, z3.s[3]\n" + "fmla z24.s, z6.s, z4.s[3]\n" + "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[3]\n" + "fmla z13.s, z7.s, z1.s[3]\n" + "fmla z17.s, z7.s, z2.s[3]\n" + "fmla z21.s, z7.s, z3.s[3]\n" + "fmla z25.s, z7.s, z4.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[3]\n" + "fmla z14.s, z6.s, z1.s[3]\n" + "fmla z18.s, z6.s, z2.s[3]\n" + "fmla z22.s, z6.s, z3.s[3]\n" + "fmla z26.s, z6.s, z4.s[3]\n" + "fmla z11.s, z7.s, z0.s[3]\n" + "fmla z15.s, z7.s, z1.s[3]\n" + "fmla z19.s, z7.s, z2.s[3]\n" + "fmla z23.s, z7.s, z3.s[3]\n" + "fmla z27.s, z7.s, z4.s[3]\n" + "bgt 66b\n" + "67:" // Height 5: Multiply loop: Single iteration only + "ld1w { z6.s }, p5/Z, [x15]\n" + "whilelt p0.s, XZR, x11\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x10]\n" + "fmla z8.s, z6.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.s, z7.s, z0.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.s, z6.s, z1.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z16.s, z6.s, z2.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "fmla z13.s, z7.s, z1.s[0]\n" + "add x22, x22, #0x10\n" + "fmla z17.s, z7.s, z2.s[0]\n" + "fmla z20.s, z6.s, z3.s[0]\n" + "fmla z24.s, z6.s, z4.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z21.s, z7.s, z3.s[0]\n" + "fmla z25.s, z7.s, z4.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[0]\n" + "fmla z14.s, z6.s, z1.s[0]\n" + "fmla z18.s, z6.s, z2.s[0]\n" + "fmla z22.s, z6.s, z3.s[0]\n" + "fmla z26.s, z6.s, z4.s[0]\n" + "fmla z11.s, z7.s, z0.s[0]\n" + "fmla z15.s, z7.s, z1.s[0]\n" + "fmla z19.s, z7.s, z2.s[0]\n" + "fmla z23.s, z7.s, z3.s[0]\n" + "fmla z27.s, z7.s, z4.s[0]\n" + "ble 68f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.s, z6.s, z1.s[1]\n" + "fmla z16.s, z6.s, z2.s[1]\n" + "fmla z20.s, z6.s, z3.s[1]\n" + "fmla z24.s, z6.s, z4.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[1]\n" + "fmla z13.s, z7.s, z1.s[1]\n" + "fmla z17.s, z7.s, z2.s[1]\n" + "fmla z21.s, z7.s, z3.s[1]\n" + "fmla z25.s, z7.s, z4.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[1]\n" + "fmla z14.s, z6.s, z1.s[1]\n" + "fmla z18.s, z6.s, z2.s[1]\n" + "fmla z22.s, z6.s, z3.s[1]\n" + "fmla z26.s, z6.s, z4.s[1]\n" + "fmla z11.s, z7.s, z0.s[1]\n" + "fmla z15.s, z7.s, z1.s[1]\n" + "fmla z19.s, z7.s, z2.s[1]\n" + "fmla z23.s, z7.s, z3.s[1]\n" + "fmla z27.s, z7.s, z4.s[1]\n" + "ble 68f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.s, z6.s, z1.s[2]\n" + "fmla z16.s, z6.s, z2.s[2]\n" + "fmla z20.s, z6.s, z3.s[2]\n" + "fmla z24.s, z6.s, z4.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[2]\n" + "fmla z13.s, z7.s, z1.s[2]\n" + 
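Throughout these main loops, one ld1rqw pulls four consecutive K values of an A row into a quadword, and the indexed form "fmla zAcc.s, zB.s, zA.s[u]" broadcasts lane u against a whole B vector; the "Single iteration only" paths then replay the same pattern one lane at a time for 1 to 4 leftover K steps. A scalar reference for one such step (shapes illustrative):

    // One k-step of the broadcast FMA: acc[c] += a_lane * b[c] across a
    // vector's worth of columns; u selects the ld1rqw lane (0..3).
    void fma_lane(float *acc, const float *b, float a_lane, int vl) {
        for (int c = 0; c < vl; ++c)
            acc[c] += a_lane * b[c];   // fmla zAcc.s, zB.s, zA.s[u]
    }
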
"fmla z17.s, z7.s, z2.s[2]\n" + "fmla z21.s, z7.s, z3.s[2]\n" + "fmla z25.s, z7.s, z4.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[2]\n" + "fmla z14.s, z6.s, z1.s[2]\n" + "fmla z18.s, z6.s, z2.s[2]\n" + "fmla z22.s, z6.s, z3.s[2]\n" + "fmla z26.s, z6.s, z4.s[2]\n" + "fmla z11.s, z7.s, z0.s[2]\n" + "fmla z15.s, z7.s, z1.s[2]\n" + "fmla z19.s, z7.s, z2.s[2]\n" + "fmla z23.s, z7.s, z3.s[2]\n" + "fmla z27.s, z7.s, z4.s[2]\n" + "ble 68f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z12.s, z6.s, z1.s[3]\n" + "fmla z16.s, z6.s, z2.s[3]\n" + "fmla z20.s, z6.s, z3.s[3]\n" + "fmla z24.s, z6.s, z4.s[3]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[3]\n" + "fmla z13.s, z7.s, z1.s[3]\n" + "fmla z17.s, z7.s, z2.s[3]\n" + "fmla z21.s, z7.s, z3.s[3]\n" + "fmla z25.s, z7.s, z4.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[3]\n" + "fmla z14.s, z6.s, z1.s[3]\n" + "fmla z18.s, z6.s, z2.s[3]\n" + "fmla z22.s, z6.s, z3.s[3]\n" + "fmla z26.s, z6.s, z4.s[3]\n" + "fmla z11.s, z7.s, z0.s[3]\n" + "fmla z15.s, z7.s, z1.s[3]\n" + "fmla z19.s, z7.s, z2.s[3]\n" + "fmla z23.s, z7.s, z3.s[3]\n" + "fmla z27.s, z7.s, z4.s[3]\n" + "68:" // Height 5: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 63b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 69f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "fmax z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmin z16.s, p5/M, z16.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z15.s, p5/M, z15.s, z1.s\n" + "fmax z16.s, p5/M, z16.s, z1.s\n" + "fmin z17.s, p5/M, z17.s, z0.s\n" + "fmin z18.s, p5/M, z18.s, z0.s\n" + "fmin z19.s, p5/M, z19.s, z0.s\n" + "fmin z20.s, p5/M, z20.s, z0.s\n" + "fmax z17.s, p5/M, z17.s, z1.s\n" + "fmax z18.s, p5/M, z18.s, z1.s\n" + "fmax z19.s, p5/M, z19.s, z1.s\n" + "fmax z20.s, p5/M, z20.s, z1.s\n" + "fmin z21.s, p5/M, z21.s, z0.s\n" + "fmin z22.s, p5/M, z22.s, z0.s\n" + "fmin z23.s, p5/M, z23.s, z0.s\n" + "fmin z24.s, p5/M, z24.s, z0.s\n" + "fmax z21.s, p5/M, z21.s, z1.s\n" + "fmax z22.s, p5/M, z22.s, z1.s\n" + "fmax z23.s, p5/M, z23.s, z1.s\n" + "fmax z24.s, p5/M, z24.s, z1.s\n" + "fmin z25.s, p5/M, z25.s, z0.s\n" + "fmin z26.s, p5/M, z26.s, z0.s\n" + "fmin z27.s, p5/M, z27.s, z0.s\n" + "fmax z25.s, p5/M, z25.s, z1.s\n" + "fmax z26.s, p5/M, z26.s, z1.s\n" + "fmax z27.s, p5/M, z27.s, z1.s\n" + "69:" // Height 5: No activation + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL 
VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1w { z20.s }, p4, [x25]\n" + "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "st1w { z24.s }, p4, [x23]\n" + "st1w { z25.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x23, #3, MUL VL]\n" + "addvl x23, x23, #4\n" + "70:" // Height 5: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 59b\n" + "b 86f\n" + "71:" // Height 6 + "ldr x16, [%x[args_ptr], %[offsetof_N]]\n" + "mov x14, %x[bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 72f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "ldr x21, [%x[output_ptr], #0x28]\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 73f\n" + "72:" // Height 6: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "add x21, x23, x19, LSL #2\n" + "add %x[output_ptr], x21, x19, LSL #2\n" + "73:" // Height 6: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x16\n" + "incw x19\n" + "whilelt p3.s, x19, x16\n" + "incw x19\n" + "whilelt p2.s, x19, x16\n" + "incw x19\n" + "whilelt p1.s, x19, x16\n" + "cbz x14, 74f\n" + "ld1w { z8.s }, p5/Z, [x14]\n" + "mov z12.d, z8.d\n" + "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n" + "mov z16.d, z8.d\n" + "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n" + "mov z20.d, z8.d\n" + "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "mov z13.d, z9.d\n" + "mov z17.d, z9.d\n" + "mov z14.d, z10.d\n" + "mov z15.d, z11.d\n" + "mov z18.d, z10.d\n" + "mov z19.d, z11.d\n" + "mov z21.d, z9.d\n" + "mov z22.d, z10.d\n" + "mov z23.d, z11.d\n" + "mov z24.d, z8.d\n" + "mov z25.d, z9.d\n" + "mov z26.d, z10.d\n" + "mov z27.d, z11.d\n" + "mov z28.d, z8.d\n" + "mov z29.d, z9.d\n" + "mov z30.d, z10.d\n" + "mov z31.d, z11.d\n" + "b 76f\n" + "74:" // Height 6: no bias + "tbz %x[flags], #0, 75f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x25, #2, 
MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x23]\n" + "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z28.s }, p4/Z, [x21]\n" + "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n" + "b 76f\n" + "75:" // Height 6: no accumulate + "mov z8.b, #0x0\n" + "mov z9.b, #0x0\n" + "mov z10.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z12.b, #0x0\n" + "mov z13.b, #0x0\n" + "mov z14.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z17.b, #0x0\n" + "mov z18.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z20.b, #0x0\n" + "mov z21.b, #0x0\n" + "mov z22.b, #0x0\n" + "mov z23.b, #0x0\n" + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "mov z31.b, #0x0\n" + "76:" // Height 6: setup done + "mov x12, #0x0\n" + "77:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 78f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x12, 79f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x20, x20, x19, LSL #2\n" + "b 79f\n" + "78:" // Height 6: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "add x22, x24, x19, LSL #2\n" + "add x20, x22, x19, LSL #2\n" + "79:" // Height 6: input setup done + "cmp x11, #0x4\n" + "ble 81f\n" + "80:" // Height 6: Multiply loop: Main loop head + "ld1w { z6.s }, p5/Z, [x15]\n" + "whilelt p0.s, XZR, x11\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "sub x11, x11, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x10]\n" + "fmla z8.s, z6.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.s, z7.s, z0.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.s, z6.s, z1.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z16.s, z6.s, z2.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "fmla z13.s, z7.s, z1.s[0]\n" + "ld1rqw { z5.s }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "fmla z20.s, z6.s, z3.s[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x20, x20, #0x10\n" + "fmla z24.s, z6.s, z4.s[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x4\n" + "fmla z28.s, z6.s, z5.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z17.s, z7.s, z2.s[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla z21.s, z7.s, z3.s[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla z25.s, z7.s, z4.s[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla z29.s, z7.s, z5.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "fmla z14.s, z6.s, z1.s[0]\n" + "fmla z18.s, z6.s, z2.s[0]\n" + "fmla z22.s, z6.s, z3.s[0]\n" + "fmla z26.s, z6.s, z4.s[0]\n" + "fmla z30.s, z6.s, z5.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #4, MUL 
VL]\n" + "fmla z11.s, z7.s, z0.s[0]\n" + "fmla z15.s, z7.s, z1.s[0]\n" + "fmla z19.s, z7.s, z2.s[0]\n" + "fmla z23.s, z7.s, z3.s[0]\n" + "fmla z27.s, z7.s, z4.s[0]\n" + "fmla z31.s, z7.s, z5.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[1]\n" + "fmla z12.s, z6.s, z1.s[1]\n" + "fmla z16.s, z6.s, z2.s[1]\n" + "fmla z20.s, z6.s, z3.s[1]\n" + "fmla z24.s, z6.s, z4.s[1]\n" + "fmla z28.s, z6.s, z5.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[1]\n" + "fmla z13.s, z7.s, z1.s[1]\n" + "fmla z17.s, z7.s, z2.s[1]\n" + "fmla z21.s, z7.s, z3.s[1]\n" + "fmla z25.s, z7.s, z4.s[1]\n" + "fmla z29.s, z7.s, z5.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n" + "addvl x15, x15, #16\n" + "fmla z10.s, z6.s, z0.s[1]\n" + "fmla z14.s, z6.s, z1.s[1]\n" + "fmla z18.s, z6.s, z2.s[1]\n" + "fmla z22.s, z6.s, z3.s[1]\n" + "fmla z26.s, z6.s, z4.s[1]\n" + "fmla z30.s, z6.s, z5.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[1]\n" + "fmla z15.s, z7.s, z1.s[1]\n" + "fmla z19.s, z7.s, z2.s[1]\n" + "fmla z23.s, z7.s, z3.s[1]\n" + "fmla z27.s, z7.s, z4.s[1]\n" + "fmla z31.s, z7.s, z5.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[2]\n" + "fmla z12.s, z6.s, z1.s[2]\n" + "fmla z16.s, z6.s, z2.s[2]\n" + "fmla z20.s, z6.s, z3.s[2]\n" + "fmla z24.s, z6.s, z4.s[2]\n" + "fmla z28.s, z6.s, z5.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[2]\n" + "fmla z13.s, z7.s, z1.s[2]\n" + "fmla z17.s, z7.s, z2.s[2]\n" + "fmla z21.s, z7.s, z3.s[2]\n" + "fmla z25.s, z7.s, z4.s[2]\n" + "fmla z29.s, z7.s, z5.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[2]\n" + "fmla z14.s, z6.s, z1.s[2]\n" + "fmla z18.s, z6.s, z2.s[2]\n" + "fmla z22.s, z6.s, z3.s[2]\n" + "fmla z26.s, z6.s, z4.s[2]\n" + "fmla z30.s, z6.s, z5.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n" + "fmla z11.s, z7.s, z0.s[2]\n" + "fmla z15.s, z7.s, z1.s[2]\n" + "fmla z19.s, z7.s, z2.s[2]\n" + "fmla z23.s, z7.s, z3.s[2]\n" + "fmla z27.s, z7.s, z4.s[2]\n" + "fmla z31.s, z7.s, z5.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n" + "fmla z8.s, z6.s, z0.s[3]\n" + "fmla z12.s, z6.s, z1.s[3]\n" + "fmla z16.s, z6.s, z2.s[3]\n" + "fmla z20.s, z6.s, z3.s[3]\n" + "fmla z24.s, z6.s, z4.s[3]\n" + "fmla z28.s, z6.s, z5.s[3]\n" + "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[3]\n" + "fmla z13.s, z7.s, z1.s[3]\n" + "fmla z17.s, z7.s, z2.s[3]\n" + "fmla z21.s, z7.s, z3.s[3]\n" + "fmla z25.s, z7.s, z4.s[3]\n" + "fmla z29.s, z7.s, z5.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n" + "fmla z10.s, z6.s, z0.s[3]\n" + "fmla z14.s, z6.s, z1.s[3]\n" + "fmla z18.s, z6.s, z2.s[3]\n" + "fmla z22.s, z6.s, z3.s[3]\n" + "fmla z26.s, z6.s, z4.s[3]\n" + "fmla z30.s, z6.s, z5.s[3]\n" + "fmla z11.s, z7.s, z0.s[3]\n" + "fmla z15.s, z7.s, z1.s[3]\n" + "fmla z19.s, z7.s, z2.s[3]\n" + "fmla z23.s, z7.s, z3.s[3]\n" + "fmla z27.s, z7.s, z4.s[3]\n" + "fmla z31.s, z7.s, z5.s[3]\n" + "bgt 80b\n" + "81:" // Height 6: Multiply loop: Single iteration only + "ld1w { z6.s }, p5/Z, [x15]\n" + "whilelt p0.s, XZR, x11\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x10]\n" + "fmla z8.s, z6.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z9.s, z7.s, z0.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z12.s, z6.s, z1.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla 
z16.s, z6.s, z2.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "fmla z13.s, z7.s, z1.s[0]\n" + "ld1rqw { z5.s }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "fmla z20.s, z6.s, z3.s[0]\n" + "add x20, x20, #0x10\n" + "fmla z17.s, z7.s, z2.s[0]\n" + "fmla z24.s, z6.s, z4.s[0]\n" + "fmla z28.s, z6.s, z5.s[0]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z21.s, z7.s, z3.s[0]\n" + "fmla z25.s, z7.s, z4.s[0]\n" + "fmla z29.s, z7.s, z5.s[0]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[0]\n" + "fmla z14.s, z6.s, z1.s[0]\n" + "fmla z18.s, z6.s, z2.s[0]\n" + "fmla z22.s, z6.s, z3.s[0]\n" + "fmla z26.s, z6.s, z4.s[0]\n" + "fmla z30.s, z6.s, z5.s[0]\n" + "fmla z11.s, z7.s, z0.s[0]\n" + "fmla z15.s, z7.s, z1.s[0]\n" + "fmla z19.s, z7.s, z2.s[0]\n" + "fmla z23.s, z7.s, z3.s[0]\n" + "fmla z27.s, z7.s, z4.s[0]\n" + "fmla z31.s, z7.s, z5.s[0]\n" + "ble 82f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.s, z6.s, z1.s[1]\n" + "fmla z16.s, z6.s, z2.s[1]\n" + "fmla z20.s, z6.s, z3.s[1]\n" + "fmla z24.s, z6.s, z4.s[1]\n" + "fmla z28.s, z6.s, z5.s[1]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[1]\n" + "fmla z13.s, z7.s, z1.s[1]\n" + "fmla z17.s, z7.s, z2.s[1]\n" + "fmla z21.s, z7.s, z3.s[1]\n" + "fmla z25.s, z7.s, z4.s[1]\n" + "fmla z29.s, z7.s, z5.s[1]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[1]\n" + "fmla z14.s, z6.s, z1.s[1]\n" + "fmla z18.s, z6.s, z2.s[1]\n" + "fmla z22.s, z6.s, z3.s[1]\n" + "fmla z26.s, z6.s, z4.s[1]\n" + "fmla z30.s, z6.s, z5.s[1]\n" + "fmla z11.s, z7.s, z0.s[1]\n" + "fmla z15.s, z7.s, z1.s[1]\n" + "fmla z19.s, z7.s, z2.s[1]\n" + "fmla z23.s, z7.s, z3.s[1]\n" + "fmla z27.s, z7.s, z4.s[1]\n" + "fmla z31.s, z7.s, z5.s[1]\n" + "ble 82f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "subs x11, x11, #0x1\n" + "fmla z12.s, z6.s, z1.s[2]\n" + "fmla z16.s, z6.s, z2.s[2]\n" + "fmla z20.s, z6.s, z3.s[2]\n" + "fmla z24.s, z6.s, z4.s[2]\n" + "fmla z28.s, z6.s, z5.s[2]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[2]\n" + "fmla z13.s, z7.s, z1.s[2]\n" + "fmla z17.s, z7.s, z2.s[2]\n" + "fmla z21.s, z7.s, z3.s[2]\n" + "fmla z25.s, z7.s, z4.s[2]\n" + "fmla z29.s, z7.s, z5.s[2]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[2]\n" + "fmla z14.s, z6.s, z1.s[2]\n" + "fmla z18.s, z6.s, z2.s[2]\n" + "fmla z22.s, z6.s, z3.s[2]\n" + "fmla z26.s, z6.s, z4.s[2]\n" + "fmla z30.s, z6.s, z5.s[2]\n" + "fmla z11.s, z7.s, z0.s[2]\n" + "fmla z15.s, z7.s, z1.s[2]\n" + "fmla z19.s, z7.s, z2.s[2]\n" + "fmla z23.s, z7.s, z3.s[2]\n" + "fmla z27.s, z7.s, z4.s[2]\n" + "fmla z31.s, z7.s, z5.s[2]\n" + "ble 82f\n" + "ld1w { z6.s }, p5/Z, [x15]\n" + "fmla z8.s, z6.s, z0.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n" + "fmla z12.s, z6.s, z1.s[3]\n" + "fmla z16.s, z6.s, z2.s[3]\n" + "fmla z20.s, z6.s, z3.s[3]\n" + "fmla z24.s, z6.s, z4.s[3]\n" + "fmla z28.s, z6.s, z5.s[3]\n" + "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n" + "fmla z9.s, z7.s, z0.s[3]\n" + "fmla z13.s, z7.s, z1.s[3]\n" + "fmla z17.s, z7.s, z2.s[3]\n" + "fmla z21.s, z7.s, z3.s[3]\n" + "fmla z25.s, z7.s, z4.s[3]\n" + "fmla z29.s, z7.s, z5.s[3]\n" + "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n" + "addvl x15, x15, #4\n" + "fmla z10.s, z6.s, z0.s[3]\n" + "fmla 
z14.s, z6.s, z1.s[3]\n" + "fmla z18.s, z6.s, z2.s[3]\n" + "fmla z22.s, z6.s, z3.s[3]\n" + "fmla z26.s, z6.s, z4.s[3]\n" + "fmla z30.s, z6.s, z5.s[3]\n" + "fmla z11.s, z7.s, z0.s[3]\n" + "fmla z15.s, z7.s, z1.s[3]\n" + "fmla z19.s, z7.s, z2.s[3]\n" + "fmla z23.s, z7.s, z3.s[3]\n" + "fmla z27.s, z7.s, z4.s[3]\n" + "fmla z31.s, z7.s, z5.s[3]\n" + "82:" // Height 6: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 77b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 83f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z1.s }, p5/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z0.s }, p5/Z, [x19]\n" + "fmin z8.s, p5/M, z8.s, z0.s\n" + "fmin z9.s, p5/M, z9.s, z0.s\n" + "fmin z10.s, p5/M, z10.s, z0.s\n" + "fmin z11.s, p5/M, z11.s, z0.s\n" + "fmin z12.s, p5/M, z12.s, z0.s\n" + "fmax z8.s, p5/M, z8.s, z1.s\n" + "fmax z9.s, p5/M, z9.s, z1.s\n" + "fmax z10.s, p5/M, z10.s, z1.s\n" + "fmax z11.s, p5/M, z11.s, z1.s\n" + "fmax z12.s, p5/M, z12.s, z1.s\n" + "fmin z13.s, p5/M, z13.s, z0.s\n" + "fmin z14.s, p5/M, z14.s, z0.s\n" + "fmin z15.s, p5/M, z15.s, z0.s\n" + "fmin z16.s, p5/M, z16.s, z0.s\n" + "fmax z13.s, p5/M, z13.s, z1.s\n" + "fmax z14.s, p5/M, z14.s, z1.s\n" + "fmax z15.s, p5/M, z15.s, z1.s\n" + "fmax z16.s, p5/M, z16.s, z1.s\n" + "fmin z17.s, p5/M, z17.s, z0.s\n" + "fmin z18.s, p5/M, z18.s, z0.s\n" + "fmin z19.s, p5/M, z19.s, z0.s\n" + "fmin z20.s, p5/M, z20.s, z0.s\n" + "fmax z17.s, p5/M, z17.s, z1.s\n" + "fmax z18.s, p5/M, z18.s, z1.s\n" + "fmax z19.s, p5/M, z19.s, z1.s\n" + "fmax z20.s, p5/M, z20.s, z1.s\n" + "fmin z21.s, p5/M, z21.s, z0.s\n" + "fmin z22.s, p5/M, z22.s, z0.s\n" + "fmin z23.s, p5/M, z23.s, z0.s\n" + "fmin z24.s, p5/M, z24.s, z0.s\n" + "fmax z21.s, p5/M, z21.s, z1.s\n" + "fmax z22.s, p5/M, z22.s, z1.s\n" + "fmax z23.s, p5/M, z23.s, z1.s\n" + "fmax z24.s, p5/M, z24.s, z1.s\n" + "fmin z25.s, p5/M, z25.s, z0.s\n" + "fmin z26.s, p5/M, z26.s, z0.s\n" + "fmin z27.s, p5/M, z27.s, z0.s\n" + "fmin z28.s, p5/M, z28.s, z0.s\n" + "fmax z25.s, p5/M, z25.s, z1.s\n" + "fmax z26.s, p5/M, z26.s, z1.s\n" + "fmax z27.s, p5/M, z27.s, z1.s\n" + "fmax z28.s, p5/M, z28.s, z1.s\n" + "fmin z29.s, p5/M, z29.s, z0.s\n" + "fmin z30.s, p5/M, z30.s, z0.s\n" + "fmin z31.s, p5/M, z31.s, z0.s\n" + "fmax z29.s, p5/M, z29.s, z1.s\n" + "fmax z30.s, p5/M, z30.s, z1.s\n" + "fmax z31.s, p5/M, z31.s, z1.s\n" + "83:" // Height 6: No activation + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1w { z20.s }, p4, [x25]\n" + "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" + "st1w { z23.s 
}, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "st1w { z24.s }, p4, [x23]\n" + "st1w { z25.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x23, #3, MUL VL]\n" + "addvl x23, x23, #4\n" + "st1w { z28.s }, p4, [x21]\n" + "st1w { z29.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z30.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z31.s }, p1, [x21, #3, MUL VL]\n" + "addvl x21, x21, #4\n" + "84:" // Height 6: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x16, x16, x19\n" + "bgt 73b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 86f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 85f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "85:" // Update direct input + "mov x19, #0x18\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "86:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp similarity index 63% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp index fd416ed2f4..20d9922e93 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,37 +10,43 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. */ #pragma once - #ifdef __ARM_FEATURE_SVE - #include "../std_transforms_sve.hpp" +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg<float>, \ + size_t, size_t, \ + const float *, \ + IndirectOutputArg<float>, \ + const float *, Activation, bool + namespace arm_gemm { // Actual kernel implementations -void sve_hybrid_fp32_mmla_4VLx4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); +void sve_hybrid_fp32_mla_8x1VL( ARGLIST ); -class hybrid_fp32_mmla_4VLx4 +class cls_sve_hybrid_fp32_mla_8x1VL { public: typedef float operand_type; typedef float result_type; - typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); + typedef void (*kern_type)( ARGLIST ); /* Kernel blocking parameters */ static constexpr unsigned int out_height() @@ -50,12 +56,12 @@ class hybrid_fp32_mmla_4VLx4 static unsigned int out_width() { - return get_vector_length<float>() * 2; + return get_vector_length<float>() * 1; } static constexpr unsigned int k_unroll() { - return 2; + return 1; } static constexpr bool supports_accumulate() @@ -63,27 +69,17 @@ class hybrid_fp32_mmla_4VLx4 return true; } - static constexpr bool supports_bias() - { - return true; - } - - static constexpr bool supports_activation() - { - return true; - } - - StdTransformsSVE transforms = {}; + StdTransformsSVE<operand_type, result_type, 8, 1, 1> transforms = {}; // Default to the generic kernel - kern_type kernel=sve_hybrid_fp32_mmla_4VLx4; + kern_type kernel=sve_hybrid_fp32_mla_8x1VL; - hybrid_fp32_mmla_4VLx4(const CPUInfo *) + cls_sve_hybrid_fp32_mla_8x1VL(const CPUInfo *) { - } }; } // namespace arm_gemm +#undef ARGLIST #endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp new file mode 100644 index 0000000000..361e303c7a --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp @@ -0,0 +1,1751 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef __ARM_FEATURE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> + +namespace arm_gemm { + +void sve_hybrid_fp32_mla_8x1VL ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg, + size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg, + const float *bias, Activation act, bool accumulate +) +{ + struct KernelArgs { + float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); + float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const float *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + switch(act.type) { + default: + case Activation::Type::None: + break; + case Activation::Type::BoundedReLU: + ka.maxval = static_cast<float>(act.param1); + /* fall through */ + case Activation::Type::ReLU: + ka.minval = 0; + flags |= 0x2; + break; + } + __asm__ __volatile__( + "ptrue p2.b\n" + "1:" // Row loop + "cmp %x[M], #0x8\n" + "bge 99f\n" + "cmp %x[M], #0x6\n" + "bgt 85f\n" + "beq 71f\n" + "cmp %x[M], #0x4\n" + "bgt 57f\n" + "beq 43f\n" + "cmp %x[M], #0x2\n" + "bgt 29f\n" + "beq 15f\n" + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "mov x8, %x[bias]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x17, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "mov x19, #0x0\n" + "whilelt p1.s, x19, x6\n" + "cbz x8, 4f\n" + "ld1w { z24.s }, p2/Z, [x8]\n" + "addvl x8, x8, #1\n" + "b 6f\n" + "4:" // Height 1: no bias + "tbz %x[flags], #0, 5f\n" + "ld1w { z24.s }, p1/Z, [x17]\n" + "b 6f\n" + "5:" // Height 1: no accumulate + "mov z24.b, #0x0\n" + "6:" // Height 1: setup done + "mov x16, #0x0\n" + "7:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 8f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "cbnz x16, 9f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "b 9f\n" + "8:" // Height 1: setup direct input + "mov x14, %x[input_ptr]\n" + "9:" // Height 1: input setup done + "cmp x15,
#0x4\n" + "ble 11f\n" + "10:" // Height 1: Multiply loop: Main loop head + "ld1w { z8.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n" + "sub x15, x15, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z8.s, z0.s[0]\n" + "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n" + "add x14, x14, #0x10\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n" + "cmp x15, #0x4\n" + "fmla z24.s, z10.s, z0.s[2]\n" + "prfm pldl1keep, [x14, #0x80]\n" + "addvl x7, x7, #4\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "bgt 10b\n" + "11:" // Height 1: Multiply loop: Single iteration only + "ld1w { z12.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "subs x15, x15, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z12.s, z0.s[0]\n" + "add x14, x14, #0x10\n" + "addvl x7, x7, #1\n" + "ble 12f\n" + "ld1w { z13.s }, p2/Z, [x7]\n" + "fmla z24.s, z13.s, z0.s[1]\n" + "subs x15, x15, #0x1\n" + "addvl x7, x7, #1\n" + "ble 12f\n" + "ld1w { z14.s }, p2/Z, [x7]\n" + "fmla z24.s, z14.s, z0.s[2]\n" + "subs x15, x15, #0x1\n" + "addvl x7, x7, #1\n" + "ble 12f\n" + "ld1w { z15.s }, p2/Z, [x7]\n" + "fmla z24.s, z15.s, z0.s[3]\n" + "addvl x7, x7, #1\n" + "12:" // Height 1: Multiply loop: multiply skip + "prfm pldl1keep, [x14, #0x80]\n" + "add x16, x16, #0x1\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x16, x19\n" + "bne 7b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "tbz %x[flags], #1, 13f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "13:" // Height 1: No activation + "st1w { z24.s }, p1, [x17]\n" + "addvl x17, x17, #1\n" + "14:" // Height 1: Writeback done + "mov x19, #0x0\n" + "incw x19\n" + "subs x6, x6, x19\n" + "bgt 3b\n" + "b 114f\n" + "15:" // Height 2 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 16f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "add x13, x13, x19, LSL #2\n" + "b 17f\n" + "16:" // Height 2: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "17:" // Height 2: Column loop + "mov x19, #0x0\n" + "whilelt p1.s, x19, x6\n" + "cbz x8, 18f\n" + "ld1w { z24.s }, p2/Z, [x8]\n" + "mov z25.d, z24.d\n" + "addvl x8, x8, #1\n" + "b 20f\n" + "18:" // Height 2: no bias + "tbz %x[flags], #0, 19f\n" + "ld1w { z24.s }, p1/Z, [x17]\n" + "ld1w { z25.s }, p1/Z, [x13]\n" + "b 20f\n" + "19:" // Height 2: no accumulate + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "20:" // Height 2: setup done + "mov x16, #0x0\n" + "21:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 22f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "cbnz x16, 23f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "b 23f\n" + "22:" // Height 2: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "23:" // Height 2: input setup done + "cmp x15, #0x4\n" + "ble 25f\n" + "24:" // Height 2: Multiply loop: Main 
loop head + "ld1w { z8.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n" + "sub x15, x15, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z8.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z8.s, z1.s[0]\n" + "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n" + "add x12, x12, #0x10\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n" + "cmp x15, #0x4\n" + "fmla z25.s, z9.s, z1.s[1]\n" + "prfm pldl1keep, [x14, #0x80]\n" + "addvl x7, x7, #4\n" + "fmla z24.s, z10.s, z0.s[2]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla z25.s, z10.s, z1.s[2]\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "fmla z25.s, z11.s, z1.s[3]\n" + "bgt 24b\n" + "25:" // Height 2: Multiply loop: Single iteration only + "ld1w { z12.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "subs x15, x15, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z12.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z12.s, z1.s[0]\n" + "add x12, x12, #0x10\n" + "addvl x7, x7, #1\n" + "ble 26f\n" + "ld1w { z13.s }, p2/Z, [x7]\n" + "fmla z24.s, z13.s, z0.s[1]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z13.s, z1.s[1]\n" + "addvl x7, x7, #1\n" + "ble 26f\n" + "ld1w { z14.s }, p2/Z, [x7]\n" + "fmla z24.s, z14.s, z0.s[2]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z14.s, z1.s[2]\n" + "addvl x7, x7, #1\n" + "ble 26f\n" + "ld1w { z15.s }, p2/Z, [x7]\n" + "fmla z24.s, z15.s, z0.s[3]\n" + "addvl x7, x7, #1\n" + "fmla z25.s, z15.s, z1.s[3]\n" + "26:" // Height 2: Multiply loop: multiply skip + "prfm pldl1keep, [x14, #0x80]\n" + "add x16, x16, #0x1\n" + "prfm pldl1keep, [x12, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x16, x19\n" + "bne 21b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "tbz %x[flags], #1, 27f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "27:" // Height 2: No activation + "st1w { z24.s }, p1, [x17]\n" + "addvl x17, x17, #1\n" + "st1w { z25.s }, p1, [x13]\n" + "addvl x13, x13, #1\n" + "28:" // Height 2: Writeback done + "mov x19, #0x0\n" + "incw x19\n" + "subs x6, x6, x19\n" + "bgt 17b\n" + "b 114f\n" + "29:" // Height 3 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 30f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "ldr x11, [%x[output_ptr], #0x10]\n" + "add x13, x13, x19, LSL #2\n" + "add x11, x11, x19, LSL #2\n" + "b 31f\n" + "30:" // Height 3: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "add x11, x13, x19, LSL #2\n" + "31:" // Height 3: Column loop + "mov x19, #0x0\n" + "whilelt p1.s, x19, x6\n" + "cbz x8, 32f\n" + "ld1w { z24.s }, p2/Z, [x8]\n" + "mov z25.d, z24.d\n" + "addvl x8, x8, #1\n" + "mov z26.d, z24.d\n" + "b 34f\n" + "32:" // Height 3: no bias + "tbz %x[flags], #0, 33f\n" + "ld1w { z24.s }, p1/Z, [x17]\n" + "ld1w { z25.s }, p1/Z, [x13]\n" + "ld1w { z26.s }, p1/Z, [x11]\n" + "b 34f\n" + "33:" // Height 3: no accumulate + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "34:" // Height 3: setup 
done + "mov x16, #0x0\n" + "35:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 36f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "cbnz x16, 37f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "add x10, x10, x19, LSL #2\n" + "b 37f\n" + "36:" // Height 3: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "add x10, x12, x19, LSL #2\n" + "37:" // Height 3: input setup done + "cmp x15, #0x4\n" + "ble 39f\n" + "38:" // Height 3: Multiply loop: Main loop head + "ld1w { z8.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n" + "sub x15, x15, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z8.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z8.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x10]\n" + "add x12, x12, #0x10\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + "fmla z26.s, z8.s, z2.s[0]\n" + "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n" + "cmp x15, #0x4\n" + "fmla z25.s, z9.s, z1.s[1]\n" + "prfm pldl1keep, [x14, #0x80]\n" + "addvl x7, x7, #4\n" + "fmla z24.s, z10.s, z0.s[2]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla z26.s, z9.s, z2.s[1]\n" + "fmla z25.s, z10.s, z1.s[2]\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z25.s, z11.s, z1.s[3]\n" + "fmla z26.s, z11.s, z2.s[3]\n" + "bgt 38b\n" + "39:" // Height 3: Multiply loop: Single iteration only + "ld1w { z12.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "subs x15, x15, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z12.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z12.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x10]\n" + "add x12, x12, #0x10\n" + "fmla z26.s, z12.s, z2.s[0]\n" + "add x10, x10, #0x10\n" + "addvl x7, x7, #1\n" + "ble 40f\n" + "ld1w { z13.s }, p2/Z, [x7]\n" + "fmla z24.s, z13.s, z0.s[1]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z13.s, z1.s[1]\n" + "addvl x7, x7, #1\n" + "fmla z26.s, z13.s, z2.s[1]\n" + "ble 40f\n" + "ld1w { z14.s }, p2/Z, [x7]\n" + "fmla z24.s, z14.s, z0.s[2]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z14.s, z1.s[2]\n" + "addvl x7, x7, #1\n" + "fmla z26.s, z14.s, z2.s[2]\n" + "ble 40f\n" + "ld1w { z15.s }, p2/Z, [x7]\n" + "fmla z24.s, z15.s, z0.s[3]\n" + "addvl x7, x7, #1\n" + "fmla z25.s, z15.s, z1.s[3]\n" + "fmla z26.s, z15.s, z2.s[3]\n" + "40:" // Height 3: Multiply loop: multiply skip + "prfm pldl1keep, [x14, #0x80]\n" + "add x16, x16, #0x1\n" + "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x16, x19\n" + "bne 35b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "tbz %x[flags], #1, 41f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmin z26.s, p2/M, z26.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "fmax 
z26.s, p2/M, z26.s, z17.s\n" + "41:" // Height 3: No activation + "st1w { z24.s }, p1, [x17]\n" + "addvl x17, x17, #1\n" + "st1w { z25.s }, p1, [x13]\n" + "addvl x13, x13, #1\n" + "st1w { z26.s }, p1, [x11]\n" + "addvl x11, x11, #1\n" + "42:" // Height 3: Writeback done + "mov x19, #0x0\n" + "incw x19\n" + "subs x6, x6, x19\n" + "bgt 31b\n" + "b 114f\n" + "43:" // Height 4 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 44f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "ldr x11, [%x[output_ptr], #0x10]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x18]\n" + "add x11, x11, x19, LSL #2\n" + "add x9, x9, x19, LSL #2\n" + "b 45f\n" + "44:" // Height 4: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "add x11, x13, x19, LSL #2\n" + "add x9, x11, x19, LSL #2\n" + "45:" // Height 4: Column loop + "mov x19, #0x0\n" + "whilelt p1.s, x19, x6\n" + "cbz x8, 46f\n" + "ld1w { z24.s }, p2/Z, [x8]\n" + "mov z25.d, z24.d\n" + "addvl x8, x8, #1\n" + "mov z26.d, z24.d\n" + "mov z27.d, z24.d\n" + "b 48f\n" + "46:" // Height 4: no bias + "tbz %x[flags], #0, 47f\n" + "ld1w { z24.s }, p1/Z, [x17]\n" + "ld1w { z25.s }, p1/Z, [x13]\n" + "ld1w { z26.s }, p1/Z, [x11]\n" + "ld1w { z27.s }, p1/Z, [x9]\n" + "b 48f\n" + "47:" // Height 4: no accumulate + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "48:" // Height 4: setup done + "mov x16, #0x0\n" + "49:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 50f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "ldr x28, [x20, #0x18]\n" + "cbnz x16, 51f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "b 51f\n" + "50:" // Height 4: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "add x10, x12, x19, LSL #2\n" + "add x28, x10, x19, LSL #2\n" + "51:" // Height 4: input setup done + "cmp x15, #0x4\n" + "ble 53f\n" + "52:" // Height 4: Multiply loop: Main loop head + "ld1w { z8.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n" + "sub x15, x15, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z8.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z8.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x10]\n" + "add x12, x12, #0x10\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "ld1rqw { z3.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z26.s, z8.s, z2.s[0]\n" + "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n" + "add x28, x28, #0x10\n" + "fmla z27.s, z8.s, z3.s[0]\n" + "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n" + "cmp x15, #0x4\n" + "fmla z25.s, z9.s, z1.s[1]\n" + "prfm pldl1keep, [x14, #0x80]\n" + "addvl x7, x7, #4\n" + "fmla z24.s, z10.s, z0.s[2]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla z26.s, z9.s, z2.s[1]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla z27.s, z9.s, z3.s[1]\n" + "fmla z25.s, z10.s, z1.s[2]\n" + "fmla z24.s, z11.s, 
z0.s[3]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z27.s, z10.s, z3.s[2]\n" + "fmla z25.s, z11.s, z1.s[3]\n" + "fmla z26.s, z11.s, z2.s[3]\n" + "fmla z27.s, z11.s, z3.s[3]\n" + "bgt 52b\n" + "53:" // Height 4: Multiply loop: Single iteration only + "ld1w { z12.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "subs x15, x15, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z12.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z12.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x10]\n" + "add x12, x12, #0x10\n" + "fmla z26.s, z12.s, z2.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z27.s, z12.s, z3.s[0]\n" + "add x28, x28, #0x10\n" + "addvl x7, x7, #1\n" + "ble 54f\n" + "ld1w { z13.s }, p2/Z, [x7]\n" + "fmla z24.s, z13.s, z0.s[1]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z13.s, z1.s[1]\n" + "addvl x7, x7, #1\n" + "fmla z26.s, z13.s, z2.s[1]\n" + "fmla z27.s, z13.s, z3.s[1]\n" + "ble 54f\n" + "ld1w { z14.s }, p2/Z, [x7]\n" + "fmla z24.s, z14.s, z0.s[2]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z14.s, z1.s[2]\n" + "addvl x7, x7, #1\n" + "fmla z26.s, z14.s, z2.s[2]\n" + "fmla z27.s, z14.s, z3.s[2]\n" + "ble 54f\n" + "ld1w { z15.s }, p2/Z, [x7]\n" + "fmla z24.s, z15.s, z0.s[3]\n" + "addvl x7, x7, #1\n" + "fmla z25.s, z15.s, z1.s[3]\n" + "fmla z26.s, z15.s, z2.s[3]\n" + "fmla z27.s, z15.s, z3.s[3]\n" + "54:" // Height 4: Multiply loop: multiply skip + "prfm pldl1keep, [x14, #0x80]\n" + "add x16, x16, #0x1\n" + "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x16, x19\n" + "bne 49b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "tbz %x[flags], #1, 55f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmin z26.s, p2/M, z26.s, z16.s\n" + "fmin z27.s, p2/M, z27.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "fmax z26.s, p2/M, z26.s, z17.s\n" + "fmax z27.s, p2/M, z27.s, z17.s\n" + "55:" // Height 4: No activation + "st1w { z24.s }, p1, [x17]\n" + "addvl x17, x17, #1\n" + "st1w { z25.s }, p1, [x13]\n" + "addvl x13, x13, #1\n" + "st1w { z26.s }, p1, [x11]\n" + "addvl x11, x11, #1\n" + "st1w { z27.s }, p1, [x9]\n" + "addvl x9, x9, #1\n" + "56:" // Height 4: Writeback done + "mov x19, #0x0\n" + "incw x19\n" + "subs x6, x6, x19\n" + "bgt 45b\n" + "b 114f\n" + "57:" // Height 5 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 58f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "ldr x11, [%x[output_ptr], #0x10]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x18]\n" + "ldr x27, [%x[output_ptr], #0x20]\n" + "add x11, x11, x19, LSL #2\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "b 59f\n" + "58:" // Height 5: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "add x11, x13, x19, LSL #2\n" + "add x9, x11, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "59:" // Height 5: Column loop + "mov x19, #0x0\n" + "whilelt p1.s, x19, x6\n" 
+ "cbz x8, 60f\n" + "ld1w { z24.s }, p2/Z, [x8]\n" + "mov z25.d, z24.d\n" + "addvl x8, x8, #1\n" + "mov z26.d, z24.d\n" + "mov z27.d, z24.d\n" + "mov z28.d, z24.d\n" + "b 62f\n" + "60:" // Height 5: no bias + "tbz %x[flags], #0, 61f\n" + "ld1w { z24.s }, p1/Z, [x17]\n" + "ld1w { z25.s }, p1/Z, [x13]\n" + "ld1w { z26.s }, p1/Z, [x11]\n" + "ld1w { z27.s }, p1/Z, [x9]\n" + "ld1w { z28.s }, p1/Z, [x27]\n" + "b 62f\n" + "61:" // Height 5: no accumulate + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "62:" // Height 5: setup done + "mov x16, #0x0\n" + "63:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 64f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "ldr x28, [x20, #0x18]\n" + "ldr x26, [x20, #0x20]\n" + "cbnz x16, 65f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "b 65f\n" + "64:" // Height 5: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "add x10, x12, x19, LSL #2\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "65:" // Height 5: input setup done + "cmp x15, #0x4\n" + "ble 67f\n" + "66:" // Height 5: Multiply loop: Main loop head + "ld1w { z8.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n" + "sub x15, x15, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z8.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z8.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x10]\n" + "add x12, x12, #0x10\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "ld1rqw { z3.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z26.s, z8.s, z2.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z27.s, z8.s, z3.s[0]\n" + "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n" + "add x26, x26, #0x10\n" + "fmla z25.s, z9.s, z1.s[1]\n" + "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n" + "cmp x15, #0x4\n" + "fmla z28.s, z8.s, z4.s[0]\n" + "prfm pldl1keep, [x14, #0x80]\n" + "addvl x7, x7, #4\n" + "fmla z26.s, z9.s, z2.s[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla z24.s, z10.s, z0.s[2]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla z27.s, z9.s, z3.s[1]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla z25.s, z10.s, z1.s[2]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla z28.s, z9.s, z4.s[1]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z27.s, z10.s, z3.s[2]\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "fmla z28.s, z10.s, z4.s[2]\n" + "fmla z25.s, z11.s, z1.s[3]\n" + "fmla z26.s, z11.s, z2.s[3]\n" + "fmla z27.s, z11.s, z3.s[3]\n" + "fmla z28.s, z11.s, z4.s[3]\n" + "bgt 66b\n" + "67:" // Height 5: Multiply loop: Single iteration only + "ld1w { z12.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "subs x15, x15, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z12.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z12.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x10]\n" + "add x12, x12, #0x10\n" + "fmla z26.s, z12.s, z2.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z27.s, z12.s, z3.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x26]\n" + 
"add x28, x28, #0x10\n" + "fmla z28.s, z12.s, z4.s[0]\n" + "add x26, x26, #0x10\n" + "addvl x7, x7, #1\n" + "ble 68f\n" + "ld1w { z13.s }, p2/Z, [x7]\n" + "fmla z24.s, z13.s, z0.s[1]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z13.s, z1.s[1]\n" + "addvl x7, x7, #1\n" + "fmla z26.s, z13.s, z2.s[1]\n" + "fmla z27.s, z13.s, z3.s[1]\n" + "fmla z28.s, z13.s, z4.s[1]\n" + "ble 68f\n" + "ld1w { z14.s }, p2/Z, [x7]\n" + "fmla z24.s, z14.s, z0.s[2]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z14.s, z1.s[2]\n" + "addvl x7, x7, #1\n" + "fmla z26.s, z14.s, z2.s[2]\n" + "fmla z27.s, z14.s, z3.s[2]\n" + "fmla z28.s, z14.s, z4.s[2]\n" + "ble 68f\n" + "ld1w { z15.s }, p2/Z, [x7]\n" + "fmla z24.s, z15.s, z0.s[3]\n" + "addvl x7, x7, #1\n" + "fmla z25.s, z15.s, z1.s[3]\n" + "fmla z26.s, z15.s, z2.s[3]\n" + "fmla z27.s, z15.s, z3.s[3]\n" + "fmla z28.s, z15.s, z4.s[3]\n" + "68:" // Height 5: Multiply loop: multiply skip + "prfm pldl1keep, [x14, #0x80]\n" + "add x16, x16, #0x1\n" + "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x16, x19\n" + "bne 63b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "tbz %x[flags], #1, 69f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmin z26.s, p2/M, z26.s, z16.s\n" + "fmin z27.s, p2/M, z27.s, z16.s\n" + "fmin z28.s, p2/M, z28.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "fmax z26.s, p2/M, z26.s, z17.s\n" + "fmax z27.s, p2/M, z27.s, z17.s\n" + "fmax z28.s, p2/M, z28.s, z17.s\n" + "69:" // Height 5: No activation + "st1w { z24.s }, p1, [x17]\n" + "addvl x17, x17, #1\n" + "st1w { z25.s }, p1, [x13]\n" + "addvl x13, x13, #1\n" + "st1w { z26.s }, p1, [x11]\n" + "addvl x11, x11, #1\n" + "st1w { z27.s }, p1, [x9]\n" + "addvl x9, x9, #1\n" + "st1w { z28.s }, p1, [x27]\n" + "addvl x27, x27, #1\n" + "70:" // Height 5: Writeback done + "mov x19, #0x0\n" + "incw x19\n" + "subs x6, x6, x19\n" + "bgt 59b\n" + "b 114f\n" + "71:" // Height 6 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 72f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "ldr x11, [%x[output_ptr], #0x10]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x18]\n" + "ldr x27, [%x[output_ptr], #0x20]\n" + "add x11, x11, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x28]\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "b 73f\n" + "72:" // Height 6: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "add x11, x13, x19, LSL #2\n" + "add x9, x11, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "73:" // Height 6: Column loop + "mov x19, #0x0\n" + "whilelt p1.s, x19, x6\n" + "cbz x8, 74f\n" + "ld1w { z24.s }, p2/Z, [x8]\n" + "mov z25.d, z24.d\n" + "addvl x8, x8, #1\n" + "mov z26.d, z24.d\n" + "mov z27.d, z24.d\n" + "mov z28.d, z24.d\n" + "mov z29.d, z24.d\n" + "b 76f\n" + "74:" // Height 6: no 
bias + "tbz %x[flags], #0, 75f\n" + "ld1w { z24.s }, p1/Z, [x17]\n" + "ld1w { z25.s }, p1/Z, [x13]\n" + "ld1w { z26.s }, p1/Z, [x11]\n" + "ld1w { z27.s }, p1/Z, [x9]\n" + "ld1w { z28.s }, p1/Z, [x27]\n" + "ld1w { z29.s }, p1/Z, [x25]\n" + "b 76f\n" + "75:" // Height 6: no accumulate + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "76:" // Height 6: setup done + "mov x16, #0x0\n" + "77:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 78f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "ldr x28, [x20, #0x18]\n" + "ldr x26, [x20, #0x20]\n" + "ldr x24, [x20, #0x28]\n" + "cbnz x16, 79f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "b 79f\n" + "78:" // Height 6: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "add x10, x12, x19, LSL #2\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "79:" // Height 6: input setup done + "cmp x15, #0x4\n" + "ble 81f\n" + "80:" // Height 6: Multiply loop: Main loop head + "ld1w { z8.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n" + "sub x15, x15, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z8.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z8.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x10]\n" + "add x12, x12, #0x10\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "ld1rqw { z3.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z26.s, z8.s, z2.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z27.s, z8.s, z3.s[0]\n" + "ld1rqw { z5.s }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z25.s, z9.s, z1.s[1]\n" + "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n" + "add x24, x24, #0x10\n" + "fmla z28.s, z8.s, z4.s[0]\n" + "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n" + "cmp x15, #0x4\n" + "fmla z29.s, z8.s, z5.s[0]\n" + "prfm pldl1keep, [x14, #0x80]\n" + "addvl x7, x7, #4\n" + "fmla z26.s, z9.s, z2.s[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla z27.s, z9.s, z3.s[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla z24.s, z10.s, z0.s[2]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla z28.s, z9.s, z4.s[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla z29.s, z9.s, z5.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla z25.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z27.s, z10.s, z3.s[2]\n" + "fmla z28.s, z10.s, z4.s[2]\n" + "fmla z29.s, z10.s, z5.s[2]\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "fmla z25.s, z11.s, z1.s[3]\n" + "fmla z26.s, z11.s, z2.s[3]\n" + "fmla z27.s, z11.s, z3.s[3]\n" + "fmla z28.s, z11.s, z4.s[3]\n" + "fmla z29.s, z11.s, z5.s[3]\n" + "bgt 80b\n" + "81:" // Height 6: Multiply loop: Single iteration only + "ld1w { z12.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "subs x15, x15, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z12.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z12.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x10]\n" + "add x12, x12, 
#0x10\n" + "fmla z26.s, z12.s, z2.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z27.s, z12.s, z3.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z28.s, z12.s, z4.s[0]\n" + "ld1rqw { z5.s }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z29.s, z12.s, z5.s[0]\n" + "add x24, x24, #0x10\n" + "addvl x7, x7, #1\n" + "ble 82f\n" + "ld1w { z13.s }, p2/Z, [x7]\n" + "fmla z24.s, z13.s, z0.s[1]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z13.s, z1.s[1]\n" + "addvl x7, x7, #1\n" + "fmla z26.s, z13.s, z2.s[1]\n" + "fmla z27.s, z13.s, z3.s[1]\n" + "fmla z28.s, z13.s, z4.s[1]\n" + "fmla z29.s, z13.s, z5.s[1]\n" + "ble 82f\n" + "ld1w { z14.s }, p2/Z, [x7]\n" + "fmla z24.s, z14.s, z0.s[2]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z14.s, z1.s[2]\n" + "addvl x7, x7, #1\n" + "fmla z26.s, z14.s, z2.s[2]\n" + "fmla z27.s, z14.s, z3.s[2]\n" + "fmla z28.s, z14.s, z4.s[2]\n" + "fmla z29.s, z14.s, z5.s[2]\n" + "ble 82f\n" + "ld1w { z15.s }, p2/Z, [x7]\n" + "fmla z24.s, z15.s, z0.s[3]\n" + "addvl x7, x7, #1\n" + "fmla z25.s, z15.s, z1.s[3]\n" + "fmla z26.s, z15.s, z2.s[3]\n" + "fmla z27.s, z15.s, z3.s[3]\n" + "fmla z28.s, z15.s, z4.s[3]\n" + "fmla z29.s, z15.s, z5.s[3]\n" + "82:" // Height 6: Multiply loop: multiply skip + "prfm pldl1keep, [x14, #0x80]\n" + "add x16, x16, #0x1\n" + "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x16, x19\n" + "bne 77b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbz %x[flags], #1, 83f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmin z26.s, p2/M, z26.s, z16.s\n" + "fmin z27.s, p2/M, z27.s, z16.s\n" + "fmin z28.s, p2/M, z28.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "fmax z26.s, p2/M, z26.s, z17.s\n" + "fmax z27.s, p2/M, z27.s, z17.s\n" + "fmax z28.s, p2/M, z28.s, z17.s\n" + "fmin z29.s, p2/M, z29.s, z16.s\n" + "fmax z29.s, p2/M, z29.s, z17.s\n" + "83:" // Height 6: No activation + "st1w { z24.s }, p1, [x17]\n" + "addvl x17, x17, #1\n" + "st1w { z25.s }, p1, [x13]\n" + "addvl x13, x13, #1\n" + "st1w { z26.s }, p1, [x11]\n" + "addvl x11, x11, #1\n" + "st1w { z27.s }, p1, [x9]\n" + "addvl x9, x9, #1\n" + "st1w { z28.s }, p1, [x27]\n" + "addvl x27, x27, #1\n" + "st1w { z29.s }, p1, [x25]\n" + "addvl x25, x25, #1\n" + "84:" // Height 6: Writeback done + "mov x19, #0x0\n" + "incw x19\n" + "subs x6, x6, x19\n" + "bgt 73b\n" + "b 114f\n" + "85:" // Height 7 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 86f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "ldr x11, [%x[output_ptr], #0x10]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x18]\n" + "ldr x27, [%x[output_ptr], #0x20]\n" + "add x11, x11, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x28]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x23, [%x[output_ptr], #0x30]\n" + "add 
x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 87f\n" + "86:" // Height 7: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "add x11, x13, x19, LSL #2\n" + "add x9, x11, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "87:" // Height 7: Column loop + "mov x19, #0x0\n" + "whilelt p1.s, x19, x6\n" + "cbz x8, 88f\n" + "ld1w { z24.s }, p2/Z, [x8]\n" + "mov z25.d, z24.d\n" + "addvl x8, x8, #1\n" + "mov z26.d, z24.d\n" + "mov z27.d, z24.d\n" + "mov z28.d, z24.d\n" + "mov z29.d, z24.d\n" + "mov z30.d, z24.d\n" + "b 90f\n" + "88:" // Height 7: no bias + "tbz %x[flags], #0, 89f\n" + "ld1w { z24.s }, p1/Z, [x17]\n" + "ld1w { z25.s }, p1/Z, [x13]\n" + "ld1w { z26.s }, p1/Z, [x11]\n" + "ld1w { z27.s }, p1/Z, [x9]\n" + "ld1w { z28.s }, p1/Z, [x27]\n" + "ld1w { z29.s }, p1/Z, [x25]\n" + "ld1w { z30.s }, p1/Z, [x23]\n" + "b 90f\n" + "89:" // Height 7: no accumulate + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "90:" // Height 7: setup done + "mov x16, #0x0\n" + "91:" // Height 7: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 92f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "ldr x28, [x20, #0x18]\n" + "ldr x26, [x20, #0x20]\n" + "ldr x24, [x20, #0x28]\n" + "ldr x22, [x20, #0x30]\n" + "cbnz x16, 93f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "b 93f\n" + "92:" // Height 7: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "add x10, x12, x19, LSL #2\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "add x22, x24, x19, LSL #2\n" + "93:" // Height 7: input setup done + "cmp x15, #0x4\n" + "ble 95f\n" + "94:" // Height 7: Multiply loop: Main loop head + "ld1w { z8.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n" + "sub x15, x15, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z8.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z8.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x10]\n" + "add x12, x12, #0x10\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "ld1rqw { z3.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z26.s, z8.s, z2.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z27.s, z8.s, z3.s[0]\n" + "ld1rqw { z5.s }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z25.s, z9.s, z1.s[1]\n" + "ld1rqw { z6.s }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "fmla z28.s, z8.s, z4.s[0]\n" + "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n" + "add x22, x22, #0x10\n" + "fmla z29.s, z8.s, z5.s[0]\n" + "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n" + "cmp x15, #0x4\n" + "fmla z30.s, z8.s, z6.s[0]\n" + "prfm pldl1keep, [x14, #0x80]\n" + "addvl x7, x7, #4\n" + "fmla z26.s, z9.s, z2.s[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla z27.s, z9.s, z3.s[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla 
z28.s, z9.s, z4.s[1]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla z29.s, z9.s, z5.s[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla z30.s, z9.s, z6.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla z24.s, z10.s, z0.s[2]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla z25.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z27.s, z10.s, z3.s[2]\n" + "fmla z28.s, z10.s, z4.s[2]\n" + "fmla z29.s, z10.s, z5.s[2]\n" + "fmla z30.s, z10.s, z6.s[2]\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "fmla z25.s, z11.s, z1.s[3]\n" + "fmla z26.s, z11.s, z2.s[3]\n" + "fmla z27.s, z11.s, z3.s[3]\n" + "fmla z28.s, z11.s, z4.s[3]\n" + "fmla z29.s, z11.s, z5.s[3]\n" + "fmla z30.s, z11.s, z6.s[3]\n" + "bgt 94b\n" + "95:" // Height 7: Multiply loop: Single iteration only + "ld1w { z12.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "subs x15, x15, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z12.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z12.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x10]\n" + "add x12, x12, #0x10\n" + "fmla z26.s, z12.s, z2.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z27.s, z12.s, z3.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z28.s, z12.s, z4.s[0]\n" + "ld1rqw { z5.s }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z29.s, z12.s, z5.s[0]\n" + "ld1rqw { z6.s }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "fmla z30.s, z12.s, z6.s[0]\n" + "add x22, x22, #0x10\n" + "addvl x7, x7, #1\n" + "ble 96f\n" + "ld1w { z13.s }, p2/Z, [x7]\n" + "fmla z24.s, z13.s, z0.s[1]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z13.s, z1.s[1]\n" + "addvl x7, x7, #1\n" + "fmla z26.s, z13.s, z2.s[1]\n" + "fmla z27.s, z13.s, z3.s[1]\n" + "fmla z28.s, z13.s, z4.s[1]\n" + "fmla z29.s, z13.s, z5.s[1]\n" + "fmla z30.s, z13.s, z6.s[1]\n" + "ble 96f\n" + "ld1w { z14.s }, p2/Z, [x7]\n" + "fmla z24.s, z14.s, z0.s[2]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z14.s, z1.s[2]\n" + "addvl x7, x7, #1\n" + "fmla z26.s, z14.s, z2.s[2]\n" + "fmla z27.s, z14.s, z3.s[2]\n" + "fmla z28.s, z14.s, z4.s[2]\n" + "fmla z29.s, z14.s, z5.s[2]\n" + "fmla z30.s, z14.s, z6.s[2]\n" + "ble 96f\n" + "ld1w { z15.s }, p2/Z, [x7]\n" + "fmla z24.s, z15.s, z0.s[3]\n" + "addvl x7, x7, #1\n" + "fmla z25.s, z15.s, z1.s[3]\n" + "fmla z26.s, z15.s, z2.s[3]\n" + "fmla z27.s, z15.s, z3.s[3]\n" + "fmla z28.s, z15.s, z4.s[3]\n" + "fmla z29.s, z15.s, z5.s[3]\n" + "fmla z30.s, z15.s, z6.s[3]\n" + "96:" // Height 7: Multiply loop: multiply skip + "prfm pldl1keep, [x14, #0x80]\n" + "add x16, x16, #0x1\n" + "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x16, x19\n" + "bne 91b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 97f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmin z26.s, p2/M, z26.s, z16.s\n" + "fmin z27.s, p2/M, z27.s, z16.s\n" + "fmin z28.s, p2/M, z28.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, 
p2/M, z25.s, z17.s\n" + "fmax z26.s, p2/M, z26.s, z17.s\n" + "fmax z27.s, p2/M, z27.s, z17.s\n" + "fmax z28.s, p2/M, z28.s, z17.s\n" + "fmin z29.s, p2/M, z29.s, z16.s\n" + "fmin z30.s, p2/M, z30.s, z16.s\n" + "fmax z29.s, p2/M, z29.s, z17.s\n" + "fmax z30.s, p2/M, z30.s, z17.s\n" + "97:" // Height 7: No activation + "st1w { z24.s }, p1, [x17]\n" + "addvl x17, x17, #1\n" + "st1w { z25.s }, p1, [x13]\n" + "addvl x13, x13, #1\n" + "st1w { z26.s }, p1, [x11]\n" + "addvl x11, x11, #1\n" + "st1w { z27.s }, p1, [x9]\n" + "addvl x9, x9, #1\n" + "st1w { z28.s }, p1, [x27]\n" + "addvl x27, x27, #1\n" + "st1w { z29.s }, p1, [x25]\n" + "addvl x25, x25, #1\n" + "st1w { z30.s }, p1, [x23]\n" + "addvl x23, x23, #1\n" + "98:" // Height 7: Writeback done + "mov x19, #0x0\n" + "incw x19\n" + "subs x6, x6, x19\n" + "bgt 87b\n" + "b 114f\n" + "99:" // Height 8 + "ldr x6, [%x[args_ptr], %[offsetof_N]]\n" + "mov x8, %x[bias]\n" + "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 100f\n" + "ldr x17, [%x[output_ptr], #0x0]\n" + "add x17, x17, x19, LSL #2\n" + "ldr x13, [%x[output_ptr], #0x8]\n" + "ldr x11, [%x[output_ptr], #0x10]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x18]\n" + "ldr x27, [%x[output_ptr], #0x20]\n" + "add x11, x11, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x28]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x23, [%x[output_ptr], #0x30]\n" + "ldr x21, [%x[output_ptr], #0x38]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add %x[output_ptr], %x[output_ptr], #0x40\n" + "add x23, x23, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 101f\n" + "100:" // Height 8: setup direct output + "mov x17, %x[output_ptr]\n" + "add x13, x17, x19, LSL #2\n" + "add x11, x13, x19, LSL #2\n" + "add x9, x11, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "add x21, x23, x19, LSL #2\n" + "add %x[output_ptr], x21, x19, LSL #2\n" + "101:" // Height 8: Column loop + "mov x19, #0x0\n" + "whilelt p1.s, x19, x6\n" + "cbz x8, 102f\n" + "ld1w { z24.s }, p2/Z, [x8]\n" + "mov z25.d, z24.d\n" + "addvl x8, x8, #1\n" + "mov z26.d, z24.d\n" + "mov z27.d, z24.d\n" + "mov z28.d, z24.d\n" + "mov z29.d, z24.d\n" + "mov z30.d, z24.d\n" + "mov z31.d, z24.d\n" + "b 104f\n" + "102:" // Height 8: no bias + "tbz %x[flags], #0, 103f\n" + "ld1w { z24.s }, p1/Z, [x17]\n" + "ld1w { z25.s }, p1/Z, [x13]\n" + "ld1w { z26.s }, p1/Z, [x11]\n" + "ld1w { z27.s }, p1/Z, [x9]\n" + "ld1w { z28.s }, p1/Z, [x27]\n" + "ld1w { z29.s }, p1/Z, [x25]\n" + "ld1w { z30.s }, p1/Z, [x23]\n" + "ld1w { z31.s }, p1/Z, [x21]\n" + "b 104f\n" + "103:" // Height 8: no accumulate + "mov z24.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z28.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z30.b, #0x0\n" + "mov z31.b, #0x0\n" + "104:" // Height 8: setup done + "mov x16, #0x0\n" + "105:" // Height 8: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w15, [x20, x16, LSL #0x2]\n" + "tbz %x[flags], #3, 106f\n" + "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x14, [x20, #0x0]\n" + "ldr x12, [x20, #0x8]\n" + "ldr x10, [x20, #0x10]\n" + "ldr x28, [x20, #0x18]\n" + "ldr x26, [x20, #0x20]\n" + "ldr x24, [x20, #0x28]\n" + "ldr x22, [x20, #0x30]\n" + "ldr x20, [x20, #0x38]\n" + "cbnz x16, 107f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + 
"add x14, x14, x19, LSL #2\n" + "add x12, x12, x19, LSL #2\n" + "add x10, x10, x19, LSL #2\n" + "add x28, x28, x19, LSL #2\n" + "add x26, x26, x19, LSL #2\n" + "add x24, x24, x19, LSL #2\n" + "add x22, x22, x19, LSL #2\n" + "add x20, x20, x19, LSL #2\n" + "b 107f\n" + "106:" // Height 8: setup direct input + "mov x14, %x[input_ptr]\n" + "add x12, x14, x19, LSL #2\n" + "add x10, x12, x19, LSL #2\n" + "add x28, x10, x19, LSL #2\n" + "add x26, x28, x19, LSL #2\n" + "add x24, x26, x19, LSL #2\n" + "add x22, x24, x19, LSL #2\n" + "add x20, x22, x19, LSL #2\n" + "107:" // Height 8: input setup done + "cmp x15, #0x4\n" + "ble 109f\n" + "108:" // Height 8: Multiply loop: Main loop head + "ld1w { z8.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n" + "sub x15, x15, #0x4\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z8.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z8.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x10]\n" + "add x12, x12, #0x10\n" + "fmla z24.s, z9.s, z0.s[1]\n" + "ld1rqw { z3.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z26.s, z8.s, z2.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z27.s, z8.s, z3.s[0]\n" + "ld1rqw { z5.s }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z25.s, z9.s, z1.s[1]\n" + "ld1rqw { z6.s }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "fmla z28.s, z8.s, z4.s[0]\n" + "ld1rqw { z7.s }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "fmla z29.s, z8.s, z5.s[0]\n" + "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "fmla z30.s, z8.s, z6.s[0]\n" + "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n" + "cmp x15, #0x4\n" + "fmla z31.s, z8.s, z7.s[0]\n" + "prfm pldl1keep, [x14, #0x80]\n" + "addvl x7, x7, #4\n" + "fmla z26.s, z9.s, z2.s[1]\n" + "prfm pldl1keep, [x12, #0x80]\n" + "fmla z27.s, z9.s, z3.s[1]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "fmla z28.s, z9.s, z4.s[1]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "fmla z29.s, z9.s, z5.s[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla z30.s, z9.s, z6.s[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla z31.s, z9.s, z7.s[1]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla z24.s, z10.s, z0.s[2]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "fmla z25.s, z10.s, z1.s[2]\n" + "fmla z26.s, z10.s, z2.s[2]\n" + "fmla z27.s, z10.s, z3.s[2]\n" + "fmla z28.s, z10.s, z4.s[2]\n" + "fmla z29.s, z10.s, z5.s[2]\n" + "fmla z30.s, z10.s, z6.s[2]\n" + "fmla z31.s, z10.s, z7.s[2]\n" + "fmla z24.s, z11.s, z0.s[3]\n" + "fmla z25.s, z11.s, z1.s[3]\n" + "fmla z26.s, z11.s, z2.s[3]\n" + "fmla z27.s, z11.s, z3.s[3]\n" + "fmla z28.s, z11.s, z4.s[3]\n" + "fmla z29.s, z11.s, z5.s[3]\n" + "fmla z30.s, z11.s, z6.s[3]\n" + "fmla z31.s, z11.s, z7.s[3]\n" + "bgt 108b\n" + "109:" // Height 8: Multiply loop: Single iteration only + "ld1w { z12.s }, p2/Z, [x7]\n" + "whilelt p0.s, XZR, x15\n" + "subs x15, x15, #0x1\n" + "ld1rqw { z0.s }, p0/Z, [x14]\n" + "fmla z24.s, z12.s, z0.s[0]\n" + "ld1rqw { z1.s }, p0/Z, [x12]\n" + "add x14, x14, #0x10\n" + "fmla z25.s, z12.s, z1.s[0]\n" + "ld1rqw { z2.s }, p0/Z, [x10]\n" + "add x12, x12, #0x10\n" + "fmla z26.s, z12.s, z2.s[0]\n" + "ld1rqw { z3.s }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "fmla z27.s, z12.s, z3.s[0]\n" + "ld1rqw { z4.s }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "fmla z28.s, z12.s, z4.s[0]\n" + "ld1rqw { z5.s }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "fmla z29.s, z12.s, z5.s[0]\n" + "ld1rqw { z6.s }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "fmla z30.s, z12.s, z6.s[0]\n" + 
"ld1rqw { z7.s }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "fmla z31.s, z12.s, z7.s[0]\n" + "add x20, x20, #0x10\n" + "addvl x7, x7, #1\n" + "ble 110f\n" + "ld1w { z13.s }, p2/Z, [x7]\n" + "fmla z24.s, z13.s, z0.s[1]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z13.s, z1.s[1]\n" + "addvl x7, x7, #1\n" + "fmla z26.s, z13.s, z2.s[1]\n" + "fmla z27.s, z13.s, z3.s[1]\n" + "fmla z28.s, z13.s, z4.s[1]\n" + "fmla z29.s, z13.s, z5.s[1]\n" + "fmla z30.s, z13.s, z6.s[1]\n" + "fmla z31.s, z13.s, z7.s[1]\n" + "ble 110f\n" + "ld1w { z14.s }, p2/Z, [x7]\n" + "fmla z24.s, z14.s, z0.s[2]\n" + "subs x15, x15, #0x1\n" + "fmla z25.s, z14.s, z1.s[2]\n" + "addvl x7, x7, #1\n" + "fmla z26.s, z14.s, z2.s[2]\n" + "fmla z27.s, z14.s, z3.s[2]\n" + "fmla z28.s, z14.s, z4.s[2]\n" + "fmla z29.s, z14.s, z5.s[2]\n" + "fmla z30.s, z14.s, z6.s[2]\n" + "fmla z31.s, z14.s, z7.s[2]\n" + "ble 110f\n" + "ld1w { z15.s }, p2/Z, [x7]\n" + "fmla z24.s, z15.s, z0.s[3]\n" + "addvl x7, x7, #1\n" + "fmla z25.s, z15.s, z1.s[3]\n" + "fmla z26.s, z15.s, z2.s[3]\n" + "fmla z27.s, z15.s, z3.s[3]\n" + "fmla z28.s, z15.s, z4.s[3]\n" + "fmla z29.s, z15.s, z5.s[3]\n" + "fmla z30.s, z15.s, z6.s[3]\n" + "fmla z31.s, z15.s, z7.s[3]\n" + "110:" // Height 8: Multiply loop: multiply skip + "prfm pldl1keep, [x14, #0x80]\n" + "add x16, x16, #0x1\n" + "prfm pldl1keep, [x12, #0x80]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x16, x19\n" + "bne 105b\n" + "prfm pstl1keep, [x17, #0x0]\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x11, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbz %x[flags], #1, 111f\n" + "add x19, %x[args_ptr], %[offset_min]\n" + "ld1rw { z17.s }, p2/Z, [x19]\n" + "add x19, %x[args_ptr], %[offset_max]\n" + "ld1rw { z16.s }, p2/Z, [x19]\n" + "fmin z24.s, p2/M, z24.s, z16.s\n" + "fmin z25.s, p2/M, z25.s, z16.s\n" + "fmin z26.s, p2/M, z26.s, z16.s\n" + "fmin z27.s, p2/M, z27.s, z16.s\n" + "fmin z28.s, p2/M, z28.s, z16.s\n" + "fmax z24.s, p2/M, z24.s, z17.s\n" + "fmax z25.s, p2/M, z25.s, z17.s\n" + "fmax z26.s, p2/M, z26.s, z17.s\n" + "fmax z27.s, p2/M, z27.s, z17.s\n" + "fmax z28.s, p2/M, z28.s, z17.s\n" + "fmin z29.s, p2/M, z29.s, z16.s\n" + "fmin z30.s, p2/M, z30.s, z16.s\n" + "fmin z31.s, p2/M, z31.s, z16.s\n" + "fmax z29.s, p2/M, z29.s, z17.s\n" + "fmax z30.s, p2/M, z30.s, z17.s\n" + "fmax z31.s, p2/M, z31.s, z17.s\n" + "111:" // Height 8: No activation + "st1w { z24.s }, p1, [x17]\n" + "addvl x17, x17, #1\n" + "st1w { z25.s }, p1, [x13]\n" + "addvl x13, x13, #1\n" + "st1w { z26.s }, p1, [x11]\n" + "addvl x11, x11, #1\n" + "st1w { z27.s }, p1, [x9]\n" + "addvl x9, x9, #1\n" + "st1w { z28.s }, p1, [x27]\n" + "addvl x27, x27, #1\n" + "st1w { z29.s }, p1, [x25]\n" + "addvl x25, x25, #1\n" + "st1w { z30.s }, p1, [x23]\n" + "addvl x23, x23, #1\n" + "st1w { z31.s }, p1, [x21]\n" + "addvl x21, x21, #1\n" + "112:" // Height 8: Writeback done + "mov x19, #0x0\n" + "incw x19\n" + "subs x6, x6, x19\n" + "bgt 101b\n" + "subs %x[M], %x[M], #0x8\n" + "beq 114f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 113f\n" + "add x20, x20, #0x8\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "113:" // Update direct input + "mov 
x19, #0x20\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "114:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "p0", "p1", "p2", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp deleted file mode 100644 index 1364585604..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp +++ /dev/null @@ -1,3459 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
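- */

The sve_hybrid_fp32 kernel added above passes almost every parameter through a single argument block: the C++ side hands the asm one base pointer ([args_ptr] "r" (&ka)) plus compile-time field offsets as "I" immediates (offsetof(KernelArgs, maxval) and friends), so each field is reached at a fixed offset from one register. A minimal, self-contained sketch of the same pattern follows; this KernelArgs subset and its field order are illustrative, not the library's actual layout.

    #include <cstddef>
    #include <cstdio>

    // Illustrative argument block: the kernel receives &args in one register
    // and addresses each field with a constant offsetof() immediate, so
    // adding a parameter never costs another input register.
    struct KernelArgs {
        size_t N;
        const float *B_ptr;
        float maxval;
        float minval;
        size_t input_offset;
        size_t output_offset;
    };

    int main() {
        std::printf("maxval sits at args_ptr + %zu\n",
                    offsetof(KernelArgs, maxval));
        return 0;
    }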
-#ifdef __ARM_FEATURE_SVE - -#include <algorithm> - -#include "arm_gemm.hpp" - -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void sve_hybrid_fp32_mmla_4VLx4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) { - const int K_stride = ((K + 1) / 2) * 2; - const long loops_count = ((K + 4) / 8) - 1; - K -= loops_count * 8; - const long regs_count = (K / 4) - 1; - K -= (regs_count + 1) * 4; - const long leftovers = K; - const long blocks_count = (K + 1) / 2; - float nullbias[128]; - if (!accumulate && !bias) { - memset(nullbias, 0, (2 * get_vector_length<float>() * sizeof(float))); - } - float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); - float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); - const float * const minptr = &minval; - const float * const maxptr = &maxval; - - switch(act.type) - { - default: - case Activation::Type::None: - break; - case Activation::Type::BoundedReLU: - maxval = static_cast<float>(act.param1); - /* fall through */ - case Activation::Type::ReLU: - minval = 0.0f; - break; - } - - int rows_to_compute; - - for (int y=0; y<M; y+=rows_to_compute) { - const float * const a_ptr0_base = A + (y * lda); - const unsigned long ldab = lda * sizeof(float); - - float *c_ptr0 = C + (y * ldc); - - rows_to_compute = M-y; - if (rows_to_compute > 8) { - if (rows_to_compute % 8) { - rows_to_compute = 8 - 1; - } else { - rows_to_compute = 8; - } - } - - for (int x0=0; x0<N; x0+=(2 * get_vector_length<float>())) { - const long width = std::min((unsigned long)N-x0, (2 * get_vector_length<float>())); - long loops = loops_count; - long regs = regs_count; - long temp = 0; - long blocks = blocks_count; - const float *a_ptr0 = a_ptr0_base; - const float *b_ptr0 = B + (K_stride * x0); - const unsigned long ldcb = ldc * sizeof(float); - const float *biasptr = bias ? bias+x0 : nullbias; - - switch(rows_to_compute) { - case 1: - __asm __volatile ( - "whilelt p6.s, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.s\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z1.s, #0\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "zip1 z18.s, z15.s, z15.s\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "mov z14.s, #0\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "mov z1.s, #0\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "mov z14.s, #0\n" - "zip1 z18.s, z13.s, z14.s\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
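The deleted prologue above splits K into a peeled 8-wide main loop ("loops"), a 4-wide register tail ("regs"), a predicated remainder ("leftovers") and 2-element FMMLA blocks ("blocks"). A standalone restatement of that arithmetic, with an illustrative decompose_k helper that is not part of the library:

    #include <cstdio>

    // Mirrors the K decomposition in the deleted kernel. The -1 terms account
    // for iterations the asm peels off; the residue is covered two K values
    // at a time by FMMLA blocks under a whilelt predicate.
    struct KSplit {
        long loops;
        long regs;
        long leftovers;
        long blocks;
    };

    KSplit decompose_k(int K) {
        KSplit s;
        s.loops = ((K + 4) / 8) - 1;
        K -= s.loops * 8;
        s.regs = (K / 4) - 1;
        K -= (s.regs + 1) * 4;
        s.leftovers = K;
        s.blocks = (K + 1) / 2;
        return s;
    }

    int main() {
        const KSplit s = decompose_k(19);
        std::printf("loops=%ld regs=%ld leftovers=%ld blocks=%ld\n",
                    s.loops, s.regs, s.leftovers, s.blocks);
        return 0;
    }

- "ld1w 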
z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z5.s, #0\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z0.d, z4.d, z5.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "mov z1.s, #0\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z5.s, #0\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z0.d, z4.d, z5.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "mov z1.s, #0\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "trn1 z8.d, z0.d, z1.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, 
z8.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z5.s, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - "cbz %[blocks], 5f\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp1 z1.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "add a_ptr1, 
%[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "whilelt p6.s, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.s\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "zip1 z18.s, z15.s, z15.s\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip1 z18.s, z13.s, z14.s\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "trn1 z0.d, z4.d, z5.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "subs %[loops], %[loops], #0x1\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL 
VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "trn1 z0.d, z4.d, z5.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "trn1 z8.d, z0.d, z1.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "addvl 
a_ptr1, a_ptr1, #1\n" - "trn1 z0.d, z4.d, z5.d\n" - "cbz %[blocks], 5f\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "st1w z1.s, p0, [c_ptr1]\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "whilelt p6.s, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.s\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z3.s, #0\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn1 z9.d, z2.d, z3.d\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "mov z20.d, z16.d\n" - "mov z21.d, z17.d\n" - "mov z22.d, z18.d\n" - "mov z23.d, z19.d\n"
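Every height case funnels into the same "5:" epilogue: ld1rw broadcasts the clamp bounds, fmax then fmin clamp every accumulator, and uzp1/uzp2 de-interleave the paired rows before st1w. Because BoundedReLU falls through to ReLU in the prologue's switch, one (min, max) pair encodes None, ReLU and BoundedReLU alike. A scalar sketch of both steps, assuming the same Activation semantics as the prologue:

    #include <algorithm>
    #include <limits>

    // Derive the clamp pair the way the deleted prologue does: None keeps
    // (-inf, +inf); BoundedReLU lowers the upper bound and falls through to
    // ReLU, which raises the lower bound to zero.
    void activation_bounds(bool bounded_relu, bool relu, float param1,
                           float &minval, float &maxval) {
        minval = -std::numeric_limits<float>::infinity();
        maxval =  std::numeric_limits<float>::infinity();
        if (bounded_relu) {
            maxval = param1;
        }
        if (bounded_relu || relu) {
            minval = 0.0f;
        }
    }

    // Scalar equivalent of the fmax-then-fmin pass over each accumulator.
    float clamp_activation(float x, float minval, float maxval) {
        return std::min(std::max(x, minval), maxval);
    }

- "cbz %[loops], 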
2f\n" - "b 3f\n" - "1:\n" - "mov z3.s, #0\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip1 z18.s, z13.s, z14.s\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "mov z14.s, #0\n" - "zip1 z20.s, z13.s, z14.s\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "mov z14.s, #0\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "add a_ptr1, a_ptr1, #0x20\n" - "trn2 z8.d, z4.d, z5.d\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z7.s, #0\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "trn2 z9.d, z6.d, z7.d\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z1.d, z6.d, z7.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "mov z3.s, #0\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 
0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z9.d, z2.d, z3.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "trn2 z8.d, z4.d, z5.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z7.s, #0\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "trn2 z9.d, z6.d, z7.d\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z1.d, z6.d, z7.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "mov z3.s, #0\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "trn1 z9.d, z2.d, z3.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, 
z0.d, z1.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z6.s, p6/z, [a_ptr2]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z7.s, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - "cbz %[blocks], 5f\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, 
[%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp1 z5.s, z22.s, z23.s\n" - "st1w z4.s, p0, [c_ptr2]\n" - "st1w z5.s, p1, [c_ptr2, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "whilelt p6.s, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.s\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z20.d, z16.d\n"
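Neither the added nor the deleted kernel has a scalar tail loop; whilelt predicates stand in for one. Here p0 and p1 mask the two vector-wide column chunks against width, and p6 masks the final K fragment against leftovers, so partial tiles run through the same loads, multiplies and stores as full ones. A scalar model of the lane mask whilelt computes (the lanes parameter stands in for the hardware vector length):

    #include <vector>

    // whilelt p, base, limit: lane i is active exactly when base + i < limit.
    // Inactive lanes read as zero on predicated loads and are skipped on
    // predicated stores, which is what makes the tails safe.
    std::vector<bool> whilelt(long base, long limit, int lanes) {
        std::vector<bool> p(static_cast<size_t>(lanes));
        for (int i = 0; i < lanes; ++i) {
            p[static_cast<size_t>(i)] = (base + i) < limit;
        }
        return p;
    }

- "add a_ptr3, a_ptr3, 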
#0x10\n" - "mov z21.d, z17.d\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "mov z22.d, z18.d\n" - "mov z23.d, z19.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z14.s, p0/z, [c_ptr3]\n" - "zip1 z20.s, z13.s, z14.s\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "trn2 z8.d, z4.d, z5.d\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn2 z9.d, z6.d, z7.d\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z1.d, z6.d, z7.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 
0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z9.d, z2.d, z3.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - "trn2 z8.d, z4.d, z5.d\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn2 z9.d, z6.d, z7.d\n" - "addvl a_ptr3, a_ptr3, #2\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z1.d, z6.d, z7.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x64a8e5f3 // 
fmmla z19.s, z15.s, z8.s\n" - "addvl a_ptr1, a_ptr1, #2\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "trn1 z9.d, z2.d, z3.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z6.s, p6/z, [a_ptr2]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1rqw z7.s, p6/z, [a_ptr3]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - "cbz %[blocks], 5f\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a1e594 // fmmla 
z20.s, z12.s, z1.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp2 z5.s, z20.s, z21.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "st1w z4.s, p0, [c_ptr2]\n" - "uzp2 z7.s, z22.s, z23.s\n" - "st1w z5.s, p0, [c_ptr3]\n" - "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" - "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - case 5: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "c_ptr1 .req X4\n" - "c_ptr2 .req X5\n" - "c_ptr3 .req X6\n" - "c_ptr4 .req X7\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "whilelt p6.s, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.s\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z5.s, #0\n" - "ld1w z15.s, 
p0/z, [%[biasptr]]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z20.d, z16.d\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov z21.d, z17.d\n" - "add a_ptr4, a_ptr4, #0x10\n" - "mov z22.d, z18.d\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "mov z23.d, z19.d\n" - "mov z24.d, z16.d\n" - "mov z25.d, z17.d\n" - "mov z26.d, z18.d\n" - "mov z27.d, z19.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "mov z5.s, #0\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z14.s, p0/z, [c_ptr3]\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "add a_ptr4, a_ptr4, #0x10\n" - "zip1 z20.s, z13.s, z14.s\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr4]\n" - "mov z14.s, #0\n" - "zip1 z24.s, z13.s, z14.s\n" - "zip2 z25.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n" - "mov z14.s, #0\n" - "zip1 z26.s, z13.s, z14.s\n" - "zip2 z27.s, z13.s, z14.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z8.s, p7/z, [a_ptr4]\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64aae5b9 // 
fmmla z25.s, z13.s, z10.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z9.s, #0\n" - "add a_ptr4, a_ptr4, #0x20\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "trn2 z10.d, z8.d, z9.d\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z2.d, z8.d, z9.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z9.d, z6.d, z7.d\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "mov z5.s, #0\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z10.d, z4.d, z5.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - 
".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl a_ptr3, a_ptr3, #2\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z8.s, p7/z, [a_ptr4]\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z9.s, #0\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "trn2 z10.d, z8.d, z9.d\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z2.d, z8.d, z9.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z9.d, z6.d, z7.d\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "addvl a_ptr4, a_ptr4, #2\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "mov z5.s, #0\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, 
z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "trn1 z10.d, z4.d, z5.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z6.s, p6/z, [a_ptr2]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqw z7.s, p6/z, [a_ptr3]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z8.s, p6/z, [a_ptr4]\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "addvl a_ptr4, a_ptr4, #1\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z9.s, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // 
fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "trn1 z2.d, z8.d, z9.d\n" - "cbz %[blocks], 5f\n" - "trn2 z10.d, z8.d, z9.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp2 z5.s, z20.s, z21.s\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "st1w z4.s, p0, [c_ptr2]\n" - "uzp2 z7.s, z22.s, z23.s\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "st1w z5.s, p0, [c_ptr3]\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "st1w z6.s, p1, 
[c_ptr2, #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "uzp1 z8.s, z24.s, z25.s\n" - "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" - "uzp1 z9.s, z26.s, z27.s\n" - "st1w z8.s, p0, [c_ptr4]\n" - "st1w z9.s, p1, [c_ptr4, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory" - ); - break; - case 6: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "c_ptr1 .req X5\n" - "c_ptr2 .req X6\n" - "c_ptr3 .req X7\n" - "c_ptr4 .req X8\n" - "c_ptr5 .req X9\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "whilelt p6.s, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.s\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z20.d, z16.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z21.d, z17.d\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov z22.d, z18.d\n" - "add a_ptr4, a_ptr4, #0x10\n" - "mov z23.d, z19.d\n" - "add a_ptr5, a_ptr5, #0x10\n" - "mov z24.d, z16.d\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "mov z25.d, z17.d\n" - "mov z26.d, z18.d\n" - "mov z27.d, z19.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "add a_ptr3, 
a_ptr3, #0x10\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z14.s, p0/z, [c_ptr3]\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "add a_ptr4, a_ptr4, #0x10\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip1 z20.s, z13.s, z14.s\n" - "add a_ptr5, a_ptr5, #0x10\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr4]\n" - "ld1w z14.s, p0/z, [c_ptr5]\n" - "zip1 z24.s, z13.s, z14.s\n" - "zip2 z25.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n" - "zip1 z26.s, z13.s, z14.s\n" - "zip2 z27.s, z13.s, z14.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "subs %[loops], %[loops], #0x1\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z8.s, p7/z, [a_ptr4]\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1rqw z9.s, p7/z, [a_ptr5]\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn2 z10.d, z8.d, z9.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "add a_ptr4, a_ptr4, #0x20\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "add a_ptr5, a_ptr5, #0x20\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z2.d, z8.d, z9.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z9.d, z6.d, z7.d\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n" - 
".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z10.d, z4.d, z5.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl a_ptr3, a_ptr3, #2\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z8.s, p7/z, [a_ptr4]\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1rqw z9.s, p7/z, [a_ptr5]\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "trn2 z10.d, z8.d, z9.d\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - "ld1w z12.s, p7/z, 
[%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z2.d, z8.d, z9.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z9.d, z6.d, z7.d\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "addvl a_ptr4, a_ptr4, #2\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "addvl a_ptr5, a_ptr5, #2\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "addvl a_ptr2, a_ptr2, #2\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "trn1 z10.d, z4.d, z5.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, 
z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1rqw z6.s, p6/z, [a_ptr2]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1rqw z7.s, p6/z, [a_ptr3]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z8.s, p6/z, [a_ptr4]\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "addvl a_ptr4, a_ptr4, #1\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1rqw z9.s, p6/z, [a_ptr5]\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "addvl a_ptr5, a_ptr5, #1\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "trn1 z2.d, z8.d, z9.d\n" - "cbz %[blocks], 5f\n" - "trn2 z10.d, z8.d, z9.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "b.eq 5f\n" 
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp2 z5.s, z20.s, z21.s\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "st1w z4.s, p0, [c_ptr2]\n" - "uzp2 z7.s, z22.s, z23.s\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "st1w z5.s, p0, [c_ptr3]\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "uzp1 z8.s, z24.s, z25.s\n" - "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" - "uzp2 z9.s, z24.s, z25.s\n" - "uzp1 z10.s, z26.s, z27.s\n" - "uzp2 z11.s, z26.s, z27.s\n" - "st1w z8.s, p0, [c_ptr4]\n" - "st1w z9.s, p0, [c_ptr5]\n" - "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n" - "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory" - ); - break; - case 7: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 
.req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "c_ptr1 .req X6\n" - "c_ptr2 .req X7\n" - "c_ptr3 .req X8\n" - "c_ptr4 .req X9\n" - "c_ptr5 .req X10\n" - "c_ptr6 .req X11\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "whilelt p6.s, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.s\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z7.s, #0\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "mov z20.d, z16.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z11.d, z6.d, z7.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z21.d, z17.d\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov z22.d, z18.d\n" - "add a_ptr4, a_ptr4, #0x10\n" - "mov z23.d, z19.d\n" - "add a_ptr5, a_ptr5, #0x10\n" - "mov z24.d, z16.d\n" - "add a_ptr6, a_ptr6, #0x10\n" - "mov z25.d, z17.d\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "mov z26.d, z18.d\n" - "mov z27.d, z19.d\n" - "mov z28.d, z16.d\n" - "mov z29.d, z17.d\n" - "mov z30.d, z18.d\n" - "mov z31.d, z19.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "mov z7.s, #0\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z14.s, p0/z, [c_ptr3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "add a_ptr4, a_ptr4, #0x10\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "zip1 z20.s, z13.s, z14.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "trn1 z11.d, z6.d, z7.d\n" - "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "add a_ptr5, a_ptr5, #0x10\n" - "add a_ptr6, a_ptr6, #0x10\n" - "zip1 z22.s, z13.s, z14.s\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr4]\n" - "ld1w z14.s, p0/z, [c_ptr5]\n" - "zip1 z24.s, z13.s, z14.s\n" - "zip2 z25.s, z13.s, z14.s\n" - 
"ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n" - "zip1 z26.s, z13.s, z14.s\n" - "zip2 z27.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr6]\n" - "mov z14.s, #0\n" - "zip1 z28.s, z13.s, z14.s\n" - "zip2 z29.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n" - "mov z14.s, #0\n" - "zip1 z30.s, z13.s, z14.s\n" - "zip2 z31.s, z13.s, z14.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "subs %[loops], %[loops], #0x1\n" - "trn2 z1.d, z2.d, z3.d\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z8.s, p7/z, [a_ptr4]\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1rqw z9.s, p7/z, [a_ptr5]\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - "add a_ptr4, a_ptr4, #0x20\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - "add a_ptr5, a_ptr5, #0x20\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1rqw z10.s, p7/z, [a_ptr6]\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z11.s, #0\n" - "add a_ptr6, a_ptr6, #0x20\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "trn1 z2.d, z8.d, z9.d\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z3.d, z10.d, z11.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z11.d, z10.d, z11.d\n" - "trn2 z10.d, z8.d, z9.d\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #-0x10]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4, 
#-0x10]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "mov z7.s, #0\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "trn1 z10.d, z4.d, z5.d\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z11.d, z6.d, z7.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z8.s, p7/z, [a_ptr4]\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1rqw z9.s, p7/z, [a_ptr5]\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1rqw z10.s, p7/z, [a_ptr6]\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - "ld1w 
z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z11.s, #0\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "trn1 z2.d, z8.d, z9.d\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z3.d, z10.d, z11.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z11.d, z10.d, z11.d\n" - "trn2 z10.d, z8.d, z9.d\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "addvl a_ptr4, a_ptr4, #2\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "addvl a_ptr5, a_ptr5, #2\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - "addvl a_ptr6, a_ptr6, #2\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "mov z7.s, #0\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl a_ptr3, a_ptr3, #2\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" 
- "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "trn1 z10.d, z4.d, z5.d\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "trn1 z11.d, z6.d, z7.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z6.s, p6/z, [a_ptr2]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1rqw z7.s, p6/z, [a_ptr3]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z8.s, p6/z, [a_ptr4]\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1rqw 
z9.s, p6/z, [a_ptr5]\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - "addvl a_ptr4, a_ptr4, #1\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - "addvl a_ptr5, a_ptr5, #1\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1rqw z10.s, p6/z, [a_ptr6]\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z11.s, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "addvl a_ptr6, a_ptr6, #1\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "trn1 z2.d, z8.d, z9.d\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "trn1 z3.d, z10.d, z11.d\n" - "cbz %[blocks], 5f\n" - "trn2 z11.d, z10.d, z11.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn2 z10.d, z8.d, z9.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - ".inst 0x64aae598 // fmmla 
z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp2 z5.s, z20.s, z21.s\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "st1w z4.s, p0, [c_ptr2]\n" - "uzp2 z7.s, z22.s, z23.s\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "st1w z5.s, p0, [c_ptr3]\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "fmax z28.s, p7/m, z28.s, z14.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "fmin z28.s, p7/m, z28.s, z15.s\n" - "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" - "uzp1 z8.s, z24.s, z25.s\n" - "uzp2 z9.s, z24.s, z25.s\n" - "uzp1 z10.s, z26.s, z27.s\n" - "uzp2 z11.s, z26.s, z27.s\n" - "st1w z8.s, p0, [c_ptr4]\n" - "fmax z29.s, p7/m, z29.s, z14.s\n" - "fmax z30.s, p7/m, z30.s, z14.s\n" - "fmax z31.s, p7/m, z31.s, z14.s\n" - "st1w z9.s, p0, [c_ptr5]\n" - "fmin z29.s, p7/m, z29.s, z15.s\n" - "fmin z30.s, p7/m, z30.s, z15.s\n" - "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n" - "fmin z31.s, p7/m, z31.s, z15.s\n" - "uzp1 z12.s, z28.s, z29.s\n" - "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n" - "uzp1 z13.s, z30.s, z31.s\n" - "st1w z12.s, p0, [c_ptr6]\n" - "st1w z13.s, p1, [c_ptr6, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "cc", "memory" - ); - break; 
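
Reviewer note: the hunks above and below delete the hand-written fp32 MMLA inner loops (7-row case above, 8-row default case below). For orientation while reading them: each `.inst 0x64a...` word is an encoded `fmmla`, which treats every 128-bit segment of its operands as a row-major 2x2 fp32 tile and accumulates the product of the first tile with the transpose of the second. That is why pairs of A rows are interleaved with `trn1`/`trn2` on entry (with `mov z..., #0` padding the odd row), accumulators are seeded from C with `zip1`/`zip2`, and results are de-interleaved with `uzp1`/`uzp2` before `st1w`; the `fmax`/`fmin` against `[minptr]`/`[maxptr]` at label 5 is the fused activation clamp. A minimal scalar sketch of one tile update follows; the helper name is illustrative, not part of the patch, and the exact register-to-operand mapping is simplified:

    // Scalar model of one fp32 FMMLA tile update, applied per 128-bit
    // segment: acc += x * y^T.  The kernel keeps two C rows interleaved in
    // each accumulator register so one fmmla updates a 2x2 output block.
    static void fmmla_tile(float acc[2][2], const float x[2][2], const float y[2][2])
    {
        for (int i = 0; i < 2; i++) {
            for (int j = 0; j < 2; j++) {
                acc[i][j] += x[i][0] * y[j][0] + x[i][1] * y[j][1];
            }
        }
    }
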
- default: - case 8: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "whilelt p6.s, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.s\n" - "whilelt p1.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "ld1w z15.s, p0/z, [%[biasptr]]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z15.s, z15.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "zip2 z17.s, z15.s, z15.s\n" - "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "zip1 z18.s, z15.s, z15.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "zip2 z19.s, z15.s, z15.s\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "mov z20.d, z16.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z21.d, z17.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn1 z11.d, z6.d, z7.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z22.d, z18.d\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov z23.d, z19.d\n" - "add a_ptr4, a_ptr4, #0x10\n" - "mov z24.d, z16.d\n" - "add a_ptr5, a_ptr5, #0x10\n" - "mov z25.d, z17.d\n" - "add a_ptr6, a_ptr6, #0x10\n" - "mov z26.d, z18.d\n" - "add a_ptr7, a_ptr7, #0x10\n" - "mov z27.d, z19.d\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "mov z28.d, z16.d\n" - "mov z29.d, z17.d\n" - "mov z30.d, z18.d\n" - "mov z31.d, z19.d\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z13.s, p0/z, [%[c_ptr0]]\n" - "ld1w z14.s, p0/z, [c_ptr1]\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqw z1.s, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "zip1 z16.s, z13.s, z14.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2]\n" - "zip2 z17.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "trn1 z8.d, z0.d, z1.d\n" - "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1rqw z3.s, p7/z, [a_ptr3]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1rqw z4.s, p7/z, [a_ptr4]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "zip1 z18.s, z13.s, z14.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5]\n" - "zip2 z19.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr2]\n" - "trn1 z9.d, z2.d, z3.d\n" - "ld1w z14.s, p0/z, [c_ptr3]\n" - "ld1rqw z6.s, p7/z, [a_ptr6]\n" - "add a_ptr4, a_ptr4, #0x10\n" - "trn1 z10.d, z4.d, z5.d\n" - "ld1rqw z7.s, p7/z, [a_ptr7]\n" - "zip1 z20.s, z13.s, z14.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "zip2 z21.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "add a_ptr5, a_ptr5, #0x10\n" - "trn1 z11.d, z6.d, z7.d\n" - "ld1w z15.s, p7/z, 
[%[b_ptr0], #3, MUL VL]\n" - "add a_ptr6, a_ptr6, #0x10\n" - "zip1 z22.s, z13.s, z14.s\n" - "add a_ptr7, a_ptr7, #0x10\n" - "zip2 z23.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr4]\n" - "ld1w z14.s, p0/z, [c_ptr5]\n" - "zip1 z24.s, z13.s, z14.s\n" - "zip2 z25.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n" - "zip1 z26.s, z13.s, z14.s\n" - "zip2 z27.s, z13.s, z14.s\n" - "ld1w z13.s, p0/z, [c_ptr6]\n" - "ld1w z14.s, p0/z, [c_ptr7]\n" - "zip1 z28.s, z13.s, z14.s\n" - "zip2 z29.s, z13.s, z14.s\n" - "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n" - "ld1w z14.s, p1/z, [c_ptr7, #1, MUL VL]\n" - "zip1 z30.s, z13.s, z14.s\n" - "zip2 z31.s, z13.s, z14.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "cbz %[loops], 2f\n" - "3:\n" - "trn2 z0.d, z0.d, z1.d\n" - "subs %[loops], %[loops], #0x1\n" - "trn2 z1.d, z2.d, z3.d\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z8.s, p7/z, [a_ptr4]\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "add a_ptr1, a_ptr1, #0x20\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "add a_ptr2, a_ptr2, #0x20\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "add a_ptr3, a_ptr3, #0x20\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1rqw z9.s, p7/z, [a_ptr5]\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - "add a_ptr4, a_ptr4, #0x20\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - "add a_ptr5, a_ptr5, #0x20\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1rqw z10.s, p7/z, [a_ptr6]\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z11.s, p7/z, [a_ptr7]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "add a_ptr6, a_ptr6, #0x20\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "add a_ptr7, a_ptr7, #0x20\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "trn1 z2.d, z8.d, z9.d\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL 
VL]\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z3.d, z10.d, z11.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z11.d, z10.d, z11.d\n" - "trn2 z10.d, z8.d, z9.d\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1rqw z6.s, p7/z, [a_ptr6, #-0x10]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "ld1rqw z7.s, p7/z, [a_ptr7, #-0x10]\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "trn1 z10.d, z4.d, z5.d\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "trn1 z11.d, z6.d, z7.d\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1rqw z5.s, p7/z, [a_ptr1]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z6.s, p7/z, [a_ptr2]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1rqw z7.s, p7/z, [a_ptr3]\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z8.s, p7/z, [a_ptr4]\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1rqw z9.s, p7/z, [a_ptr5]\n" - 
".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1rqw z10.s, p7/z, [a_ptr6]\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z11.s, p7/z, [a_ptr7]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "trn1 z2.d, z8.d, z9.d\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "trn1 z3.d, z10.d, z11.d\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "trn2 z11.d, z10.d, z11.d\n" - "trn2 z10.d, z8.d, z9.d\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "addvl a_ptr4, a_ptr4, #2\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - "addvl a_ptr5, a_ptr5, #2\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - "addvl a_ptr6, a_ptr6, #2\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - "addvl a_ptr1, a_ptr1, #2\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - "addvl a_ptr7, a_ptr7, #2\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n" - ".inst 0x64a8e5b1 // fmmla 
z17.s, z13.s, z8.s\n" - "addvl %[b_ptr0], %[b_ptr0], #-4\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - "addvl a_ptr2, a_ptr2, #2\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "addvl a_ptr3, a_ptr3, #2\n" - "trn1 z8.d, z0.d, z1.d\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "trn1 z9.d, z2.d, z3.d\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "trn1 z10.d, z4.d, z5.d\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "trn1 z11.d, z6.d, z7.d\n" - "cbz %[blocks], 5f\n" - "trn2 z0.d, z0.d, z1.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn2 z1.d, z2.d, z3.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "b 5f\n" - "4:\n" - "trn2 z0.d, z0.d, z1.d\n" - "trn2 z1.d, z2.d, z3.d\n" - "trn2 z2.d, z4.d, z5.d\n" - "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n" - "trn2 z3.d, z6.d, z7.d\n" - "ld1rqw z5.s, p6/z, [a_ptr1]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - "ld1rqw z6.s, p6/z, [a_ptr2]\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - "ld1rqw z7.s, p6/z, [a_ptr3]\n" - ".inst 0x64a8e5d2 // 
fmmla z18.s, z14.s, z8.s\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - "ld1rqw z8.s, p6/z, [a_ptr4]\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - "addvl a_ptr1, a_ptr1, #1\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - "addvl a_ptr2, a_ptr2, #1\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - "addvl a_ptr3, a_ptr3, #1\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - "ld1rqw z9.s, p6/z, [a_ptr5]\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - "addvl a_ptr4, a_ptr4, #1\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - "addvl a_ptr5, a_ptr5, #1\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - "ld1rqw z10.s, p6/z, [a_ptr6]\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "ld1rqw z11.s, p6/z, [a_ptr7]\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - "addvl a_ptr6, a_ptr6, #1\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - "addvl a_ptr7, a_ptr7, #1\n" - "trn1 z0.d, z4.d, z5.d\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - "trn1 z1.d, z6.d, z7.d\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - "trn1 z2.d, z8.d, z9.d\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "trn1 z3.d, z10.d, z11.d\n" - "cbz %[blocks], 5f\n" - "trn2 z11.d, z10.d, z11.d\n" - "ld1w z12.s, p7/z, [%[b_ptr0]]\n" - "trn2 z10.d, z8.d, z9.d\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "trn2 z9.d, z6.d, z7.d\n" - "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "trn2 z8.d, z4.d, z5.d\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n" - ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n" - "subs %[blocks], %[blocks], #0x1\n" - ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n" - ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n" - ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n" - ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n" - ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n" - ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n" - ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n" - ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n" - ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n" - ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n" - ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n" - ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n" - ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n" - ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n" - ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n" - "b.eq 5f\n" - "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1w z14.s, p7/z, [%[b_ptr0], 
#6, MUL VL]\n" - "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n" - ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n" - ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n" - ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n" - ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n" - ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n" - ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n" - ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n" - ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n" - ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n" - ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n" - ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n" - ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n" - ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n" - ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n" - ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n" - ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n" - "5:\n" - "ld1rw z14.s, p7/z, [%[minptr]]\n" - "ld1rw z15.s, p7/z, [%[maxptr]]\n" - "fmax z16.s, p7/m, z16.s, z14.s\n" - "fmax z17.s, p7/m, z17.s, z14.s\n" - "fmax z18.s, p7/m, z18.s, z14.s\n" - "fmax z19.s, p7/m, z19.s, z14.s\n" - "fmin z16.s, p7/m, z16.s, z15.s\n" - "fmin z17.s, p7/m, z17.s, z15.s\n" - "fmin z18.s, p7/m, z18.s, z15.s\n" - "fmin z19.s, p7/m, z19.s, z15.s\n" - "fmax z20.s, p7/m, z20.s, z14.s\n" - "uzp1 z0.s, z16.s, z17.s\n" - "uzp2 z1.s, z16.s, z17.s\n" - "uzp1 z2.s, z18.s, z19.s\n" - "uzp2 z3.s, z18.s, z19.s\n" - "st1w z0.s, p0, [%[c_ptr0]]\n" - "fmin z20.s, p7/m, z20.s, z15.s\n" - "fmax z21.s, p7/m, z21.s, z14.s\n" - "fmax z22.s, p7/m, z22.s, z14.s\n" - "st1w z1.s, p0, [c_ptr1]\n" - "fmax z23.s, p7/m, z23.s, z14.s\n" - "fmax z24.s, p7/m, z24.s, z14.s\n" - "fmin z21.s, p7/m, z21.s, z15.s\n" - "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "fmin z22.s, p7/m, z22.s, z15.s\n" - "addvl %[c_ptr0], %[c_ptr0], #2\n" - "fmin z23.s, p7/m, z23.s, z15.s\n" - "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n" - "uzp1 z4.s, z20.s, z21.s\n" - "uzp2 z5.s, z20.s, z21.s\n" - "fmin z24.s, p7/m, z24.s, z15.s\n" - "uzp1 z6.s, z22.s, z23.s\n" - "st1w z4.s, p0, [c_ptr2]\n" - "uzp2 z7.s, z22.s, z23.s\n" - "fmax z25.s, p7/m, z25.s, z14.s\n" - "fmax z26.s, p7/m, z26.s, z14.s\n" - "st1w z5.s, p0, [c_ptr3]\n" - "fmax z27.s, p7/m, z27.s, z14.s\n" - "fmax z28.s, p7/m, z28.s, z14.s\n" - "fmin z25.s, p7/m, z25.s, z15.s\n" - "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n" - "fmin z26.s, p7/m, z26.s, z15.s\n" - "fmin z27.s, p7/m, z27.s, z15.s\n" - "fmin z28.s, p7/m, z28.s, z15.s\n" - "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n" - "uzp1 z8.s, z24.s, z25.s\n" - "uzp2 z9.s, z24.s, z25.s\n" - "uzp1 z10.s, z26.s, z27.s\n" - "uzp2 z11.s, z26.s, z27.s\n" - "st1w z8.s, p0, [c_ptr4]\n" - "fmax z29.s, p7/m, z29.s, z14.s\n" - "fmax z30.s, p7/m, z30.s, z14.s\n" - "fmax z31.s, p7/m, z31.s, z14.s\n" - "st1w z9.s, p0, [c_ptr5]\n" - "fmin z29.s, p7/m, z29.s, z15.s\n" - "fmin z30.s, p7/m, z30.s, z15.s\n" - "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n" - "fmin z31.s, p7/m, z31.s, z15.s\n" - "uzp1 z12.s, z28.s, z29.s\n" - "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n" - "uzp2 z13.s, z28.s, z29.s\n" - "uzp1 z14.s, z30.s, z31.s\n" - "uzp2 z15.s, z30.s, z31.s\n" - "st1w z12.s, p0, [c_ptr6]\n" - "st1w z13.s, p0, [c_ptr7]\n" - "st1w z14.s, p1, [c_ptr6, #1, MUL VL]\n" - "st1w z15.s, p1, [c_ptr7, #1, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : 
[a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc", "memory" - ); - break; - } - - } - } -} - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp similarity index 66% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp index c500f43fe0..0150ce8fd9 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,37 +10,43 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
similarity index 66%
rename from src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp
rename to src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
index c500f43fe0..0150ce8fd9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -10,37 +10,43 @@
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
  */
 #pragma once
-
 #ifdef __ARM_FEATURE_SVE
-#include <cstdint>
 #include "../std_transforms_sve.hpp"
+#define ARGLIST  \
+    unsigned int, const unsigned int *, \
+    IndirectInputArg<int8_t>, \
+    size_t, size_t, \
+    const int8_t *, \
+    IndirectOutputArg<int8_t>, \
+    const Requantize32 *, const int32_t *, unsigned int
+
 namespace arm_gemm
 {
 // Actual kernel implementations
-void sve_hybrid_s8s32_dot_4VLx4(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+void sve_hybrid_s8qa_dot_4x4VL( ARGLIST );
-class hybrid_s8s32_dot_4VLx4
+class cls_sve_hybrid_s8qa_dot_4x4VL
 {
 public:
     typedef int8_t operand_type;
-    typedef int32_t result_type;
+    typedef int8_t result_type;
-    typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+    typedef void (*kern_type)( ARGLIST );
     /* Kernel blocking parameters */
     static constexpr unsigned int out_height()
@@ -59,16 +65,6 @@ class hybrid_s8s32_dot_4VLx4
     }
     static constexpr bool supports_accumulate()
-    {
-        return true;
-    }
-
-    static constexpr bool supports_bias()
-    {
-        return false;
-    }
-
-    static constexpr bool supports_activation()
     {
         return false;
     }
@@ -76,14 +72,14 @@ class hybrid_s8s32_dot_4VLx4
     StdTransformsSVE transforms = {};
     // Default to the generic kernel
-    kern_type kernel=sve_hybrid_s8s32_dot_4VLx4;
+    kern_type kernel=sve_hybrid_s8qa_dot_4x4VL;
-    hybrid_s8s32_dot_4VLx4(const CPUInfo *)
+    cls_sve_hybrid_s8qa_dot_4x4VL(const CPUInfo *)
     {
-
     }
 };
 } // namespace arm_gemm
+#undef ARGLIST
 #endif // __ARM_FEATURE_SVE
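
Reviewer note: the new `generic.cpp` below dispatches on `M` to specialised blocks for one to four rows, then walks each input string with predicated `sdot` loops (`whilelt` builds the tail predicates; `mov z15.b, #0x1` supplies the all-ones vector so `sdot z11.s, z0.b, z15.b` accumulates per-row sums into z11-z14). Flag bits tested by the asm: bit 2 = indirect output, bit 3 = indirect input, bit 5 (set when `qp->c_offset > qp->minval`) enables the `sqadd` shift-correction path, and bit 31 (cleared with `bic` on entry, set with `orr` after the first column block) marks the row sums as already computed. The epilogue requantizes with `sqrdmulh` + `srshl`, adds the output offset, clamps with `smin`/`smax`, and narrows with `uzp1`. A scalar sketch of that output stage, assuming the per-layer quantization path and ignoring saturation in the intermediate multiply:

    #include <algorithm>
    #include <cstdint>

    // Scalar model of the epilogue.  By this point the accumulator already
    // holds dot(A_row, B_col) + col_bias[col] - b_offset * row_sum (the
    // saddv/mul fixup).  Names follow the Requantize32 fields referenced by
    // the asm operands; right_shift is a non-negative count here, whereas
    // the asm encodes it as a negative srshl operand.
    static int8_t requantize_out(int32_t acc, int32_t per_layer_mul, int right_shift,
                                 int32_t c_offset, int32_t minval, int32_t maxval)
    {
        // sqrdmulh: doubling, rounding, high-half multiply ~ round(acc * mul / 2^31)
        int64_t v = ((int64_t)acc * per_layer_mul + (1LL << 30)) >> 31;
        // srshl by a negative amount: rounding arithmetic shift right
        if (right_shift > 0) {
            v = (v + (1LL << (right_shift - 1))) >> right_shift;
        }
        v += c_offset;                                     // output zero point
        v = std::min<int64_t>(std::max<int64_t>(v, minval), maxval);
        return (int8_t)v;                                  // uzp1 narrows 32 -> 8 bit
    }
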
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
new file mode 100644
index 0000000000..2b1448bd65
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
@@ -0,0 +1,1602 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sve_hybrid_s8qa_dot_4x4VL (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+    size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+    const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+    struct KernelArgs {
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const int8_t *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    if (qp->c_offset > qp->minval) {
+        flags |= 0x20;
+    }
+    __asm__ __volatile__(
+      "ptrue p2.b\n"
+      "1:"  // Row loop
+      "cmp %x[M], #0x4\n"
+      "bge 46f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 31f\n"
+      "beq 16f\n"
+      "mov z11.s, #0x0\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov z12.s, #0x0\n"
+      "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x10, %x[col_bias]\n"
+      "mov z13.s, #0x0\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "bic %x[flags], %x[flags], #0x80000000\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.b, #0x1\n"
+      "tbz %x[flags], #2, 2f\n"
+      "ldr x9, [%x[output_ptr], #0x0]\n"
+      "add x9, x9, x19\n"
+      "b 3f\n"
+      "2:"  // Height 1: setup direct output
+      "mov x9, %x[output_ptr]\n"
+      "3:"  // Height 1: Column loop
+      "mov z16.s, #0x0\n"
+      "mov x19, #0x0\n"
+      "mov z17.s, #0x0\n"
+      "whilelt p1.b, x19, x12\n"
+      "mov z18.s, #0x0\n"
+      "mov z19.s, #0x0\n"
+      "4:"  // Height 1: setup done
+      "mov x28, #0x0\n"
+      "5:"  // Height 1: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w27, [x20, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 6f\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "cbnz x28, 7f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "b 7f\n"
+      "6:"  // Height 1: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "7:"  // Height 1: input setup done
+      "cmp x27, #0x10\n"
+      "ble 10f\n"
+      "8:"  // Height 1: Multiply loop: Main loop head
+      "ld1b { z4.b }, p2/Z, [x11]\n"
+      "whilelt p0.b, XZR, x27\n"
+      "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "ld1rqb { z0.b }, p0/Z, [x26]\n"
+      "sdot z16.s, z4.b, z0.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "add x26, x26, #0x10\n"
+      "sdot z17.s, z5.b, z0.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+      "sdot z18.s, z6.b, z0.b[0]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+      "sdot z19.s, z7.b, z0.b[0]\n"
+      "sdot z16.s, z8.b, z0.b[1]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+      "addvl x11, x11, #16\n"
+      "sdot z17.s, z9.b,
z0.b[1]\n" + "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n" + "sdot z18.s, z10.b, z0.b[1]\n" + "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n" + "sdot z19.s, z4.b, z0.b[1]\n" + "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n" + "sdot z16.s, z5.b, z0.b[2]\n" + "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n" + "sdot z17.s, z6.b, z0.b[2]\n" + "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n" + "sdot z18.s, z7.b, z0.b[2]\n" + "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n" + "sdot z19.s, z8.b, z0.b[2]\n" + "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n" + "sdot z16.s, z9.b, z0.b[3]\n" + "sdot z17.s, z10.b, z0.b[3]\n" + "sdot z18.s, z4.b, z0.b[3]\n" + "sdot z19.s, z5.b, z0.b[3]\n" + "tbnz %x[flags], #31, 9f\n" + "sdot z11.s, z0.b, z15.b\n" + "9:" // Height 1: Multiply loop: unique 1: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x10\n" + "bgt 8b\n" + "10:" // Height 1: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "sdot z16.s, z6.b, z0.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x26, x26, #0x10\n" + "sdot z17.s, z7.b, z0.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "sdot z18.s, z8.b, z0.b[0]\n" + "sdot z19.s, z9.b, z0.b[0]\n" + "ble 11f\n" + "ld1b { z10.b }, p2/Z, [x11]\n" + "sdot z16.s, z10.b, z0.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "sdot z17.s, z4.b, z0.b[1]\n" + "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n" + "sdot z18.s, z5.b, z0.b[1]\n" + "addvl x11, x11, #4\n" + "sdot z19.s, z6.b, z0.b[1]\n" + "ble 11f\n" + "ld1b { z7.b }, p2/Z, [x11]\n" + "sdot z16.s, z7.b, z0.b[2]\n" + "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "sdot z17.s, z8.b, z0.b[2]\n" + "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n" + "sdot z18.s, z9.b, z0.b[2]\n" + "addvl x11, x11, #4\n" + "sdot z19.s, z10.b, z0.b[2]\n" + "ble 11f\n" + "ld1b { z4.b }, p2/Z, [x11]\n" + "sdot z16.s, z4.b, z0.b[3]\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "sdot z17.s, z5.b, z0.b[3]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "sdot z18.s, z6.b, z0.b[3]\n" + "sdot z19.s, z7.b, z0.b[3]\n" + "11:" // Height 1: Multiply loop: multiply skip + "tbnz %x[flags], #31, 12f\n" + "sdot z11.s, z0.b, z15.b\n" + "12:" // Height 1: Multiply loop: unique 2: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "add x28, x28, #0x1\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x28, x19\n" + "bne 5b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "tbnz %x[flags], #31, 13f\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1rw { z1.s }, p2/Z, [x19]\n" + "neg z1.s, p2/M, z1.s\n" + "mov x19, #0x4\n" + "whilelt p0.s, XZR, x19\n" + "saddv d11, p0, z11.s\n" + "mov z11.s, z11.s[0]\n" + "mul z11.s, p2/M, z11.s, z1.s\n" + "13:" // Height 1: skip row sum fixup + "add z16.s, z16.s, z11.s\n" + "ld1w { z0.s }, p2/Z, [x10]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z17.s, z17.s, z11.s\n" + "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add z18.s, z18.s, z11.s\n" + "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "add z19.s, z19.s, z11.s\n" + "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "addvl 
x10, x10, #4\n" + "add z16.s, z16.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "add z17.s, z17.s, z1.s\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + "tbz %x[flags], #5, 14f\n" + "and z4.d, z16.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z17.d, z0.d\n" + "and z6.d, z18.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z19.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z17.s, z17.s, z5.s\n" + "sqadd z18.s, z18.s, z6.s\n" + "sqadd z19.s, z19.s, z7.s\n" + "14:" // Height 1: no shift correction + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + "uzp1 z17.h, z18.h, z19.h\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x9]\n" + "addvl x9, x9, #1\n" + "15:" // Height 1: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x12, x12, x19\n" + "bgt 3b\n" + "b 62f\n" + "16:" // Height 2 + "mov z11.s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov x10, %x[col_bias]\n" + "mov z12.s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov z13.s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov z14.s, #0x0\n" + "mov z15.b, #0x1\n" + "tbz %x[flags], #2, 17f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "ldr x25, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "add x25, x25, x19\n" + "b 18f\n" + "17:" // Height 2: setup direct output + "mov x9, %x[output_ptr]\n" + "add x25, x9, x19\n" + "18:" // Height 2: Column loop + "mov z16.s, #0x0\n" + "mov x19, #0x0\n" + "mov z17.s, #0x0\n" + "whilelt p1.b, x19, x12\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "19:" // Height 2: setup done + "mov x28, #0x0\n" + "20:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 21f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x28, 22f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "b 22f\n" + "21:" // Height 2: setup direct input + "mov x26, %x[input_ptr]\n" + "add x24, x26, x19\n" + "22:" // Height 2: input setup done + "cmp x27, #0x10\n" + "ble 25f\n" + "23:" // Height 2: Multiply loop: 
Main loop head + "ld1b { z4.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "sdot z16.s, z4.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z17.s, z5.b, z0.b[0]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x24, x24, #0x10\n" + "sdot z20.s, z4.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "sdot z21.s, z5.b, z1.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n" + "sdot z18.s, z6.b, z0.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n" + "sdot z22.s, z6.b, z1.b[0]\n" + "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n" + "sdot z19.s, z7.b, z0.b[0]\n" + "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n" + "addvl x11, x11, #16\n" + "sdot z23.s, z7.b, z1.b[0]\n" + "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n" + "sdot z16.s, z8.b, z0.b[1]\n" + "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n" + "sdot z20.s, z8.b, z1.b[1]\n" + "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n" + "sdot z17.s, z9.b, z0.b[1]\n" + "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n" + "sdot z21.s, z9.b, z1.b[1]\n" + "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n" + "sdot z18.s, z10.b, z0.b[1]\n" + "sdot z22.s, z10.b, z1.b[1]\n" + "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n" + "sdot z19.s, z4.b, z0.b[1]\n" + "sdot z23.s, z4.b, z1.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n" + "sdot z16.s, z5.b, z0.b[2]\n" + "sdot z20.s, z5.b, z1.b[2]\n" + "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n" + "sdot z17.s, z6.b, z0.b[2]\n" + "sdot z21.s, z6.b, z1.b[2]\n" + "sdot z18.s, z7.b, z0.b[2]\n" + "sdot z22.s, z7.b, z1.b[2]\n" + "sdot z19.s, z8.b, z0.b[2]\n" + "sdot z23.s, z8.b, z1.b[2]\n" + "sdot z16.s, z9.b, z0.b[3]\n" + "sdot z20.s, z9.b, z1.b[3]\n" + "sdot z17.s, z10.b, z0.b[3]\n" + "sdot z21.s, z10.b, z1.b[3]\n" + "sdot z18.s, z4.b, z0.b[3]\n" + "sdot z22.s, z4.b, z1.b[3]\n" + "sdot z19.s, z5.b, z0.b[3]\n" + "sdot z23.s, z5.b, z1.b[3]\n" + "tbnz %x[flags], #31, 24f\n" + "sdot z11.s, z0.b, z15.b\n" + "sdot z12.s, z1.b, z15.b\n" + "24:" // Height 2: Multiply loop: unique 3: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x27, #0x10\n" + "bgt 23b\n" + "25:" // Height 2: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "sdot z16.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z17.s, z7.b, z0.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x24, x24, #0x10\n" + "sdot z20.s, z6.b, z1.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "sdot z21.s, z7.b, z1.b[0]\n" + "sdot z18.s, z8.b, z0.b[0]\n" + "sdot z22.s, z8.b, z1.b[0]\n" + "sdot z19.s, z9.b, z0.b[0]\n" + "sdot z23.s, z9.b, z1.b[0]\n" + "ble 26f\n" + "ld1b { z10.b }, p2/Z, [x11]\n" + "sdot z16.s, z10.b, z0.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "sdot z20.s, z10.b, z1.b[1]\n" + "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n" + "sdot z17.s, z4.b, z0.b[1]\n" + "addvl x11, x11, #4\n" + "sdot z21.s, z4.b, z1.b[1]\n" + "sdot z18.s, z5.b, z0.b[1]\n" + "sdot z22.s, z5.b, z1.b[1]\n" + "sdot z19.s, z6.b, z0.b[1]\n" + "sdot z23.s, z6.b, z1.b[1]\n" + "ble 26f\n" + "ld1b { z7.b }, p2/Z, [x11]\n" + "sdot z16.s, z7.b, z0.b[2]\n" + "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "sdot 
z20.s, z7.b, z1.b[2]\n" + "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n" + "sdot z17.s, z8.b, z0.b[2]\n" + "addvl x11, x11, #4\n" + "sdot z21.s, z8.b, z1.b[2]\n" + "sdot z18.s, z9.b, z0.b[2]\n" + "sdot z22.s, z9.b, z1.b[2]\n" + "sdot z19.s, z10.b, z0.b[2]\n" + "sdot z23.s, z10.b, z1.b[2]\n" + "ble 26f\n" + "ld1b { z4.b }, p2/Z, [x11]\n" + "sdot z16.s, z4.b, z0.b[3]\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "sdot z20.s, z4.b, z1.b[3]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "sdot z17.s, z5.b, z0.b[3]\n" + "addvl x11, x11, #4\n" + "sdot z21.s, z5.b, z1.b[3]\n" + "sdot z18.s, z6.b, z0.b[3]\n" + "sdot z22.s, z6.b, z1.b[3]\n" + "sdot z19.s, z7.b, z0.b[3]\n" + "sdot z23.s, z7.b, z1.b[3]\n" + "26:" // Height 2: Multiply loop: multiply skip + "tbnz %x[flags], #31, 27f\n" + "sdot z11.s, z0.b, z15.b\n" + "sdot z12.s, z1.b, z15.b\n" + "27:" // Height 2: Multiply loop: unique 4: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "add x28, x28, #0x1\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x28, x19\n" + "bne 20b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbnz %x[flags], #31, 28f\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1rw { z2.s }, p2/Z, [x19]\n" + "neg z2.s, p2/M, z2.s\n" + "mov x20, #0x4\n" + "mov x19, #0x4\n" + "whilelt p0.s, XZR, x20\n" + "saddv d11, p0, z11.s\n" + "whilelt p0.s, XZR, x19\n" + "saddv d12, p0, z12.s\n" + "mov z11.s, z11.s[0]\n" + "mov z12.s, z12.s[0]\n" + "mul z11.s, p2/M, z11.s, z2.s\n" + "mul z12.s, p2/M, z12.s, z2.s\n" + "28:" // Height 2: skip row sum fixup + "add z16.s, z16.s, z11.s\n" + "ld1w { z0.s }, p2/Z, [x10]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z17.s, z17.s, z11.s\n" + "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add z18.s, z18.s, z11.s\n" + "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "add z19.s, z19.s, z11.s\n" + "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" + "add z20.s, z20.s, z12.s\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "add z21.s, z21.s, z12.s\n" + "add z22.s, z22.s, z12.s\n" + "add z23.s, z23.s, z12.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z20.s, z20.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" + "tbz %x[flags], #5, 29f\n" + "and z4.d, z16.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z17.d, z0.d\n" + "and z6.d, z18.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z19.d, z0.d\n" + "and z8.d, z20.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "and z9.d, z21.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "and z10.d, z22.d, z0.d\n" + "asr z8.s, z8.s, #0x1f\n" + "and z4.d, z23.d, z0.d\n" + "asr z9.s, z9.s, #0x1f\n" + "sqadd z17.s, z17.s, z5.s\n" + "asr z10.s, z10.s, #0x1f\n" + "sqadd z18.s, z18.s, z6.s\n" + "asr z4.s, z4.s, 
#0x1f\n" + "sqadd z19.s, z19.s, z7.s\n" + "sqadd z20.s, z20.s, z8.s\n" + "sqadd z21.s, z21.s, z9.s\n" + "sqadd z22.s, z22.s, z10.s\n" + "sqadd z23.s, z23.s, z4.s\n" + "29:" // Height 2: no shift correction + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x9]\n" + "add z21.s, z21.s, z4.s\n" + "addvl x9, x9, #1\n" + ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" + ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "add z22.s, z22.s, z4.s\n" + "add z23.s, z23.s, z4.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "uzp1 z20.h, z20.h, z21.h\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "uzp1 z21.h, z22.h, z23.h\n" + "uzp1 z20.b, z20.b, z21.b\n" + "st1b { z20.b }, p1, [x25]\n" + "addvl x25, x25, #1\n" + "30:" // Height 2: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x12, x12, x19\n" + "bgt 18b\n" + "b 62f\n" + "31:" // Height 3 + "mov z11.s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov x10, %x[col_bias]\n" + "mov z12.s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov z13.s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov z14.s, #0x0\n" + "mov z15.b, #0x1\n" + "tbz %x[flags], #2, 32f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "ldr x25, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "ldr x23, [%x[output_ptr], #0x10]\n" + "add x25, x25, x19\n" + "add x23, x23, x19\n" + "b 33f\n" + "32:" // Height 3: setup direct output + "mov x9, %x[output_ptr]\n" + "add x25, x9, x19\n" + "add x23, x25, x19\n" + "33:" // Height 3: Column loop + "mov z16.s, #0x0\n" + "mov x19, #0x0\n" + "mov z17.s, #0x0\n" + "whilelt p1.b, x19, x12\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "34:" // Height 3: setup done + "mov x28, #0x0\n" + "35:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 36f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x22, 
[x20, #0x10]\n" + "cbnz x28, 37f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "b 37f\n" + "36:" // Height 3: setup direct input + "mov x26, %x[input_ptr]\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "37:" // Height 3: input setup done + "cmp x27, #0x10\n" + "ble 40f\n" + "38:" // Height 3: Multiply loop: Main loop head + "ld1b { z4.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "sdot z16.s, z4.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z17.s, z5.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "sdot z20.s, z4.b, z1.b[0]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x22, x22, #0x10\n" + "sdot z24.s, z4.b, z2.b[0]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "sdot z21.s, z5.b, z1.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n" + "sdot z25.s, z5.b, z2.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n" + "sdot z18.s, z6.b, z0.b[0]\n" + "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n" + "sdot z22.s, z6.b, z1.b[0]\n" + "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n" + "addvl x11, x11, #16\n" + "sdot z26.s, z6.b, z2.b[0]\n" + "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n" + "sdot z19.s, z7.b, z0.b[0]\n" + "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n" + "sdot z23.s, z7.b, z1.b[0]\n" + "sdot z27.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n" + "sdot z16.s, z8.b, z0.b[1]\n" + "sdot z20.s, z8.b, z1.b[1]\n" + "sdot z24.s, z8.b, z2.b[1]\n" + "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n" + "sdot z17.s, z9.b, z0.b[1]\n" + "sdot z21.s, z9.b, z1.b[1]\n" + "sdot z25.s, z9.b, z2.b[1]\n" + "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n" + "sdot z18.s, z10.b, z0.b[1]\n" + "sdot z22.s, z10.b, z1.b[1]\n" + "sdot z26.s, z10.b, z2.b[1]\n" + "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n" + "sdot z19.s, z4.b, z0.b[1]\n" + "sdot z23.s, z4.b, z1.b[1]\n" + "sdot z27.s, z4.b, z2.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n" + "sdot z16.s, z5.b, z0.b[2]\n" + "sdot z20.s, z5.b, z1.b[2]\n" + "sdot z24.s, z5.b, z2.b[2]\n" + "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n" + "sdot z17.s, z6.b, z0.b[2]\n" + "sdot z21.s, z6.b, z1.b[2]\n" + "sdot z25.s, z6.b, z2.b[2]\n" + "sdot z18.s, z7.b, z0.b[2]\n" + "sdot z22.s, z7.b, z1.b[2]\n" + "sdot z26.s, z7.b, z2.b[2]\n" + "sdot z19.s, z8.b, z0.b[2]\n" + "sdot z23.s, z8.b, z1.b[2]\n" + "sdot z27.s, z8.b, z2.b[2]\n" + "sdot z16.s, z9.b, z0.b[3]\n" + "sdot z20.s, z9.b, z1.b[3]\n" + "sdot z24.s, z9.b, z2.b[3]\n" + "sdot z17.s, z10.b, z0.b[3]\n" + "sdot z21.s, z10.b, z1.b[3]\n" + "sdot z25.s, z10.b, z2.b[3]\n" + "sdot z18.s, z4.b, z0.b[3]\n" + "sdot z22.s, z4.b, z1.b[3]\n" + "sdot z26.s, z4.b, z2.b[3]\n" + "sdot z19.s, z5.b, z0.b[3]\n" + "sdot z23.s, z5.b, z1.b[3]\n" + "sdot z27.s, z5.b, z2.b[3]\n" + "tbnz %x[flags], #31, 39f\n" + "sdot z11.s, z0.b, z15.b\n" + "sdot z12.s, z1.b, z15.b\n" + "sdot z13.s, z2.b, z15.b\n" + "39:" // Height 3: Multiply loop: unique 5: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x27, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "bgt 38b\n" + "40:" // Height 3: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "sdot z16.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, 
[x24]\n" + "add x26, x26, #0x10\n" + "sdot z17.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "sdot z20.s, z6.b, z1.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x22, x22, #0x10\n" + "sdot z24.s, z6.b, z2.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "sdot z21.s, z7.b, z1.b[0]\n" + "sdot z25.s, z7.b, z2.b[0]\n" + "sdot z18.s, z8.b, z0.b[0]\n" + "sdot z22.s, z8.b, z1.b[0]\n" + "sdot z26.s, z8.b, z2.b[0]\n" + "sdot z19.s, z9.b, z0.b[0]\n" + "sdot z23.s, z9.b, z1.b[0]\n" + "sdot z27.s, z9.b, z2.b[0]\n" + "ble 41f\n" + "ld1b { z10.b }, p2/Z, [x11]\n" + "sdot z16.s, z10.b, z0.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "sdot z20.s, z10.b, z1.b[1]\n" + "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n" + "sdot z24.s, z10.b, z2.b[1]\n" + "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "sdot z17.s, z4.b, z0.b[1]\n" + "sdot z21.s, z4.b, z1.b[1]\n" + "sdot z25.s, z4.b, z2.b[1]\n" + "sdot z18.s, z5.b, z0.b[1]\n" + "sdot z22.s, z5.b, z1.b[1]\n" + "sdot z26.s, z5.b, z2.b[1]\n" + "sdot z19.s, z6.b, z0.b[1]\n" + "sdot z23.s, z6.b, z1.b[1]\n" + "sdot z27.s, z6.b, z2.b[1]\n" + "ble 41f\n" + "ld1b { z7.b }, p2/Z, [x11]\n" + "sdot z16.s, z7.b, z0.b[2]\n" + "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "sdot z20.s, z7.b, z1.b[2]\n" + "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n" + "sdot z24.s, z7.b, z2.b[2]\n" + "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "sdot z17.s, z8.b, z0.b[2]\n" + "sdot z21.s, z8.b, z1.b[2]\n" + "sdot z25.s, z8.b, z2.b[2]\n" + "sdot z18.s, z9.b, z0.b[2]\n" + "sdot z22.s, z9.b, z1.b[2]\n" + "sdot z26.s, z9.b, z2.b[2]\n" + "sdot z19.s, z10.b, z0.b[2]\n" + "sdot z23.s, z10.b, z1.b[2]\n" + "sdot z27.s, z10.b, z2.b[2]\n" + "ble 41f\n" + "ld1b { z4.b }, p2/Z, [x11]\n" + "sdot z16.s, z4.b, z0.b[3]\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "sdot z20.s, z4.b, z1.b[3]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "sdot z24.s, z4.b, z2.b[3]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "sdot z17.s, z5.b, z0.b[3]\n" + "sdot z21.s, z5.b, z1.b[3]\n" + "sdot z25.s, z5.b, z2.b[3]\n" + "sdot z18.s, z6.b, z0.b[3]\n" + "sdot z22.s, z6.b, z1.b[3]\n" + "sdot z26.s, z6.b, z2.b[3]\n" + "sdot z19.s, z7.b, z0.b[3]\n" + "sdot z23.s, z7.b, z1.b[3]\n" + "sdot z27.s, z7.b, z2.b[3]\n" + "41:" // Height 3: Multiply loop: multiply skip + "tbnz %x[flags], #31, 42f\n" + "sdot z11.s, z0.b, z15.b\n" + "sdot z12.s, z1.b, z15.b\n" + "sdot z13.s, z2.b, z15.b\n" + "42:" // Height 3: Multiply loop: unique 6: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "add x28, x28, #0x1\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x28, x19\n" + "bne 35b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbnz %x[flags], #31, 43f\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1rw { z3.s }, p2/Z, [x19]\n" + "neg z3.s, p2/M, z3.s\n" + "mov x20, #0x4\n" + "mov x19, #0x4\n" + "whilelt p0.s, XZR, x20\n" + "saddv d11, p0, z11.s\n" + "whilelt p0.s, XZR, x19\n" + "saddv d12, p0, z12.s\n" + "mov x19, #0x4\n" + "mov z11.s, z11.s[0]\n" + "whilelt p0.s, XZR, x19\n" + "mov z12.s, z12.s[0]\n" + "saddv d13, p0, z13.s\n" + "mul z11.s, p2/M, z11.s, z3.s\n" + "mul z12.s, p2/M, z12.s, z3.s\n" + "mov z13.s, z13.s[0]\n" + "mul z13.s, p2/M, z13.s, z3.s\n" + "43:" // Height 3: skip row sum fixup + 
"add z16.s, z16.s, z11.s\n" + "ld1w { z0.s }, p2/Z, [x10]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z17.s, z17.s, z11.s\n" + "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add z18.s, z18.s, z11.s\n" + "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "add z19.s, z19.s, z11.s\n" + "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" + "add z20.s, z20.s, z12.s\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "add z21.s, z21.s, z12.s\n" + "add z22.s, z22.s, z12.s\n" + "add z23.s, z23.s, z12.s\n" + "add z24.s, z24.s, z13.s\n" + "add z25.s, z25.s, z13.s\n" + "add z26.s, z26.s, z13.s\n" + "add z27.s, z27.s, z13.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z20.s, z20.s, z0.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n" + ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n" + ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n" + "tbz %x[flags], #5, 44f\n" + "and z4.d, z16.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z17.d, z0.d\n" + "and z6.d, z18.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z19.d, z0.d\n" + "and z8.d, z20.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "and z9.d, z21.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "and z10.d, z22.d, z0.d\n" + "asr z8.s, z8.s, #0x1f\n" + "and z4.d, z23.d, z0.d\n" + "asr z9.s, z9.s, #0x1f\n" + "sqadd z17.s, z17.s, z5.s\n" + "asr z10.s, z10.s, #0x1f\n" + "sqadd z18.s, z18.s, z6.s\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z24.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z19.s, z19.s, z7.s\n" + "sqadd z20.s, z20.s, z8.s\n" + "sqadd z21.s, z21.s, z9.s\n" + "sqadd z22.s, z22.s, z10.s\n" + "sqadd z23.s, z23.s, z4.s\n" + "and z6.d, z25.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z24.s, z24.s, z5.s\n" + "and z7.d, z26.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "and z8.d, z27.d, z0.d\n" + "sqadd z25.s, z25.s, z6.s\n" + "asr z8.s, z8.s, #0x1f\n" + "sqadd z26.s, z26.s, z7.s\n" + "sqadd z27.s, z27.s, z8.s\n" + "44:" // Height 3: no shift correction + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, 
z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x9]\n" + "add z21.s, z21.s, z4.s\n" + "addvl x9, x9, #1\n" + ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" + ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" + ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" + "add z22.s, z22.s, z4.s\n" + "add z23.s, z23.s, z4.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "uzp1 z20.h, z20.h, z21.h\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" + "uzp1 z21.h, z22.h, z23.h\n" + ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" + "uzp1 z20.b, z20.b, z21.b\n" + "st1b { z20.b }, p1, [x25]\n" + "add z26.s, z26.s, z4.s\n" + "addvl x25, x25, #1\n" + "add z27.s, z27.s, z4.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "uzp1 z24.h, z24.h, z25.h\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" + "uzp1 z25.h, z26.h, z27.h\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z24.b }, p1, [x23]\n" + "addvl x23, x23, #1\n" + "45:" // Height 3: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x12, x12, x19\n" + "bgt 33b\n" + "b 62f\n" + "46:" // Height 4 + "mov z11.s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov x10, %x[col_bias]\n" + "mov z12.s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov z13.s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov z14.s, #0x0\n" + "mov z15.b, #0x1\n" + "tbz %x[flags], #2, 47f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "ldr x25, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "ldr x23, [%x[output_ptr], #0x10]\n" + "ldr x21, [%x[output_ptr], #0x18]\n" + "add x25, x25, x19\n" + "add %x[output_ptr], %x[output_ptr], #0x20\n" + "add x23, x23, x19\n" + "add x21, x21, x19\n" + "b 48f\n" + "47:" // Height 4: setup direct output + "mov x9, %x[output_ptr]\n" + "add x25, x9, x19\n" + "add x23, x25, x19\n" + "add x21, x23, x19\n" + "add %x[output_ptr], x21, x19\n" + "48:" // Height 4: Column loop + "mov z16.s, #0x0\n" + "mov x19, #0x0\n" + "mov z17.s, #0x0\n" + "whilelt p1.b, x19, x12\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z28.s, #0x0\n" + "mov z29.s, #0x0\n" + "mov z30.s, #0x0\n" + "mov z31.s, #0x0\n" + "49:" // Height 4: setup done + "mov x28, #0x0\n" + "50:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 51f\n" 
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "ldr x20, [x20, #0x18]\n" + "cbnz x28, 52f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 52f\n" + "51:" // Height 4: setup direct input + "mov x26, %x[input_ptr]\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "add x20, x22, x19\n" + "52:" // Height 4: input setup done + "cmp x27, #0x10\n" + "ble 55f\n" + "53:" // Height 4: Multiply loop: Main loop head + "ld1b { z4.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "sdot z16.s, z4.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z17.s, z5.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "sdot z20.s, z4.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "sdot z24.s, z4.b, z2.b[0]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "sdot z21.s, z5.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "sdot z25.s, z5.b, z2.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n" + "sdot z28.s, z4.b, z3.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n" + "sdot z29.s, z5.b, z3.b[0]\n" + "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n" + "sdot z18.s, z6.b, z0.b[0]\n" + "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n" + "addvl x11, x11, #16\n" + "sdot z22.s, z6.b, z1.b[0]\n" + "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n" + "sdot z26.s, z6.b, z2.b[0]\n" + "sdot z30.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n" + "sdot z19.s, z7.b, z0.b[0]\n" + "sdot z23.s, z7.b, z1.b[0]\n" + "sdot z27.s, z7.b, z2.b[0]\n" + "sdot z31.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n" + "sdot z16.s, z8.b, z0.b[1]\n" + "sdot z20.s, z8.b, z1.b[1]\n" + "sdot z24.s, z8.b, z2.b[1]\n" + "sdot z28.s, z8.b, z3.b[1]\n" + "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n" + "sdot z17.s, z9.b, z0.b[1]\n" + "sdot z21.s, z9.b, z1.b[1]\n" + "sdot z25.s, z9.b, z2.b[1]\n" + "sdot z29.s, z9.b, z3.b[1]\n" + "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n" + "sdot z18.s, z10.b, z0.b[1]\n" + "sdot z22.s, z10.b, z1.b[1]\n" + "sdot z26.s, z10.b, z2.b[1]\n" + "sdot z30.s, z10.b, z3.b[1]\n" + "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n" + "sdot z19.s, z4.b, z0.b[1]\n" + "sdot z23.s, z4.b, z1.b[1]\n" + "sdot z27.s, z4.b, z2.b[1]\n" + "sdot z31.s, z4.b, z3.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n" + "sdot z16.s, z5.b, z0.b[2]\n" + "sdot z20.s, z5.b, z1.b[2]\n" + "sdot z24.s, z5.b, z2.b[2]\n" + "sdot z28.s, z5.b, z3.b[2]\n" + "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n" + "sdot z17.s, z6.b, z0.b[2]\n" + "sdot z21.s, z6.b, z1.b[2]\n" + "sdot z25.s, z6.b, z2.b[2]\n" + "sdot z29.s, z6.b, z3.b[2]\n" + "sdot z18.s, z7.b, z0.b[2]\n" + "sdot z22.s, z7.b, z1.b[2]\n" + "sdot z26.s, z7.b, z2.b[2]\n" + "sdot z30.s, z7.b, z3.b[2]\n" + "sdot z19.s, z8.b, z0.b[2]\n" + "sdot z23.s, z8.b, z1.b[2]\n" + "sdot z27.s, z8.b, z2.b[2]\n" + "sdot z31.s, z8.b, z3.b[2]\n" + "sdot z16.s, z9.b, z0.b[3]\n" + "sdot z20.s, z9.b, z1.b[3]\n" + "sdot z24.s, z9.b, z2.b[3]\n" + "sdot z28.s, z9.b, z3.b[3]\n" + "sdot z17.s, z10.b, z0.b[3]\n" + "sdot z21.s, z10.b, z1.b[3]\n" + "sdot z25.s, z10.b, z2.b[3]\n" + "sdot z29.s, z10.b, z3.b[3]\n" + "sdot z18.s, z4.b, z0.b[3]\n" + "sdot z22.s, z4.b, z1.b[3]\n" + "sdot z26.s, z4.b, z2.b[3]\n" 
+ "sdot z30.s, z4.b, z3.b[3]\n" + "sdot z19.s, z5.b, z0.b[3]\n" + "sdot z23.s, z5.b, z1.b[3]\n" + "sdot z27.s, z5.b, z2.b[3]\n" + "sdot z31.s, z5.b, z3.b[3]\n" + "tbnz %x[flags], #31, 54f\n" + "sdot z11.s, z0.b, z15.b\n" + "sdot z12.s, z1.b, z15.b\n" + "sdot z13.s, z2.b, z15.b\n" + "sdot z14.s, z3.b, z15.b\n" + "54:" // Height 4: Multiply loop: unique 7: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x27, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "bgt 53b\n" + "55:" // Height 4: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "sdot z16.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z17.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "sdot z20.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "sdot z24.s, z6.b, z2.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "sdot z21.s, z7.b, z1.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "sdot z28.s, z6.b, z3.b[0]\n" + "sdot z25.s, z7.b, z2.b[0]\n" + "sdot z29.s, z7.b, z3.b[0]\n" + "sdot z18.s, z8.b, z0.b[0]\n" + "sdot z22.s, z8.b, z1.b[0]\n" + "sdot z26.s, z8.b, z2.b[0]\n" + "sdot z30.s, z8.b, z3.b[0]\n" + "sdot z19.s, z9.b, z0.b[0]\n" + "sdot z23.s, z9.b, z1.b[0]\n" + "sdot z27.s, z9.b, z2.b[0]\n" + "sdot z31.s, z9.b, z3.b[0]\n" + "ble 56f\n" + "ld1b { z10.b }, p2/Z, [x11]\n" + "sdot z16.s, z10.b, z0.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "sdot z20.s, z10.b, z1.b[1]\n" + "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n" + "sdot z24.s, z10.b, z2.b[1]\n" + "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "sdot z28.s, z10.b, z3.b[1]\n" + "sdot z17.s, z4.b, z0.b[1]\n" + "sdot z21.s, z4.b, z1.b[1]\n" + "sdot z25.s, z4.b, z2.b[1]\n" + "sdot z29.s, z4.b, z3.b[1]\n" + "sdot z18.s, z5.b, z0.b[1]\n" + "sdot z22.s, z5.b, z1.b[1]\n" + "sdot z26.s, z5.b, z2.b[1]\n" + "sdot z30.s, z5.b, z3.b[1]\n" + "sdot z19.s, z6.b, z0.b[1]\n" + "sdot z23.s, z6.b, z1.b[1]\n" + "sdot z27.s, z6.b, z2.b[1]\n" + "sdot z31.s, z6.b, z3.b[1]\n" + "ble 56f\n" + "ld1b { z7.b }, p2/Z, [x11]\n" + "sdot z16.s, z7.b, z0.b[2]\n" + "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "sdot z20.s, z7.b, z1.b[2]\n" + "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n" + "sdot z24.s, z7.b, z2.b[2]\n" + "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "sdot z28.s, z7.b, z3.b[2]\n" + "sdot z17.s, z8.b, z0.b[2]\n" + "sdot z21.s, z8.b, z1.b[2]\n" + "sdot z25.s, z8.b, z2.b[2]\n" + "sdot z29.s, z8.b, z3.b[2]\n" + "sdot z18.s, z9.b, z0.b[2]\n" + "sdot z22.s, z9.b, z1.b[2]\n" + "sdot z26.s, z9.b, z2.b[2]\n" + "sdot z30.s, z9.b, z3.b[2]\n" + "sdot z19.s, z10.b, z0.b[2]\n" + "sdot z23.s, z10.b, z1.b[2]\n" + "sdot z27.s, z10.b, z2.b[2]\n" + "sdot z31.s, z10.b, z3.b[2]\n" + "ble 56f\n" + "ld1b { z4.b }, p2/Z, [x11]\n" + "sdot z16.s, z4.b, z0.b[3]\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "sdot z20.s, z4.b, z1.b[3]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "sdot z24.s, z4.b, z2.b[3]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "sdot z28.s, z4.b, z3.b[3]\n" + "sdot z17.s, z5.b, z0.b[3]\n" + "sdot z21.s, z5.b, z1.b[3]\n" + "sdot z25.s, z5.b, 
z2.b[3]\n" + "sdot z29.s, z5.b, z3.b[3]\n" + "sdot z18.s, z6.b, z0.b[3]\n" + "sdot z22.s, z6.b, z1.b[3]\n" + "sdot z26.s, z6.b, z2.b[3]\n" + "sdot z30.s, z6.b, z3.b[3]\n" + "sdot z19.s, z7.b, z0.b[3]\n" + "sdot z23.s, z7.b, z1.b[3]\n" + "sdot z27.s, z7.b, z2.b[3]\n" + "sdot z31.s, z7.b, z3.b[3]\n" + "56:" // Height 4: Multiply loop: multiply skip + "tbnz %x[flags], #31, 57f\n" + "sdot z11.s, z0.b, z15.b\n" + "sdot z12.s, z1.b, z15.b\n" + "sdot z13.s, z2.b, z15.b\n" + "sdot z14.s, z3.b, z15.b\n" + "57:" // Height 4: Multiply loop: unique 8: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "add x28, x28, #0x1\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x28, x19\n" + "bne 50b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbnz %x[flags], #31, 58f\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "neg z4.s, p2/M, z4.s\n" + "mov x20, #0x4\n" + "mov x19, #0x4\n" + "whilelt p0.s, XZR, x20\n" + "saddv d11, p0, z11.s\n" + "whilelt p0.s, XZR, x19\n" + "saddv d12, p0, z12.s\n" + "mov x19, #0x4\n" + "mov z11.s, z11.s[0]\n" + "whilelt p0.s, XZR, x19\n" + "mov x19, #0x4\n" + "mov z12.s, z12.s[0]\n" + "saddv d13, p0, z13.s\n" + "whilelt p0.s, XZR, x19\n" + "mul z11.s, p2/M, z11.s, z4.s\n" + "saddv d14, p0, z14.s\n" + "mul z12.s, p2/M, z12.s, z4.s\n" + "mov z13.s, z13.s[0]\n" + "mul z13.s, p2/M, z13.s, z4.s\n" + "mov z14.s, z14.s[0]\n" + "mul z14.s, p2/M, z14.s, z4.s\n" + "58:" // Height 4: skip row sum fixup + "add z16.s, z16.s, z11.s\n" + "ld1w { z0.s }, p2/Z, [x10]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z17.s, z17.s, z11.s\n" + "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add z18.s, z18.s, z11.s\n" + "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "add z19.s, z19.s, z11.s\n" + "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" + "add z20.s, z20.s, z12.s\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "add z21.s, z21.s, z12.s\n" + "add z22.s, z22.s, z12.s\n" + "add z23.s, z23.s, z12.s\n" + "add z24.s, z24.s, z13.s\n" + "add z25.s, z25.s, z13.s\n" + "add z26.s, z26.s, z13.s\n" + "add z27.s, z27.s, z13.s\n" + "add z28.s, z28.s, z14.s\n" + "add z29.s, z29.s, z14.s\n" + "add z30.s, z30.s, z14.s\n" + "add z31.s, z31.s, z14.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z20.s, z20.s, z0.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + "add z28.s, z28.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "add z29.s, z29.s, z1.s\n" + "add z30.s, z30.s, z2.s\n" + "add z31.s, z31.s, z3.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n" + ".inst 
0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n" + ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n" + ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n" + ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n" + ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n" + ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n" + "tbz %x[flags], #5, 59f\n" + "and z4.d, z16.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z17.d, z0.d\n" + "and z6.d, z18.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z19.d, z0.d\n" + "and z8.d, z20.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "and z9.d, z21.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "and z10.d, z22.d, z0.d\n" + "asr z8.s, z8.s, #0x1f\n" + "and z4.d, z23.d, z0.d\n" + "asr z9.s, z9.s, #0x1f\n" + "sqadd z17.s, z17.s, z5.s\n" + "asr z10.s, z10.s, #0x1f\n" + "sqadd z18.s, z18.s, z6.s\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z24.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z19.s, z19.s, z7.s\n" + "sqadd z20.s, z20.s, z8.s\n" + "sqadd z21.s, z21.s, z9.s\n" + "sqadd z22.s, z22.s, z10.s\n" + "sqadd z23.s, z23.s, z4.s\n" + "and z6.d, z25.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z24.s, z24.s, z5.s\n" + "and z7.d, z26.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "and z8.d, z27.d, z0.d\n" + "and z9.d, z28.d, z0.d\n" + "asr z8.s, z8.s, #0x1f\n" + "sqadd z25.s, z25.s, z6.s\n" + "and z10.d, z29.d, z0.d\n" + "asr z9.s, z9.s, #0x1f\n" + "and z4.d, z30.d, z0.d\n" + "asr z10.s, z10.s, #0x1f\n" + "sqadd z26.s, z26.s, z7.s\n" + "and z5.d, z31.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z27.s, z27.s, z8.s\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z28.s, z28.s, z9.s\n" + "sqadd z29.s, z29.s, z10.s\n" + "sqadd z30.s, z30.s, z4.s\n" + "sqadd z31.s, z31.s, z5.s\n" + "59:" // Height 4: no shift correction + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x9]\n" + "add z21.s, z21.s, z4.s\n" + "addvl x9, x9, #1\n" + ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" + ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" + ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" + "add z22.s, z22.s, z4.s\n" + "add z23.s, z23.s, z4.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "uzp1 z20.h, 
z20.h, z21.h\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" + "uzp1 z21.h, z22.h, z23.h\n" + ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" + "uzp1 z20.b, z20.b, z21.b\n" + "st1b { z20.b }, p1, [x25]\n" + "add z26.s, z26.s, z4.s\n" + "addvl x25, x25, #1\n" + "add z27.s, z27.s, z4.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" + ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "uzp1 z24.h, z24.h, z25.h\n" + "add z28.s, z28.s, z4.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" + "smin z28.s, p2/M, z28.s, z6.s\n" + ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n" + ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n" + "uzp1 z25.h, z26.h, z27.h\n" + "smax z28.s, p2/M, z28.s, z5.s\n" + "add z29.s, z29.s, z4.s\n" + "add z30.s, z30.s, z4.s\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z24.b }, p1, [x23]\n" + "smin z29.s, p2/M, z29.s, z6.s\n" + "addvl x23, x23, #1\n" + "smin z30.s, p2/M, z30.s, z6.s\n" + ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" + "smax z29.s, p2/M, z29.s, z5.s\n" + "add z31.s, z31.s, z4.s\n" + "smax z30.s, p2/M, z30.s, z5.s\n" + "uzp1 z28.h, z28.h, z29.h\n" + "smin z31.s, p2/M, z31.s, z6.s\n" + "smax z31.s, p2/M, z31.s, z5.s\n" + "uzp1 z29.h, z30.h, z31.h\n" + "uzp1 z28.b, z28.b, z29.b\n" + "st1b { z28.b }, p1, [x21]\n" + "addvl x21, x21, #1\n" + "60:" // Height 4: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x12, x12, x19\n" + "bgt 48b\n" + "subs %x[M], %x[M], #0x4\n" + "beq 62f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 61f\n" + "add x20, x20, #0x4\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "61:" // Update direct input + "mov x19, #0x4\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "62:" // Exit + + : [M] "+r" (M), [flags] "+r" (flags), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) + : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp new file 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp new file mode 100644 index 0000000000..d8562898aa --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#pragma once +#ifdef __ARM_FEATURE_SVE + +#include "../std_transforms_sve.hpp" + +#define ARGLIST \ + unsigned int, const unsigned int *, \ + IndirectInputArg<int8_t>, \ + size_t, size_t, \ + const int8_t *, \ + IndirectOutputArg<int8_t>, \ + const Requantize32 *, const int32_t *, unsigned int + +namespace arm_gemm +{ + +// Actual kernel implementations +void sve_hybrid_s8qs_dot_6x4VL( ARGLIST ); + +class cls_sve_hybrid_s8qs_dot_6x4VL +{ +public: + typedef int8_t operand_type; + typedef int8_t result_type; + + typedef void (*kern_type)( ARGLIST ); + + /* Kernel blocking parameters */ + static constexpr unsigned int out_height() + { + return 6; + } + + static unsigned int out_width() + { + return get_vector_length<int32_t>() * 4; + } + + static constexpr unsigned int k_unroll() + { + return 4; + } + + static constexpr bool supports_accumulate() + { + return false; + } + + StdTransformsSVE<operand_type, result_type, 6, 4, 4> transforms = {}; + + // Default to the generic kernel + kern_type kernel=sve_hybrid_s8qs_dot_6x4VL; + + cls_sve_hybrid_s8qs_dot_6x4VL(const CPUInfo *) + { + } +}; + +} // namespace arm_gemm + +#undef ARGLIST +#endif // __ARM_FEATURE_SVE
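The blocking parameters advertised here mean each kernel call produces a 6-row output tile whose width is four SVE vectors of int32 accumulators, with K consumed in multiples of 4 (one SDOT step). Since SVE leaves the vector width to the implementation, the concrete tile width is only known at run time. A small illustration; vec_len_s32 is a hypothetical stand-in for arm_gemm's get_vector_length<int32_t>():

#include <cstdio>

// Hypothetical stand-in: number of 32-bit lanes in one SVE vector of the
// given width in bits (the real get_vector_length<int32_t>() queries the CPU).
static unsigned int vec_len_s32(unsigned int sve_bits) { return sve_bits / 32; }

int main()
{
    const unsigned int widths[] = { 128, 256, 512 };
    for (unsigned int bits : widths) {
        // out_height() == 6, out_width() == get_vector_length<int32_t>() * 4
        printf("SVE %3u-bit: output tile 6 x %2u, k_unroll 4\n",
               bits, vec_len_s32(bits) * 4);
    }
    return 0;
}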
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp new file mode 100644 index 0000000000..4a4af6356c --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp @@ -0,0 +1,2770 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef __ARM_FEATURE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> +#include <cstdint> + +namespace arm_gemm { + +void sve_hybrid_s8qs_dot_6x4VL ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg, + size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg, + const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base +) +{ + struct KernelArgs { + const int32_t *multiplier_ptr = {}; + const int32_t *shift_ptr = {}; + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const int8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + if (qp->per_channel_requant) { + flags |= 0x10; + ka.multiplier_ptr=qp->per_channel_muls + col_base; + ka.shift_ptr=qp->per_channel_right_shifts + col_base; + } + if (qp->c_offset > qp->minval) { + flags |= 0x20; + } + __asm__ __volatile__( + "ptrue p2.b\n" + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 71f\n" + "cmp %x[M], #0x4\n" + "bgt 57f\n" + "beq 43f\n" + "cmp %x[M], #0x2\n" + "bgt 29f\n" + "beq 15f\n" + "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "mov x16, %x[col_bias]\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x13, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "mov z8.s, #0x0\n" + "mov x19, #0x0\n" + "mov z9.s, #0x0\n" + "whilelt p1.b, x19, x15\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "4:" // Height 1: setup done + "mov x12, #0x0\n" + "5:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 6f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "cbnz x12, 7f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "b 7f\n" + "6:" // Height 1: setup direct input + "mov x10, %x[input_ptr]\n" + "7:" // Height 1: input 
setup done + "cmp x11, #0x10\n" + "ble 9f\n" + "8:" // Height 1: Multiply loop: Main loop head + "ld1b { z6.b }, p2/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "cmp x11, #0x10\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "bgt 8b\n" + "9:" // Height 1: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "ble 10f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "addvl x14, x14, #4\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "ble 10f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "addvl x14, x14, #4\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "ble 10f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "10:" // Height 1: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 5b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "ld1w { z0.s }, p2/Z, [x16]\n" + "add z8.s, z8.s, z0.s\n" + "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n" + "add z9.s, z9.s, z1.s\n" + "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n" + "addvl x16, x16, #4\n" + "add 
z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "tbz %x[flags], #4, 11f\n" + "ld1w { z0.s }, p2/Z, [x17]\n" + "ld1w { z4.s }, p2/Z, [x8]\n" + "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n" + "addvl x17, x17, #4\n" + "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n" + "addvl x8, x8, #4\n" + "b 12f\n" + "11:" // Height 1: per layer parameters + "add x19, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x19]\n" + "mov z1.d, z0.d\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "mov z2.d, z0.d\n" + "mov z3.d, z0.d\n" + "mov z5.d, z4.d\n" + "mov z6.d, z4.d\n" + "mov z7.d, z4.d\n" + "12:" // Height 1: parameters loaded + ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" + ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" + "tbz %x[flags], #5, 13f\n" + "and z4.d, z8.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z9.d, z1.d\n" + "and z6.d, z10.d, z2.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z11.d, z3.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z8.s, z8.s, z4.s\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z9.s, z9.s, z5.s\n" + "sqadd z10.s, z10.s, z6.s\n" + "sqadd z11.s, z11.s, z7.s\n" + "13:" // Height 1: no shift correction + ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + "add z8.s, z8.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" + "uzp1 z8.h, z8.h, z9.h\n" + "uzp1 z9.h, z10.h, z11.h\n" + "uzp1 z8.b, z8.b, z9.b\n" + "st1b { z8.b }, p1, [x13]\n" + "addvl x13, x13, #1\n" + "14:" // Height 1: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 3b\n" + "b 86f\n" + "15:" // Height 2 + "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x16, %x[col_bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 16f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "b 17f\n" + "16:" // Height 2: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19\n" + "17:" // Height 2: Column loop + "mov z8.s, #0x0\n" + "mov x19, #0x0\n" + "mov z9.s, #0x0\n" + "whilelt p1.b, x19, x15\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "18:" // Height 2: setup done + "mov x12, #0x0\n" + "19:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], 
%[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 20f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x12, 21f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "b 21f\n" + "20:" // Height 2: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "21:" // Height 2: input setup done + "cmp x11, #0x10\n" + "ble 23f\n" + "22:" // Height 2: Multiply loop: Main loop head + "ld1b { z6.b }, p2/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "cmp x11, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "bgt 22b\n" + "23:" // Height 2: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "ble 24f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, 
z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "ble 24f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "ble 24f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "24:" // Height 2: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 19b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "ld1w { z0.s }, p2/Z, [x16]\n" + "add z8.s, z8.s, z0.s\n" + "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n" + "add z12.s, z12.s, z0.s\n" + "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n" + "add z9.s, z9.s, z1.s\n" + "addvl x16, x16, #4\n" + "add z13.s, z13.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "add z14.s, z14.s, z2.s\n" + "add z15.s, z15.s, z3.s\n" + "tbz %x[flags], #4, 25f\n" + "ld1w { z0.s }, p2/Z, [x17]\n" + "ld1w { z4.s }, p2/Z, [x8]\n" + "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n" + "addvl x17, x17, #4\n" + "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n" + "addvl x8, x8, #4\n" + "b 26f\n" + "25:" // Height 2: per layer parameters + "add x19, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x19]\n" + "mov z1.d, z0.d\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "mov z2.d, z0.d\n" + "mov z3.d, z0.d\n" + "mov z5.d, z4.d\n" + "mov z6.d, z4.d\n" + "mov z7.d, z4.d\n" + "26:" // Height 2: parameters loaded + ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" + ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" + ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n" + ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n" + ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n" + ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n" + "tbz %x[flags], #5, 27f\n" + "and z4.d, z8.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z9.d, z1.d\n" + "and z6.d, z10.d, z2.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z11.d, z3.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z8.s, z8.s, z4.s\n" + "asr z7.s, z7.s, #0x1f\n" + "and z4.d, z12.d, z0.d\n" + "sqadd z9.s, z9.s, z5.s\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z10.s, z10.s, z6.s\n" + "and z5.d, z13.d, z1.d\n" + "asr 
z5.s, z5.s, #0x1f\n" + "sqadd z11.s, z11.s, z7.s\n" + "and z6.d, z14.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z12.s, z12.s, z4.s\n" + "and z7.d, z15.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z13.s, z13.s, z5.s\n" + "sqadd z14.s, z14.s, z6.s\n" + "sqadd z15.s, z15.s, z7.s\n" + "27:" // Height 2: no shift correction + ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n" + "add z8.s, z8.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "add z12.s, z12.s, z4.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "uzp1 z8.h, z8.h, z9.h\n" + ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n" + "uzp1 z9.h, z10.h, z11.h\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "uzp1 z8.b, z8.b, z9.b\n" + "st1b { z8.b }, p1, [x13]\n" + "add z13.s, z13.s, z4.s\n" + "addvl x13, x13, #1\n" + ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n" + ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n" + "smin z13.s, p2/M, z13.s, z6.s\n" + "add z14.s, z14.s, z4.s\n" + "add z15.s, z15.s, z4.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "smin z15.s, p2/M, z15.s, z6.s\n" + "uzp1 z12.h, z12.h, z13.h\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" + "uzp1 z13.h, z14.h, z15.h\n" + "uzp1 z12.b, z12.b, z13.b\n" + "st1b { z12.b }, p1, [x9]\n" + "addvl x9, x9, #1\n" + "28:" // Height 2: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 17b\n" + "b 86f\n" + "29:" // Height 3 + "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x16, %x[col_bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 30f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19\n" + "add x27, x27, x19\n" + "b 31f\n" + "30:" // Height 3: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19\n" + "add x27, x9, x19\n" + "31:" // Height 3: Column loop + "mov z8.s, #0x0\n" + "mov x19, #0x0\n" + "mov z9.s, #0x0\n" + "whilelt p1.b, x19, x15\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "32:" // Height 3: setup done + "mov x12, #0x0\n" + "33:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 34f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, 
[x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x12, 35f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "b 35f\n" + "34:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "35:" // Height 3: input setup done + "cmp x11, #0x10\n" + "ble 37f\n" + "36:" // Height 3: Multiply loop: Main loop head + "ld1b { z6.b }, p2/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "cmp x11, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "bgt 36b\n" + "37:" // Height 3: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, 
z1.b[0]\n" + "add x26, x26, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "ble 38f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "ble 38f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "ble 38f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "38:" // Height 3: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 33b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "ld1w { z0.s }, p2/Z, [x16]\n" + "add z8.s, z8.s, z0.s\n" + "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n" + "add z12.s, z12.s, z0.s\n" + "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n" + "add z16.s, z16.s, z0.s\n" + "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n" + "addvl x16, x16, #4\n" + "add z9.s, z9.s, z1.s\n" + "add z13.s, z13.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "add z14.s, z14.s, z2.s\n" + "add z15.s, z15.s, z3.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "tbz %x[flags], #4, 39f\n" + "ld1w { z0.s }, p2/Z, [x17]\n" + "ld1w { z4.s }, p2/Z, [x8]\n" + "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n" + "addvl x17, x17, #4\n" + "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n" + 
"addvl x8, x8, #4\n" + "b 40f\n" + "39:" // Height 3: per layer parameters + "add x19, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x19]\n" + "mov z1.d, z0.d\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "mov z2.d, z0.d\n" + "mov z3.d, z0.d\n" + "mov z5.d, z4.d\n" + "mov z6.d, z4.d\n" + "mov z7.d, z4.d\n" + "40:" // Height 3: parameters loaded + ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" + ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" + ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n" + ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n" + ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n" + ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n" + ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n" + ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n" + "tbz %x[flags], #5, 41f\n" + "and z4.d, z8.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z9.d, z1.d\n" + "and z6.d, z10.d, z2.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z11.d, z3.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z8.s, z8.s, z4.s\n" + "asr z7.s, z7.s, #0x1f\n" + "and z4.d, z12.d, z0.d\n" + "sqadd z9.s, z9.s, z5.s\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z10.s, z10.s, z6.s\n" + "and z5.d, z13.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z11.s, z11.s, z7.s\n" + "and z6.d, z14.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z12.s, z12.s, z4.s\n" + "and z7.d, z15.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z13.s, z13.s, z5.s\n" + "and z4.d, z16.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z14.s, z14.s, z6.s\n" + "and z5.d, z17.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z15.s, z15.s, z7.s\n" + "and z6.d, z18.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "and z7.d, z19.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z17.s, z17.s, z5.s\n" + "sqadd z18.s, z18.s, z6.s\n" + "sqadd z19.s, z19.s, z7.s\n" + "41:" // Height 3: no shift correction + ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n" + "add z8.s, z8.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "add z12.s, z12.s, z4.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "uzp1 z8.h, z8.h, z9.h\n" + ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n" + "uzp1 z9.h, z10.h, z11.h\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "uzp1 z8.b, z8.b, z9.b\n" + "st1b { z8.b }, p1, [x13]\n" + "add z13.s, z13.s, z4.s\n" + "addvl x13, x13, #1\n" + ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n" + ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n" + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "smin z13.s, p2/M, z13.s, 
z6.s\n" + ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" + "add z14.s, z14.s, z4.s\n" + "add z15.s, z15.s, z4.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "smin z15.s, p2/M, z15.s, z6.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "uzp1 z12.h, z12.h, z13.h\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" + "uzp1 z13.h, z14.h, z15.h\n" + ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" + "uzp1 z12.b, z12.b, z13.b\n" + "st1b { z12.b }, p1, [x9]\n" + "add z18.s, z18.s, z4.s\n" + "addvl x9, x9, #1\n" + "add z19.s, z19.s, z4.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x27]\n" + "addvl x27, x27, #1\n" + "42:" // Height 3: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 31b\n" + "b 86f\n" + "43:" // Height 4 + "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x16, %x[col_bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 44f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "add x27, x27, x19\n" + "add x25, x25, x19\n" + "b 45f\n" + "44:" // Height 4: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19\n" + "add x27, x9, x19\n" + "add x25, x27, x19\n" + "45:" // Height 4: Column loop + "mov z8.s, #0x0\n" + "mov x19, #0x0\n" + "mov z9.s, #0x0\n" + "whilelt p1.b, x19, x15\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "46:" // Height 4: setup done + "mov x12, #0x0\n" + "47:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 48f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x12, 49f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "b 49f\n" + "48:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "49:" // Height 4: input setup done + "cmp x11, #0x10\n" + "ble 51f\n" + "50:" // Height 4: Multiply loop: Main loop head + "ld1b { z6.b }, p2/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, 
[x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x10\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "bgt 50b\n" + "51:" // Height 4: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, 
z2.b[0]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "ble 52f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "ble 52f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "ble 52f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "52:" // Height 4: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 47b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "ld1w { z0.s }, p2/Z, [x16]\n" + "add z8.s, z8.s, z0.s\n" + "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n" + "add z12.s, z12.s, z0.s\n" + "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n" + "add z16.s, z16.s, z0.s\n" + "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n" + "addvl x16, x16, #4\n" + "add z9.s, z9.s, z1.s\n" + "add z13.s, z13.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, 
z11.s, z3.s\n" + "add z14.s, z14.s, z2.s\n" + "add z15.s, z15.s, z3.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z20.s, z20.s, z0.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + "tbz %x[flags], #4, 53f\n" + "ld1w { z0.s }, p2/Z, [x17]\n" + "ld1w { z4.s }, p2/Z, [x8]\n" + "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n" + "addvl x17, x17, #4\n" + "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n" + "addvl x8, x8, #4\n" + "b 54f\n" + "53:" // Height 4: per layer parameters + "add x19, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x19]\n" + "mov z1.d, z0.d\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "mov z2.d, z0.d\n" + "mov z3.d, z0.d\n" + "mov z5.d, z4.d\n" + "mov z6.d, z4.d\n" + "mov z7.d, z4.d\n" + "54:" // Height 4: parameters loaded + ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" + ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" + ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n" + ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n" + ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n" + ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n" + ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n" + ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n" + ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a576b5 // sqrdmulh z21.s, z21.s, z5.s\n" + ".inst 0x04a676d6 // sqrdmulh z22.s, z22.s, z6.s\n" + ".inst 0x04a776f7 // sqrdmulh z23.s, z23.s, z7.s\n" + "tbz %x[flags], #5, 55f\n" + "and z4.d, z8.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z9.d, z1.d\n" + "and z6.d, z10.d, z2.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z11.d, z3.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z8.s, z8.s, z4.s\n" + "asr z7.s, z7.s, #0x1f\n" + "and z4.d, z12.d, z0.d\n" + "sqadd z9.s, z9.s, z5.s\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z10.s, z10.s, z6.s\n" + "and z5.d, z13.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z11.s, z11.s, z7.s\n" + "and z6.d, z14.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z12.s, z12.s, z4.s\n" + "and z7.d, z15.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z13.s, z13.s, z5.s\n" + "and z4.d, z16.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z14.s, z14.s, z6.s\n" + "and z5.d, z17.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z15.s, z15.s, z7.s\n" + "and z6.d, z18.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "and z7.d, z19.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z17.s, z17.s, z5.s\n" + "and z4.d, z20.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z18.s, z18.s, z6.s\n" + "and z5.d, z21.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z19.s, z19.s, z7.s\n" + "and z6.d, z22.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z20.s, z20.s, z4.s\n" + "and z7.d, z23.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z21.s, z21.s, z5.s\n" + "sqadd z22.s, z22.s, z6.s\n" + "sqadd z23.s, z23.s, z7.s\n" + "55:" // Height 4: no shift correction + ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" 
+ "add x19, %x[qp], %[minval]\n" + ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n" + "add z8.s, z8.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "add z12.s, z12.s, z4.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "uzp1 z8.h, z8.h, z9.h\n" + ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n" + "uzp1 z9.h, z10.h, z11.h\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "uzp1 z8.b, z8.b, z9.b\n" + "st1b { z8.b }, p1, [x13]\n" + "add z13.s, z13.s, z4.s\n" + "addvl x13, x13, #1\n" + ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n" + ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n" + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "smin z13.s, p2/M, z13.s, z6.s\n" + ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" + "add z14.s, z14.s, z4.s\n" + "add z15.s, z15.s, z4.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "smin z15.s, p2/M, z15.s, z6.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "uzp1 z12.h, z12.h, z13.h\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" + "uzp1 z13.h, z14.h, z15.h\n" + ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" + "uzp1 z12.b, z12.b, z13.b\n" + "st1b { z12.b }, p1, [x9]\n" + "add z18.s, z18.s, z4.s\n" + "addvl x9, x9, #1\n" + "add z19.s, z19.s, z4.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + "add z20.s, z20.s, z4.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n" + ".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x27]\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "addvl x27, x27, #1\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + ".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "add z23.s, z23.s, z4.s\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "uzp1 z20.h, z20.h, z21.h\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "uzp1 z21.h, z22.h, z23.h\n" + "uzp1 z20.b, z20.b, z21.b\n" + "st1b { z20.b }, p1, [x25]\n" + "addvl x25, x25, #1\n" + "56:" // Height 4: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 45b\n" + "b 86f\n" + "57:" // Height 5 + "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x16, %x[col_bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], 
%[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 58f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19\n" + "add x25, x25, x19\n" + "add x23, x23, x19\n" + "b 59f\n" + "58:" // Height 5: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19\n" + "add x27, x9, x19\n" + "add x25, x27, x19\n" + "add x23, x25, x19\n" + "59:" // Height 5: Column loop + "mov z8.s, #0x0\n" + "mov x19, #0x0\n" + "mov z9.s, #0x0\n" + "whilelt p1.b, x19, x15\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "60:" // Height 5: setup done + "mov x12, #0x0\n" + "61:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 62f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x12, 63f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "b 63f\n" + "62:" // Height 5: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "63:" // Height 5: input setup done + "cmp x11, #0x10\n" + "ble 65f\n" + "64:" // Height 5: Multiply loop: Main loop head + "ld1b { z6.b }, p2/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x22, x22, #0x10\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x10\n" + "sdot z24.s, z6.b, z4.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sdot z25.s, z7.b, z4.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z26.s, z6.b, z4.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "sdot z27.s, z7.b, z4.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, 
z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "sdot z24.s, z6.b, z4.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "sdot z25.s, z7.b, z4.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z26.s, z6.b, z4.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "sdot z27.s, z7.b, z4.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "sdot z24.s, z6.b, z4.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "sdot z25.s, z7.b, z4.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z26.s, z6.b, z4.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "sdot z27.s, z7.b, z4.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "sdot z24.s, z6.b, z4.b[3]\n" + "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "sdot z25.s, z7.b, z4.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z26.s, z6.b, z4.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "sdot z27.s, z7.b, z4.b[3]\n" + "bgt 64b\n" + "65:" // Height 5: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "add x22, x22, #0x10\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "sdot z24.s, z6.b, z4.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "sdot z25.s, z7.b, z4.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z26.s, z6.b, z4.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot 
z23.s, z7.b, z3.b[0]\n" + "sdot z27.s, z7.b, z4.b[0]\n" + "ble 66f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "sdot z24.s, z6.b, z4.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "sdot z25.s, z7.b, z4.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z26.s, z6.b, z4.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "sdot z27.s, z7.b, z4.b[1]\n" + "ble 66f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "sdot z24.s, z6.b, z4.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "sdot z25.s, z7.b, z4.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z26.s, z6.b, z4.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "sdot z27.s, z7.b, z4.b[2]\n" + "ble 66f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "sdot z24.s, z6.b, z4.b[3]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "sdot z25.s, z7.b, z4.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z26.s, z6.b, z4.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "sdot z27.s, z7.b, z4.b[3]\n" + "66:" // Height 5: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 61b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "ld1w { z0.s }, p2/Z, [x16]\n" + "add z8.s, z8.s, z0.s\n" + "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n" + "add z12.s, z12.s, z0.s\n" + "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n" + "add z16.s, z16.s, z0.s\n" + "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n" + "addvl x16, x16, #4\n" + "add z9.s, z9.s, z1.s\n" + "add z13.s, z13.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "add 
z14.s, z14.s, z2.s\n" + "add z15.s, z15.s, z3.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z20.s, z20.s, z0.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + "tbz %x[flags], #4, 67f\n" + "ld1w { z0.s }, p2/Z, [x17]\n" + "ld1w { z4.s }, p2/Z, [x8]\n" + "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n" + "addvl x17, x17, #4\n" + "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n" + "addvl x8, x8, #4\n" + "b 68f\n" + "67:" // Height 5: per layer parameters + "add x19, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x19]\n" + "mov z1.d, z0.d\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "mov z2.d, z0.d\n" + "mov z3.d, z0.d\n" + "mov z5.d, z4.d\n" + "mov z6.d, z4.d\n" + "mov z7.d, z4.d\n" + "68:" // Height 5: parameters loaded + ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" + ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" + ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n" + ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n" + ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n" + ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n" + ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n" + ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n" + ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a576b5 // sqrdmulh z21.s, z21.s, z5.s\n" + ".inst 0x04a676d6 // sqrdmulh z22.s, z22.s, z6.s\n" + ".inst 0x04a776f7 // sqrdmulh z23.s, z23.s, z7.s\n" + ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a57739 // sqrdmulh z25.s, z25.s, z5.s\n" + ".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n" + ".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n" + "tbz %x[flags], #5, 69f\n" + "and z4.d, z8.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z9.d, z1.d\n" + "and z6.d, z10.d, z2.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z11.d, z3.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z8.s, z8.s, z4.s\n" + "asr z7.s, z7.s, #0x1f\n" + "and z4.d, z12.d, z0.d\n" + "sqadd z9.s, z9.s, z5.s\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z10.s, z10.s, z6.s\n" + "and z5.d, z13.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z11.s, z11.s, z7.s\n" + "and z6.d, z14.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z12.s, z12.s, z4.s\n" + "and z7.d, z15.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z13.s, z13.s, z5.s\n" + "and z4.d, z16.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z14.s, z14.s, z6.s\n" + "and z5.d, z17.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z15.s, z15.s, z7.s\n" + "and z6.d, z18.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "and z7.d, z19.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z17.s, z17.s, z5.s\n" + "and z4.d, z20.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z18.s, z18.s, z6.s\n" + "and z5.d, z21.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z19.s, z19.s, z7.s\n" + "and z6.d, z22.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z20.s, z20.s, z4.s\n" + "and z7.d, z23.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + 
"sqadd z21.s, z21.s, z5.s\n" + "and z4.d, z24.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z22.s, z22.s, z6.s\n" + "and z5.d, z25.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z23.s, z23.s, z7.s\n" + "and z6.d, z26.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z24.s, z24.s, z4.s\n" + "and z7.d, z27.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z25.s, z25.s, z5.s\n" + "sqadd z26.s, z26.s, z6.s\n" + "sqadd z27.s, z27.s, z7.s\n" + "69:" // Height 5: no shift correction + ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n" + "add z8.s, z8.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "add z12.s, z12.s, z4.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "uzp1 z8.h, z8.h, z9.h\n" + ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n" + "uzp1 z9.h, z10.h, z11.h\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "uzp1 z8.b, z8.b, z9.b\n" + "st1b { z8.b }, p1, [x13]\n" + "add z13.s, z13.s, z4.s\n" + "addvl x13, x13, #1\n" + ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n" + ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n" + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "smin z13.s, p2/M, z13.s, z6.s\n" + ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" + "add z14.s, z14.s, z4.s\n" + "add z15.s, z15.s, z4.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "smin z15.s, p2/M, z15.s, z6.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "uzp1 z12.h, z12.h, z13.h\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" + "uzp1 z13.h, z14.h, z15.h\n" + ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" + "uzp1 z12.b, z12.b, z13.b\n" + "st1b { z12.b }, p1, [x9]\n" + "add z18.s, z18.s, z4.s\n" + "addvl x9, x9, #1\n" + "add z19.s, z19.s, z4.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + "add z20.s, z20.s, z4.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n" + ".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x27]\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "addvl x27, x27, #1\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + ".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n" + ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" + ".inst 0x44828839 // srshl z25.s, 
p2/M, z25.s, z1.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "add z23.s, z23.s, z4.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "uzp1 z20.h, z20.h, z21.h\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + ".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" + "add z26.s, z26.s, z4.s\n" + "uzp1 z21.h, z22.h, z23.h\n" + ".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n" + "uzp1 z24.h, z24.h, z25.h\n" + "uzp1 z20.b, z20.b, z21.b\n" + "st1b { z20.b }, p1, [x25]\n" + "add z27.s, z27.s, z4.s\n" + "addvl x25, x25, #1\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" + "uzp1 z25.h, z26.h, z27.h\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z24.b }, p1, [x23]\n" + "addvl x23, x23, #1\n" + "70:" // Height 5: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 59b\n" + "b 86f\n" + "71:" // Height 6 + "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n" + "mov x16, %x[col_bias]\n" + "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 72f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19\n" + "ldr x21, [%x[output_ptr], #0x28]\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "add x25, x25, x19\n" + "add x23, x23, x19\n" + "add x21, x21, x19\n" + "b 73f\n" + "72:" // Height 6: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19\n" + "add x27, x9, x19\n" + "add x25, x27, x19\n" + "add x23, x25, x19\n" + "add x21, x23, x19\n" + "add %x[output_ptr], x21, x19\n" + "73:" // Height 6: Column loop + "mov z8.s, #0x0\n" + "mov x19, #0x0\n" + "mov z9.s, #0x0\n" + "whilelt p1.b, x19, x15\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z28.s, #0x0\n" + "mov z29.s, #0x0\n" + "mov z30.s, #0x0\n" + "mov z31.s, #0x0\n" + "74:" // Height 6: setup done + "mov x12, #0x0\n" + "75:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 76f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x12, 77f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 77f\n" + "76:" // Height 6: setup direct input + "mov x10, 
%x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "add x20, x22, x19\n" + "77:" // Height 6: input setup done + "cmp x11, #0x10\n" + "ble 79f\n" + "78:" // Height 6: Multiply loop: Main loop head + "ld1b { z6.b }, p2/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "ld1rqb { z5.b }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x20, x20, #0x10\n" + "sdot z24.s, z6.b, z4.b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x10\n" + "sdot z28.s, z6.b, z5.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sdot z25.s, z7.b, z4.b[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sdot z29.s, z7.b, z5.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z26.s, z6.b, z4.b[0]\n" + "sdot z30.s, z6.b, z5.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "sdot z27.s, z7.b, z4.b[0]\n" + "sdot z31.s, z7.b, z5.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "sdot z24.s, z6.b, z4.b[1]\n" + "sdot z28.s, z6.b, z5.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "sdot z25.s, z7.b, z4.b[1]\n" + "sdot z29.s, z7.b, z5.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z26.s, z6.b, z4.b[1]\n" + "sdot z30.s, z6.b, z5.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "sdot z27.s, z7.b, z4.b[1]\n" + "sdot z31.s, z7.b, z5.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "sdot z24.s, z6.b, z4.b[2]\n" + "sdot z28.s, z6.b, z5.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "sdot z25.s, z7.b, z4.b[2]\n" + "sdot z29.s, z7.b, z5.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z26.s, z6.b, z4.b[2]\n" + "sdot 
z30.s, z6.b, z5.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "sdot z27.s, z7.b, z4.b[2]\n" + "sdot z31.s, z7.b, z5.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "sdot z24.s, z6.b, z4.b[3]\n" + "sdot z28.s, z6.b, z5.b[3]\n" + "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "sdot z25.s, z7.b, z4.b[3]\n" + "sdot z29.s, z7.b, z5.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z26.s, z6.b, z4.b[3]\n" + "sdot z30.s, z6.b, z5.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "sdot z27.s, z7.b, z4.b[3]\n" + "sdot z31.s, z7.b, z5.b[3]\n" + "bgt 78b\n" + "79:" // Height 6: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "ld1rqb { z5.b }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "add x20, x20, #0x10\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "sdot z24.s, z6.b, z4.b[0]\n" + "sdot z28.s, z6.b, z5.b[0]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "sdot z25.s, z7.b, z4.b[0]\n" + "sdot z29.s, z7.b, z5.b[0]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z26.s, z6.b, z4.b[0]\n" + "sdot z30.s, z6.b, z5.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "sdot z27.s, z7.b, z4.b[0]\n" + "sdot z31.s, z7.b, z5.b[0]\n" + "ble 80f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "sdot z24.s, z6.b, z4.b[1]\n" + "sdot z28.s, z6.b, z5.b[1]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "sdot z25.s, z7.b, z4.b[1]\n" + "sdot z29.s, z7.b, z5.b[1]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z26.s, z6.b, z4.b[1]\n" + "sdot z30.s, z6.b, z5.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + 
"sdot z27.s, z7.b, z4.b[1]\n" + "sdot z31.s, z7.b, z5.b[1]\n" + "ble 80f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "sdot z24.s, z6.b, z4.b[2]\n" + "sdot z28.s, z6.b, z5.b[2]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "sdot z25.s, z7.b, z4.b[2]\n" + "sdot z29.s, z7.b, z5.b[2]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z26.s, z6.b, z4.b[2]\n" + "sdot z30.s, z6.b, z5.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "sdot z27.s, z7.b, z4.b[2]\n" + "sdot z31.s, z7.b, z5.b[2]\n" + "ble 80f\n" + "ld1b { z6.b }, p2/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "sdot z24.s, z6.b, z4.b[3]\n" + "sdot z28.s, z6.b, z5.b[3]\n" + "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "sdot z25.s, z7.b, z4.b[3]\n" + "sdot z29.s, z7.b, z5.b[3]\n" + "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z26.s, z6.b, z4.b[3]\n" + "sdot z30.s, z6.b, z5.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "sdot z27.s, z7.b, z4.b[3]\n" + "sdot z31.s, z7.b, z5.b[3]\n" + "80:" // Height 6: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 75b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "ld1w { z0.s }, p2/Z, [x16]\n" + "add z8.s, z8.s, z0.s\n" + "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n" + "add z12.s, z12.s, z0.s\n" + "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n" + "add z16.s, z16.s, z0.s\n" + "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n" + "addvl x16, x16, #4\n" + "add z9.s, z9.s, z1.s\n" + "add z13.s, z13.s, z1.s\n" + "add z10.s, z10.s, z2.s\n" + "add z11.s, z11.s, z3.s\n" + "add z14.s, z14.s, z2.s\n" + "add z15.s, z15.s, z3.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z20.s, z20.s, z0.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + "add z28.s, z28.s, z0.s\n" + "add z29.s, z29.s, z1.s\n" + "add z30.s, z30.s, z2.s\n" + "add z31.s, z31.s, z3.s\n" + "tbz %x[flags], #4, 81f\n" + "ld1w { z0.s }, p2/Z, 
[x17]\n" + "ld1w { z4.s }, p2/Z, [x8]\n" + "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n" + "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n" + "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n" + "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n" + "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n" + "addvl x17, x17, #4\n" + "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n" + "addvl x8, x8, #4\n" + "b 82f\n" + "81:" // Height 6: per layer parameters + "add x19, %x[qp], %[per_layer_right_shift]\n" + "ld1rw { z0.s }, p2/Z, [x19]\n" + "mov z1.d, z0.d\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "mov z2.d, z0.d\n" + "mov z3.d, z0.d\n" + "mov z5.d, z4.d\n" + "mov z6.d, z4.d\n" + "mov z7.d, z4.d\n" + "82:" // Height 6: parameters loaded + ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n" + ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n" + ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n" + ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n" + ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n" + ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n" + ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n" + ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n" + ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n" + ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n" + ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a576b5 // sqrdmulh z21.s, z21.s, z5.s\n" + ".inst 0x04a676d6 // sqrdmulh z22.s, z22.s, z6.s\n" + ".inst 0x04a776f7 // sqrdmulh z23.s, z23.s, z7.s\n" + ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a57739 // sqrdmulh z25.s, z25.s, z5.s\n" + ".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n" + ".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n" + ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n" + ".inst 0x04a577bd // sqrdmulh z29.s, z29.s, z5.s\n" + ".inst 0x04a677de // sqrdmulh z30.s, z30.s, z6.s\n" + ".inst 0x04a777ff // sqrdmulh z31.s, z31.s, z7.s\n" + "tbz %x[flags], #5, 83f\n" + "and z4.d, z8.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z9.d, z1.d\n" + "and z6.d, z10.d, z2.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z11.d, z3.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z8.s, z8.s, z4.s\n" + "asr z7.s, z7.s, #0x1f\n" + "and z4.d, z12.d, z0.d\n" + "sqadd z9.s, z9.s, z5.s\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z10.s, z10.s, z6.s\n" + "and z5.d, z13.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z11.s, z11.s, z7.s\n" + "and z6.d, z14.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z12.s, z12.s, z4.s\n" + "and z7.d, z15.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z13.s, z13.s, z5.s\n" + "and z4.d, z16.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z14.s, z14.s, z6.s\n" + "and z5.d, z17.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z15.s, z15.s, z7.s\n" + "and z6.d, z18.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "and z7.d, z19.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z17.s, z17.s, z5.s\n" + "and z4.d, z20.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z18.s, z18.s, z6.s\n" + "and z5.d, z21.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z19.s, z19.s, z7.s\n" + "and z6.d, z22.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z20.s, z20.s, z4.s\n" + "and z7.d, z23.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z21.s, z21.s, z5.s\n" + "and z4.d, z24.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z22.s, z22.s, z6.s\n" + "and z5.d, z25.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z23.s, z23.s, z7.s\n" + 
"and z6.d, z26.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z24.s, z24.s, z4.s\n" + "and z7.d, z27.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z25.s, z25.s, z5.s\n" + "and z4.d, z28.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z26.s, z26.s, z6.s\n" + "and z5.d, z29.d, z1.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z27.s, z27.s, z7.s\n" + "and z6.d, z30.d, z2.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z28.s, z28.s, z4.s\n" + "and z7.d, z31.d, z3.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z29.s, z29.s, z5.s\n" + "sqadd z30.s, z30.s, z6.s\n" + "sqadd z31.s, z31.s, z7.s\n" + "83:" // Height 6: no shift correction + ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n" + "add z8.s, z8.s, z4.s\n" + "add z9.s, z9.s, z4.s\n" + "add z10.s, z10.s, z4.s\n" + "add z11.s, z11.s, z4.s\n" + "add z12.s, z12.s, z4.s\n" + "smin z8.s, p2/M, z8.s, z6.s\n" + "smin z9.s, p2/M, z9.s, z6.s\n" + "smin z10.s, p2/M, z10.s, z6.s\n" + "smin z11.s, p2/M, z11.s, z6.s\n" + "smax z8.s, p2/M, z8.s, z5.s\n" + "smax z9.s, p2/M, z9.s, z5.s\n" + "smax z10.s, p2/M, z10.s, z5.s\n" + "smax z11.s, p2/M, z11.s, z5.s\n" + "smin z12.s, p2/M, z12.s, z6.s\n" + "uzp1 z8.h, z8.h, z9.h\n" + ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n" + "uzp1 z9.h, z10.h, z11.h\n" + "smax z12.s, p2/M, z12.s, z5.s\n" + "uzp1 z8.b, z8.b, z9.b\n" + "st1b { z8.b }, p1, [x13]\n" + "add z13.s, z13.s, z4.s\n" + "addvl x13, x13, #1\n" + ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n" + ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n" + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "smin z13.s, p2/M, z13.s, z6.s\n" + ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n" + "add z14.s, z14.s, z4.s\n" + "add z15.s, z15.s, z4.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "smax z13.s, p2/M, z13.s, z5.s\n" + "smin z14.s, p2/M, z14.s, z6.s\n" + "smin z15.s, p2/M, z15.s, z6.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "uzp1 z12.h, z12.h, z13.h\n" + "smax z14.s, p2/M, z14.s, z5.s\n" + "smax z15.s, p2/M, z15.s, z5.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n" + "uzp1 z13.h, z14.h, z15.h\n" + ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n" + "uzp1 z12.b, z12.b, z13.b\n" + "st1b { z12.b }, p1, [x9]\n" + "add z18.s, z18.s, z4.s\n" + "addvl x9, x9, #1\n" + "add z19.s, z19.s, z4.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + "add z20.s, z20.s, z4.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n" + ".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "add z21.s, z21.s, z4.s\n" + "add z22.s, z22.s, z4.s\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x27]\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "addvl x27, x27, #1\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + 
".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n" + ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" + ".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "add z23.s, z23.s, z4.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "uzp1 z20.h, z20.h, z21.h\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + ".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" + "add z26.s, z26.s, z4.s\n" + "uzp1 z21.h, z22.h, z23.h\n" + ".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n" + "uzp1 z24.h, z24.h, z25.h\n" + "uzp1 z20.b, z20.b, z21.b\n" + "st1b { z20.b }, p1, [x25]\n" + "add z27.s, z27.s, z4.s\n" + "addvl x25, x25, #1\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n" + ".inst 0x4482883d // srshl z29.s, p2/M, z29.s, z1.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + ".inst 0x4482885e // srshl z30.s, p2/M, z30.s, z2.s\n" + "add z28.s, z28.s, z4.s\n" + "add z29.s, z29.s, z4.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "add z30.s, z30.s, z4.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" + "smin z28.s, p2/M, z28.s, z6.s\n" + "smin z29.s, p2/M, z29.s, z6.s\n" + "smin z30.s, p2/M, z30.s, z6.s\n" + "uzp1 z25.h, z26.h, z27.h\n" + "smax z28.s, p2/M, z28.s, z5.s\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z24.b }, p1, [x23]\n" + "smax z29.s, p2/M, z29.s, z5.s\n" + "addvl x23, x23, #1\n" + "smax z30.s, p2/M, z30.s, z5.s\n" + ".inst 0x4482887f // srshl z31.s, p2/M, z31.s, z3.s\n" + "uzp1 z28.h, z28.h, z29.h\n" + "add z31.s, z31.s, z4.s\n" + "smin z31.s, p2/M, z31.s, z6.s\n" + "smax z31.s, p2/M, z31.s, z5.s\n" + "uzp1 z29.h, z30.h, z31.h\n" + "uzp1 z28.b, z28.b, z29.b\n" + "st1b { z28.b }, p1, [x21]\n" + "addvl x21, x21, #1\n" + "84:" // Height 6: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 73b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 86f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 85f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "85:" // Update direct input + "mov x19, #0x6\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "86:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) + : "cc", "memory", "p0", "p1", "p2", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", 
"x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp deleted file mode 100644 index b30b8845a6..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp +++ /dev/null @@ -1,2137 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef __ARM_FEATURE_SVE - -#include <algorithm> - -#include "arm_gemm.hpp" -#include <cstdint> -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void sve_hybrid_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool accumulate) { - const int K_stride = ((K + 3) / 4) * 4; - const long loops_count = ((K + 16) / 32) - 1; - K -= loops_count * 32; - const long regs_count = (K / 16) - 1; - K -= (regs_count + 1) * 16; - const long leftovers = K; - const long blocks_count = (K + 3) / 4; - - int rows_to_compute; - - for (int y=0; y<M; y+=rows_to_compute) { - const int8_t * const a_ptr0_base = A + (y * lda); - const unsigned long ldab = lda * sizeof(int8_t); - - int32_t *c_ptr0 = C + (y * ldc); - - rows_to_compute = M-y; - if (rows_to_compute > 4) { - if (rows_to_compute % 4) { - rows_to_compute = 4 - 1; - } else { - rows_to_compute = 4; - } - } - - for (int x0=0; x0<N; x0+=(4 * get_vector_length<int32_t>())) { - const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<int32_t>())); - long loops = loops_count; - long regs = regs_count; - long temp = 0; - long blocks = blocks_count; - const int8_t *a_ptr0 = a_ptr0_base; - const int8_t *b_ptr0 = B + (K_stride * x0); - const unsigned long ldcb = ldc * sizeof(int32_t); - - switch(rows_to_compute) { - case 1: - __asm __volatile ( - "whilelt p6.b, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.b\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z16.s, #0\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "mov z17.s, #0\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "mov z18.s, #0\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z19.s, #0\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "subs %[loops], %[loops], #0x1\n" - "sdot z19.s, z11.b, 
z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[0]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "sdot z17.s, z9.b, z4.b[0]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[0]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[1]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" - "sdot z17.s, z13.b, z4.b[1]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z18.s, z14.b, z4.b[1]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z19.s, z15.b, z4.b[1]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[2]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z9.b, z4.b[2]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[2]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[2]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[3]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "sdot z17.s, z13.b, z4.b[3]\n" - "sdot z18.s, z14.b, z4.b[3]\n" - "sdot z19.s, z15.b, z4.b[3]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[0]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "sdot z17.s, z9.b, z4.b[0]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[0]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[1]\n" - "ld1b 
z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z17.s, z13.b, z4.b[1]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z14.b, z4.b[1]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z15.b, z4.b[1]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[2]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z9.b, z4.b[2]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[2]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[2]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" - "sdot z17.s, z13.b, z4.b[3]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "sdot z18.s, z14.b, z4.b[3]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "sdot z19.s, z15.b, z4.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "b 5f\n" - "4:\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL 
VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z17.s, z9.b, z4.b[0]\n" - "sdot z18.s, z10.b, z4.b[0]\n" - "sdot z19.s, z11.b, z4.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z17.s, z13.b, z4.b[1]\n" - "sdot z18.s, z14.b, z4.b[1]\n" - "sdot z19.s, z15.b, z4.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[2]\n" - "sdot z17.s, z9.b, z4.b[2]\n" - "sdot z18.s, z10.b, z4.b[2]\n" - "sdot z19.s, z11.b, z4.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[3]\n" - "sdot z17.s, z13.b, z4.b[3]\n" - "sdot z18.s, z14.b, z4.b[3]\n" - "sdot z19.s, z15.b, z4.b[3]\n" - "5:\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "whilelt p6.b, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.b\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z16.s, #0\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "mov z17.s, #0\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "mov z18.s, #0\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "mov z19.s, #0\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z20.s, #0\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z21.s, #0\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z22.s, #0\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z23.s, #0\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add a_ptr1, a_ptr1, #0x10\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p0/z, [c_ptr1]\n" - "ld1w z21.s, p1/z, [c_ptr1, #1, 
MUL VL]\n" - "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "subs %[loops], %[loops], #0x1\n" - "sdot z23.s, z11.b, z1.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "sdot z20.s, z12.b, z1.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "sdot z21.s, z13.b, z1.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z22.s, z14.b, z1.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "sdot z23.s, z15.b, z1.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "sdot z20.s, z8.b, z1.b[2]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "sdot z21.s, z9.b, z1.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "sdot z22.s, z10.b, z1.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z23.s, z11.b, z1.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z20.s, z12.b, z1.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z21.s, z13.b, z1.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z22.s, z14.b, z1.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" - "sdot z23.s, z15.b, z1.b[3]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[0]\n" - "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" - "sdot z20.s, z8.b, z5.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z17.s, z9.b, z4.b[0]\n" - "sdot z21.s, z9.b, z5.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[0]\n" - "sdot z22.s, z10.b, z5.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[0]\n" - "sdot z23.s, z11.b, z5.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[1]\n" - "sdot z20.s, z12.b, z5.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z4.b[1]\n" - "sdot z21.s, z13.b, z5.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z4.b[1]\n" - "sdot z22.s, z14.b, z5.b[1]\n" - "ld1b z14.b, p7/z, 
[%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z4.b[1]\n" - "sdot z23.s, z15.b, z5.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "sdot z20.s, z8.b, z5.b[2]\n" - "sdot z17.s, z9.b, z4.b[2]\n" - "sdot z21.s, z9.b, z5.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "sdot z22.s, z10.b, z5.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[2]\n" - "sdot z23.s, z11.b, z5.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[3]\n" - "sdot z20.s, z12.b, z5.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "sdot z17.s, z13.b, z4.b[3]\n" - "sdot z21.s, z13.b, z5.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "sdot z18.s, z14.b, z4.b[3]\n" - "sdot z22.s, z14.b, z5.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "sdot z19.s, z15.b, z4.b[3]\n" - "sdot z23.s, z15.b, z5.b[3]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "sdot z23.s, z11.b, z1.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "sdot z20.s, z12.b, z1.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "sdot z21.s, z13.b, z1.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z22.s, z14.b, z1.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "sdot z23.s, z15.b, z1.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "sdot z20.s, z8.b, z1.b[2]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "sdot z21.s, z9.b, z1.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "sdot z22.s, z10.b, z1.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z23.s, z11.b, z1.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z20.s, z12.b, z1.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z21.s, z13.b, z1.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z22.s, z14.b, z1.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" - "sdot z23.s, z15.b, z1.b[3]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[0]\n" - "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" - "sdot z20.s, z8.b, z5.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z17.s, z9.b, z4.b[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "sdot z21.s, z9.b, z5.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - "sdot z22.s, z10.b, z5.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, 
z4.b[0]\n" - "sdot z23.s, z11.b, z5.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[1]\n" - "sdot z20.s, z12.b, z5.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z4.b[1]\n" - "sdot z21.s, z13.b, z5.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z4.b[1]\n" - "sdot z22.s, z14.b, z5.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z4.b[1]\n" - "sdot z23.s, z15.b, z5.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "sdot z20.s, z8.b, z5.b[2]\n" - "sdot z17.s, z9.b, z4.b[2]\n" - "sdot z21.s, z9.b, z5.b[2]\n" - "sdot z18.s, z10.b, z4.b[2]\n" - "sdot z22.s, z10.b, z5.b[2]\n" - "sdot z19.s, z11.b, z4.b[2]\n" - "sdot z23.s, z11.b, z5.b[2]\n" - "sdot z16.s, z12.b, z4.b[3]\n" - "sdot z20.s, z12.b, z5.b[3]\n" - "sdot z17.s, z13.b, z4.b[3]\n" - "sdot z21.s, z13.b, z5.b[3]\n" - "sdot z18.s, z14.b, z4.b[3]\n" - "sdot z22.s, z14.b, z5.b[3]\n" - "sdot z19.s, z15.b, z4.b[3]\n" - "sdot z23.s, z15.b, z5.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[0]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "sdot z23.s, z11.b, z1.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z20.s, z12.b, z1.b[1]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "sdot z21.s, z13.b, z1.b[1]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z22.s, z14.b, z1.b[1]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "sdot z23.s, z15.b, z1.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "sdot z20.s, z8.b, z1.b[2]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "sdot z21.s, z9.b, z1.b[2]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "sdot z22.s, z10.b, z1.b[2]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z23.s, z11.b, z1.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z20.s, z12.b, z1.b[3]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z21.s, z13.b, z1.b[3]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z22.s, z14.b, z1.b[3]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "sdot z23.s, z15.b, z1.b[3]\n" - "b 5f\n" - "4:\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr1]\n" - "sdot z22.s, z10.b, z1.b[0]\n" - 
"ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "sdot z23.s, z11.b, z1.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "addvl a_ptr1, a_ptr1, #1\n" - "sdot z20.s, z12.b, z1.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "sdot z21.s, z13.b, z1.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z22.s, z14.b, z1.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "sdot z23.s, z15.b, z1.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "sdot z20.s, z8.b, z1.b[2]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "sdot z21.s, z9.b, z1.b[2]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "sdot z22.s, z10.b, z1.b[2]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z23.s, z11.b, z1.b[2]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z20.s, z12.b, z1.b[3]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z21.s, z13.b, z1.b[3]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z22.s, z14.b, z1.b[3]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "sdot z23.s, z15.b, z1.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z20.s, z8.b, z5.b[0]\n" - "sdot z17.s, z9.b, z4.b[0]\n" - "sdot z21.s, z9.b, z5.b[0]\n" - "sdot z18.s, z10.b, z4.b[0]\n" - "sdot z22.s, z10.b, z5.b[0]\n" - "sdot z19.s, z11.b, z4.b[0]\n" - "sdot z23.s, z11.b, z5.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z20.s, z12.b, z5.b[1]\n" - "sdot z17.s, z13.b, z4.b[1]\n" - "sdot z21.s, z13.b, z5.b[1]\n" - "sdot z18.s, z14.b, z4.b[1]\n" - "sdot z22.s, z14.b, z5.b[1]\n" - "sdot z19.s, z15.b, z4.b[1]\n" - "sdot z23.s, z15.b, z5.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[2]\n" - "sdot z20.s, z8.b, z5.b[2]\n" - "sdot z17.s, z9.b, z4.b[2]\n" - "sdot z21.s, z9.b, z5.b[2]\n" - "sdot z18.s, z10.b, z4.b[2]\n" - "sdot z22.s, z10.b, z5.b[2]\n" - "sdot z19.s, z11.b, z4.b[2]\n" - "sdot z23.s, z11.b, z5.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[3]\n" - "sdot z20.s, z12.b, z5.b[3]\n" - "sdot z17.s, z13.b, z4.b[3]\n" - "sdot z21.s, z13.b, z5.b[3]\n" - "sdot z18.s, z14.b, z4.b[3]\n" - "sdot z22.s, z14.b, z5.b[3]\n" - "sdot z19.s, z15.b, z4.b[3]\n" - "sdot z23.s, z15.b, z5.b[3]\n" - "5:\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "st1w z20.s, p0, [c_ptr1]\n" - "st1w z21.s, p1, [c_ptr1, #1, 
MUL VL]\n" - "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" - "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "whilelt p6.b, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.b\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z16.s, #0\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "mov z17.s, #0\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "mov z18.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "mov z19.s, #0\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "mov z20.s, #0\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z21.s, #0\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z22.s, #0\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z23.s, #0\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z24.s, #0\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z25.s, #0\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "mov z26.s, #0\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "mov z27.s, #0\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p0/z, [c_ptr1]\n" - "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1w z24.s, p0/z, [c_ptr2]\n" - "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" - "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "sdot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr2]\n" - "sdot z21.s, z9.b, 
z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z25.s, z9.b, z2.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "subs %[loops], %[loops], #0x1\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "sdot z26.s, z10.b, z2.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "sdot z23.s, z11.b, z1.b[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "sdot z27.s, z11.b, z2.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "sdot z20.s, z12.b, z1.b[1]\n" - "sdot z24.s, z12.b, z2.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "sdot z21.s, z13.b, z1.b[1]\n" - "sdot z25.s, z13.b, z2.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z22.s, z14.b, z1.b[1]\n" - "sdot z26.s, z14.b, z2.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "sdot z23.s, z15.b, z1.b[1]\n" - "sdot z27.s, z15.b, z2.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "sdot z20.s, z8.b, z1.b[2]\n" - "sdot z24.s, z8.b, z2.b[2]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "sdot z21.s, z9.b, z1.b[2]\n" - "sdot z25.s, z9.b, z2.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "sdot z22.s, z10.b, z1.b[2]\n" - "sdot z26.s, z10.b, z2.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z23.s, z11.b, z1.b[2]\n" - "sdot z27.s, z11.b, z2.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z20.s, z12.b, z1.b[3]\n" - "sdot z24.s, z12.b, z2.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z21.s, z13.b, z1.b[3]\n" - "sdot z25.s, z13.b, z2.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z22.s, z14.b, z1.b[3]\n" - "sdot z26.s, z14.b, z2.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" - "sdot z23.s, z15.b, z1.b[3]\n" - "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" - "sdot z27.s, z15.b, z2.b[3]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[0]\n" - "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n" - "sdot z20.s, z8.b, z5.b[0]\n" - "sdot z24.s, z8.b, z6.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z17.s, z9.b, z4.b[0]\n" - "sdot z21.s, z9.b, z5.b[0]\n" - "sdot z25.s, z9.b, z6.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[0]\n" - "sdot z22.s, z10.b, z5.b[0]\n" - "sdot z26.s, z10.b, z6.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[0]\n" - "sdot z23.s, z11.b, z5.b[0]\n" - "sdot z27.s, z11.b, z6.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[1]\n" - "sdot z20.s, z12.b, z5.b[1]\n" - "sdot z24.s, z12.b, z6.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z4.b[1]\n" - "sdot z21.s, z13.b, z5.b[1]\n" - "sdot z25.s, z13.b, z6.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z4.b[1]\n" - "sdot z22.s, z14.b, z5.b[1]\n" - "sdot z26.s, z14.b, z6.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z4.b[1]\n" - "sdot z23.s, z15.b, 
z5.b[1]\n" - "sdot z27.s, z15.b, z6.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "sdot z20.s, z8.b, z5.b[2]\n" - "sdot z24.s, z8.b, z6.b[2]\n" - "sdot z17.s, z9.b, z4.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "sdot z21.s, z9.b, z5.b[2]\n" - "sdot z25.s, z9.b, z6.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[2]\n" - "sdot z22.s, z10.b, z5.b[2]\n" - "sdot z26.s, z10.b, z6.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[2]\n" - "sdot z23.s, z11.b, z5.b[2]\n" - "sdot z27.s, z11.b, z6.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[3]\n" - "sdot z20.s, z12.b, z5.b[3]\n" - "sdot z24.s, z12.b, z6.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "sdot z17.s, z13.b, z4.b[3]\n" - "sdot z21.s, z13.b, z5.b[3]\n" - "sdot z25.s, z13.b, z6.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "sdot z18.s, z14.b, z4.b[3]\n" - "sdot z22.s, z14.b, z5.b[3]\n" - "sdot z26.s, z14.b, z6.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "sdot z19.s, z15.b, z4.b[3]\n" - "sdot z23.s, z15.b, z5.b[3]\n" - "sdot z27.s, z15.b, z6.b[3]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "sdot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr2]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z25.s, z9.b, z2.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "sdot z26.s, z10.b, z2.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "sdot z23.s, z11.b, z1.b[0]\n" - "sdot z27.s, z11.b, z2.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "sdot z20.s, z12.b, z1.b[1]\n" - "sdot z24.s, z12.b, z2.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "sdot z21.s, z13.b, z1.b[1]\n" - "sdot z25.s, z13.b, z2.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z22.s, z14.b, z1.b[1]\n" - "sdot z26.s, z14.b, z2.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "sdot z23.s, z15.b, z1.b[1]\n" - "sdot z27.s, z15.b, z2.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "sdot z20.s, z8.b, z1.b[2]\n" - "sdot z24.s, z8.b, z2.b[2]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "sdot z21.s, z9.b, z1.b[2]\n" - "sdot z25.s, z9.b, z2.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "sdot z22.s, z10.b, z1.b[2]\n" - "sdot z26.s, z10.b, z2.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z23.s, z11.b, z1.b[2]\n" - "sdot z27.s, z11.b, z2.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z20.s, z12.b, z1.b[3]\n" - "sdot z24.s, z12.b, z2.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z21.s, z13.b, z1.b[3]\n" - "sdot z25.s, z13.b, z2.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "sdot z18.s, z14.b, 
z0.b[3]\n" - "sdot z22.s, z14.b, z1.b[3]\n" - "sdot z26.s, z14.b, z2.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" - "sdot z23.s, z15.b, z1.b[3]\n" - "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" - "sdot z27.s, z15.b, z2.b[3]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[0]\n" - "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n" - "sdot z20.s, z8.b, z5.b[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "sdot z24.s, z8.b, z6.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z17.s, z9.b, z4.b[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - "sdot z21.s, z9.b, z5.b[0]\n" - "addvl a_ptr2, a_ptr2, #2\n" - "sdot z25.s, z9.b, z6.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[0]\n" - "sdot z22.s, z10.b, z5.b[0]\n" - "sdot z26.s, z10.b, z6.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[0]\n" - "sdot z23.s, z11.b, z5.b[0]\n" - "sdot z27.s, z11.b, z6.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[1]\n" - "sdot z20.s, z12.b, z5.b[1]\n" - "sdot z24.s, z12.b, z6.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z4.b[1]\n" - "sdot z21.s, z13.b, z5.b[1]\n" - "sdot z25.s, z13.b, z6.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z4.b[1]\n" - "sdot z22.s, z14.b, z5.b[1]\n" - "sdot z26.s, z14.b, z6.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z4.b[1]\n" - "sdot z23.s, z15.b, z5.b[1]\n" - "sdot z27.s, z15.b, z6.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "sdot z20.s, z8.b, z5.b[2]\n" - "sdot z24.s, z8.b, z6.b[2]\n" - "sdot z17.s, z9.b, z4.b[2]\n" - "sdot z21.s, z9.b, z5.b[2]\n" - "sdot z25.s, z9.b, z6.b[2]\n" - "sdot z18.s, z10.b, z4.b[2]\n" - "sdot z22.s, z10.b, z5.b[2]\n" - "sdot z26.s, z10.b, z6.b[2]\n" - "sdot z19.s, z11.b, z4.b[2]\n" - "sdot z23.s, z11.b, z5.b[2]\n" - "sdot z27.s, z11.b, z6.b[2]\n" - "sdot z16.s, z12.b, z4.b[3]\n" - "sdot z20.s, z12.b, z5.b[3]\n" - "sdot z24.s, z12.b, z6.b[3]\n" - "sdot z17.s, z13.b, z4.b[3]\n" - "sdot z21.s, z13.b, z5.b[3]\n" - "sdot z25.s, z13.b, z6.b[3]\n" - "sdot z18.s, z14.b, z4.b[3]\n" - "sdot z22.s, z14.b, z5.b[3]\n" - "sdot z26.s, z14.b, z6.b[3]\n" - "sdot z19.s, z15.b, z4.b[3]\n" - "sdot z23.s, z15.b, z5.b[3]\n" - "sdot z27.s, z15.b, z6.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[0]\n" - "sdot z24.s, z8.b, z2.b[0]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "sdot z25.s, z9.b, z2.b[0]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "sdot z26.s, z10.b, z2.b[0]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "sdot z23.s, z11.b, z1.b[0]\n" - "sdot z27.s, z11.b, z2.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z20.s, z12.b, z1.b[1]\n" - "sdot z24.s, z12.b, z2.b[1]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "sdot z21.s, z13.b, z1.b[1]\n" - "sdot z25.s, z13.b, z2.b[1]\n" - 
"sdot z18.s, z14.b, z0.b[1]\n" - "sdot z22.s, z14.b, z1.b[1]\n" - "sdot z26.s, z14.b, z2.b[1]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "sdot z23.s, z15.b, z1.b[1]\n" - "sdot z27.s, z15.b, z2.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "sdot z20.s, z8.b, z1.b[2]\n" - "sdot z24.s, z8.b, z2.b[2]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "sdot z21.s, z9.b, z1.b[2]\n" - "sdot z25.s, z9.b, z2.b[2]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "sdot z22.s, z10.b, z1.b[2]\n" - "sdot z26.s, z10.b, z2.b[2]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z23.s, z11.b, z1.b[2]\n" - "sdot z27.s, z11.b, z2.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z20.s, z12.b, z1.b[3]\n" - "sdot z24.s, z12.b, z2.b[3]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z21.s, z13.b, z1.b[3]\n" - "sdot z25.s, z13.b, z2.b[3]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z22.s, z14.b, z1.b[3]\n" - "sdot z26.s, z14.b, z2.b[3]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "sdot z23.s, z15.b, z1.b[3]\n" - "sdot z27.s, z15.b, z2.b[3]\n" - "b 5f\n" - "4:\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" - "sdot z24.s, z8.b, z2.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr1]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "ld1rqb z6.b, p6/z, [a_ptr2]\n" - "sdot z25.s, z9.b, z2.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" - "sdot z26.s, z10.b, z2.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" - "sdot z23.s, z11.b, z1.b[0]\n" - "sdot z27.s, z11.b, z2.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "sdot z20.s, z12.b, z1.b[1]\n" - "sdot z24.s, z12.b, z2.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "sdot z21.s, z13.b, z1.b[1]\n" - "sdot z25.s, z13.b, z2.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z22.s, z14.b, z1.b[1]\n" - "sdot z26.s, z14.b, z2.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "sdot z23.s, z15.b, z1.b[1]\n" - "sdot z27.s, z15.b, z2.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "sdot z20.s, z8.b, z1.b[2]\n" - "sdot z24.s, z8.b, z2.b[2]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "sdot z21.s, z9.b, z1.b[2]\n" - "sdot z25.s, z9.b, z2.b[2]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "sdot z22.s, z10.b, z1.b[2]\n" - "sdot z26.s, z10.b, z2.b[2]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z23.s, z11.b, z1.b[2]\n" - "sdot z27.s, z11.b, z2.b[2]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z20.s, z12.b, z1.b[3]\n" - "sdot z24.s, z12.b, z2.b[3]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z21.s, z13.b, z1.b[3]\n" - "sdot z25.s, z13.b, z2.b[3]\n" - "sdot z18.s, z14.b, 
z0.b[3]\n" - "sdot z22.s, z14.b, z1.b[3]\n" - "sdot z26.s, z14.b, z2.b[3]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "sdot z23.s, z15.b, z1.b[3]\n" - "sdot z27.s, z15.b, z2.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z20.s, z8.b, z5.b[0]\n" - "sdot z24.s, z8.b, z6.b[0]\n" - "sdot z17.s, z9.b, z4.b[0]\n" - "sdot z21.s, z9.b, z5.b[0]\n" - "sdot z25.s, z9.b, z6.b[0]\n" - "sdot z18.s, z10.b, z4.b[0]\n" - "sdot z22.s, z10.b, z5.b[0]\n" - "sdot z26.s, z10.b, z6.b[0]\n" - "sdot z19.s, z11.b, z4.b[0]\n" - "sdot z23.s, z11.b, z5.b[0]\n" - "sdot z27.s, z11.b, z6.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z20.s, z12.b, z5.b[1]\n" - "sdot z24.s, z12.b, z6.b[1]\n" - "sdot z17.s, z13.b, z4.b[1]\n" - "sdot z21.s, z13.b, z5.b[1]\n" - "sdot z25.s, z13.b, z6.b[1]\n" - "sdot z18.s, z14.b, z4.b[1]\n" - "sdot z22.s, z14.b, z5.b[1]\n" - "sdot z26.s, z14.b, z6.b[1]\n" - "sdot z19.s, z15.b, z4.b[1]\n" - "sdot z23.s, z15.b, z5.b[1]\n" - "sdot z27.s, z15.b, z6.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[2]\n" - "sdot z20.s, z8.b, z5.b[2]\n" - "sdot z24.s, z8.b, z6.b[2]\n" - "sdot z17.s, z9.b, z4.b[2]\n" - "sdot z21.s, z9.b, z5.b[2]\n" - "sdot z25.s, z9.b, z6.b[2]\n" - "sdot z18.s, z10.b, z4.b[2]\n" - "sdot z22.s, z10.b, z5.b[2]\n" - "sdot z26.s, z10.b, z6.b[2]\n" - "sdot z19.s, z11.b, z4.b[2]\n" - "sdot z23.s, z11.b, z5.b[2]\n" - "sdot z27.s, z11.b, z6.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[3]\n" - "sdot z20.s, z12.b, z5.b[3]\n" - "sdot z24.s, z12.b, z6.b[3]\n" - "sdot z17.s, z13.b, z4.b[3]\n" - "sdot z21.s, z13.b, z5.b[3]\n" - "sdot z25.s, z13.b, z6.b[3]\n" - "sdot z18.s, z14.b, z4.b[3]\n" - "sdot z22.s, z14.b, z5.b[3]\n" - "sdot z26.s, z14.b, z6.b[3]\n" - "sdot z19.s, z15.b, z4.b[3]\n" - "sdot z23.s, z15.b, z5.b[3]\n" - "sdot z27.s, z15.b, z6.b[3]\n" - "5:\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "st1w z20.s, p0, [c_ptr1]\n" - "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" - "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" - "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" - "st1w z24.s, p0, [c_ptr2]\n" - "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" - "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" - "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" 
(static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
-                : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
-            );
-            break;
-        default:
-        case 4:
-            __asm __volatile (
-                "a_ptr1 .req X0\n"
-                "a_ptr2 .req X1\n"
-                "a_ptr3 .req X2\n"
-                "c_ptr1 .req X3\n"
-                "c_ptr2 .req X4\n"
-                "c_ptr3 .req X5\n"
-                "add a_ptr1, %[a_ptr0], %[lda]\n"
-                "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                "add a_ptr2, a_ptr1, %[lda]\n"
-                "add c_ptr2, c_ptr1, %[ldc]\n"
-                "add a_ptr3, a_ptr2, %[lda]\n"
-                "add c_ptr3, c_ptr2, %[ldc]\n"
-                "whilelt p6.b, %[temp], %[leftovers]\n"
-                "whilelt p0.s, %[temp], %[width]\n"
-                "incw %[temp], all, mul #1\n"
-                "ptrue p7.b\n"
-                "whilelt p1.s, %[temp], %[width]\n"
-                "incw %[temp], all, mul #1\n"
-                "whilelt p2.s, %[temp], %[width]\n"
-                "incw %[temp], all, mul #1\n"
-                "whilelt p3.s, %[temp], %[width]\n"
-                "cbnz %[accumulate], 1f\n"
-                "mov z16.s, #0\n"
-                "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                "mov z17.s, #0\n"
-                "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                "mov z18.s, #0\n"
-                "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                "mov z19.s, #0\n"
-                "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                "mov z20.s, #0\n"
-                "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                "mov z21.s, #0\n"
-                "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                "mov z22.s, #0\n"
-                "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                "mov z23.s, #0\n"
-                "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                "mov z24.s, #0\n"
-                "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                "mov z25.s, #0\n"
-                "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                "mov z26.s, #0\n"
-                "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                "mov z27.s, #0\n"
-                "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                "mov z28.s, #0\n"
-                "add a_ptr1, a_ptr1, #0x10\n"
-                "mov z29.s, #0\n"
-                "add a_ptr2, a_ptr2, #0x10\n"
-                "mov z30.s, #0\n"
-                "add a_ptr3, a_ptr3, #0x10\n"
-                "mov z31.s, #0\n"
-                "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                "cbz %[loops], 2f\n"
-                "b 3f\n"
-                "1:\n"
-                "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
-                "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
-                "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
-                "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                "ld1w z20.s, p0/z, [c_ptr1]\n"
-                "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
-                "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
-                "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
-                "ld1w z24.s, p0/z, [c_ptr2]\n"
-                "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
-                "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
-                "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
-                "ld1w z28.s, p0/z, [c_ptr3]\n"
-                "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
-                "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
-                "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
-                "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                "add a_ptr1, a_ptr1, #0x10\n"
-                "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                "add a_ptr2, a_ptr2, #0x10\n"
-                "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                "add a_ptr3, a_ptr3, #0x10\n"
-                "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
-                "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
-                "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
-                "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
-                "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
-                "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
-                "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
-                "addvl %[b_ptr0], %[b_ptr0], #8\n"
-                "cbz %[loops], 2f\n"
-                "3:\n"
-                "sdot z16.s, z8.b, z0.b[0]\n"
-                "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
-                "sdot z20.s, z8.b, z1.b[0]\n"
-                "ld1rqb z4.b, p7/z,
[%[a_ptr0]]\n" - "sdot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "sdot z28.s, z8.b, z3.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr2]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z7.b, p7/z, [a_ptr3]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z25.s, z9.b, z2.b[0]\n" - "subs %[loops], %[loops], #0x1\n" - "sdot z29.s, z9.b, z3.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "sdot z26.s, z10.b, z2.b[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "sdot z30.s, z10.b, z3.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "add a_ptr3, a_ptr3, #0x20\n" - "sdot z23.s, z11.b, z1.b[0]\n" - "sdot z27.s, z11.b, z2.b[0]\n" - "sdot z31.s, z11.b, z3.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "sdot z20.s, z12.b, z1.b[1]\n" - "sdot z24.s, z12.b, z2.b[1]\n" - "sdot z28.s, z12.b, z3.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "sdot z21.s, z13.b, z1.b[1]\n" - "sdot z25.s, z13.b, z2.b[1]\n" - "sdot z29.s, z13.b, z3.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z22.s, z14.b, z1.b[1]\n" - "sdot z26.s, z14.b, z2.b[1]\n" - "sdot z30.s, z14.b, z3.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "sdot z23.s, z15.b, z1.b[1]\n" - "sdot z27.s, z15.b, z2.b[1]\n" - "sdot z31.s, z15.b, z3.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "sdot z20.s, z8.b, z1.b[2]\n" - "sdot z24.s, z8.b, z2.b[2]\n" - "sdot z28.s, z8.b, z3.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "sdot z21.s, z9.b, z1.b[2]\n" - "sdot z25.s, z9.b, z2.b[2]\n" - "sdot z29.s, z9.b, z3.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "sdot z22.s, z10.b, z1.b[2]\n" - "sdot z26.s, z10.b, z2.b[2]\n" - "sdot z30.s, z10.b, z3.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z23.s, z11.b, z1.b[2]\n" - "sdot z27.s, z11.b, z2.b[2]\n" - "sdot z31.s, z11.b, z3.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z20.s, z12.b, z1.b[3]\n" - "sdot z24.s, z12.b, z2.b[3]\n" - "sdot z28.s, z12.b, z3.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z21.s, z13.b, z1.b[3]\n" - "sdot z25.s, z13.b, z2.b[3]\n" - "sdot z29.s, z13.b, z3.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z22.s, z14.b, z1.b[3]\n" - "sdot z26.s, z14.b, z2.b[3]\n" - "sdot z30.s, z14.b, z3.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" - "sdot z23.s, z15.b, z1.b[3]\n" - "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" - "sdot z27.s, z15.b, z2.b[3]\n" - "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n" - "sdot z31.s, z15.b, z3.b[3]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[0]\n" - "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n" - "sdot z20.s, z8.b, z5.b[0]\n" - "sdot z24.s, z8.b, z6.b[0]\n" - "sdot z28.s, z8.b, z7.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z17.s, z9.b, z4.b[0]\n" - "sdot z21.s, z9.b, z5.b[0]\n" - "sdot z25.s, z9.b, z6.b[0]\n" - "sdot 
z29.s, z9.b, z7.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[0]\n" - "sdot z22.s, z10.b, z5.b[0]\n" - "sdot z26.s, z10.b, z6.b[0]\n" - "sdot z30.s, z10.b, z7.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[0]\n" - "sdot z23.s, z11.b, z5.b[0]\n" - "sdot z27.s, z11.b, z6.b[0]\n" - "sdot z31.s, z11.b, z7.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[1]\n" - "sdot z20.s, z12.b, z5.b[1]\n" - "sdot z24.s, z12.b, z6.b[1]\n" - "sdot z28.s, z12.b, z7.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z4.b[1]\n" - "sdot z21.s, z13.b, z5.b[1]\n" - "sdot z25.s, z13.b, z6.b[1]\n" - "sdot z29.s, z13.b, z7.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z4.b[1]\n" - "sdot z22.s, z14.b, z5.b[1]\n" - "sdot z26.s, z14.b, z6.b[1]\n" - "sdot z30.s, z14.b, z7.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z4.b[1]\n" - "sdot z23.s, z15.b, z5.b[1]\n" - "sdot z27.s, z15.b, z6.b[1]\n" - "sdot z31.s, z15.b, z7.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "sdot z20.s, z8.b, z5.b[2]\n" - "sdot z24.s, z8.b, z6.b[2]\n" - "sdot z28.s, z8.b, z7.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "sdot z17.s, z9.b, z4.b[2]\n" - "sdot z21.s, z9.b, z5.b[2]\n" - "sdot z25.s, z9.b, z6.b[2]\n" - "sdot z29.s, z9.b, z7.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[2]\n" - "sdot z22.s, z10.b, z5.b[2]\n" - "sdot z26.s, z10.b, z6.b[2]\n" - "sdot z30.s, z10.b, z7.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[2]\n" - "sdot z23.s, z11.b, z5.b[2]\n" - "sdot z27.s, z11.b, z6.b[2]\n" - "sdot z31.s, z11.b, z7.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[3]\n" - "sdot z20.s, z12.b, z5.b[3]\n" - "sdot z24.s, z12.b, z6.b[3]\n" - "sdot z28.s, z12.b, z7.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "sdot z17.s, z13.b, z4.b[3]\n" - "sdot z21.s, z13.b, z5.b[3]\n" - "sdot z25.s, z13.b, z6.b[3]\n" - "sdot z29.s, z13.b, z7.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "sdot z18.s, z14.b, z4.b[3]\n" - "sdot z22.s, z14.b, z5.b[3]\n" - "sdot z26.s, z14.b, z6.b[3]\n" - "sdot z30.s, z14.b, z7.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "sdot z19.s, z15.b, z4.b[3]\n" - "sdot z23.s, z15.b, z5.b[3]\n" - "sdot z27.s, z15.b, z6.b[3]\n" - "sdot z31.s, z15.b, z7.b[3]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "sdot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "sdot z28.s, z8.b, z3.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr2]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z7.b, p7/z, [a_ptr3]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z25.s, z9.b, z2.b[0]\n" - "sdot z29.s, z9.b, z3.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "sdot z26.s, z10.b, z2.b[0]\n" - "sdot z30.s, z10.b, z3.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "sdot z23.s, z11.b, z1.b[0]\n" - "sdot z27.s, z11.b, z2.b[0]\n" - "sdot z31.s, z11.b, z3.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "sdot 
z20.s, z12.b, z1.b[1]\n" - "sdot z24.s, z12.b, z2.b[1]\n" - "sdot z28.s, z12.b, z3.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "sdot z21.s, z13.b, z1.b[1]\n" - "sdot z25.s, z13.b, z2.b[1]\n" - "sdot z29.s, z13.b, z3.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z22.s, z14.b, z1.b[1]\n" - "sdot z26.s, z14.b, z2.b[1]\n" - "sdot z30.s, z14.b, z3.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "sdot z23.s, z15.b, z1.b[1]\n" - "sdot z27.s, z15.b, z2.b[1]\n" - "sdot z31.s, z15.b, z3.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "sdot z20.s, z8.b, z1.b[2]\n" - "sdot z24.s, z8.b, z2.b[2]\n" - "sdot z28.s, z8.b, z3.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "sdot z21.s, z9.b, z1.b[2]\n" - "sdot z25.s, z9.b, z2.b[2]\n" - "sdot z29.s, z9.b, z3.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "sdot z22.s, z10.b, z1.b[2]\n" - "sdot z26.s, z10.b, z2.b[2]\n" - "sdot z30.s, z10.b, z3.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z23.s, z11.b, z1.b[2]\n" - "sdot z27.s, z11.b, z2.b[2]\n" - "sdot z31.s, z11.b, z3.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z20.s, z12.b, z1.b[3]\n" - "sdot z24.s, z12.b, z2.b[3]\n" - "sdot z28.s, z12.b, z3.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z21.s, z13.b, z1.b[3]\n" - "sdot z25.s, z13.b, z2.b[3]\n" - "sdot z29.s, z13.b, z3.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z22.s, z14.b, z1.b[3]\n" - "sdot z26.s, z14.b, z2.b[3]\n" - "sdot z30.s, z14.b, z3.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" - "sdot z23.s, z15.b, z1.b[3]\n" - "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" - "sdot z27.s, z15.b, z2.b[3]\n" - "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n" - "sdot z31.s, z15.b, z3.b[3]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[0]\n" - "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n" - "sdot z20.s, z8.b, z5.b[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "sdot z24.s, z8.b, z6.b[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - "sdot z28.s, z8.b, z7.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z17.s, z9.b, z4.b[0]\n" - "addvl a_ptr2, a_ptr2, #2\n" - "sdot z21.s, z9.b, z5.b[0]\n" - "addvl a_ptr3, a_ptr3, #2\n" - "sdot z25.s, z9.b, z6.b[0]\n" - "sdot z29.s, z9.b, z7.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z4.b[0]\n" - "sdot z22.s, z10.b, z5.b[0]\n" - "sdot z26.s, z10.b, z6.b[0]\n" - "sdot z30.s, z10.b, z7.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z4.b[0]\n" - "sdot z23.s, z11.b, z5.b[0]\n" - "sdot z27.s, z11.b, z6.b[0]\n" - "sdot z31.s, z11.b, z7.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[1]\n" - "sdot z20.s, z12.b, z5.b[1]\n" - "sdot z24.s, z12.b, z6.b[1]\n" - "sdot z28.s, z12.b, z7.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z4.b[1]\n" - "sdot z21.s, z13.b, z5.b[1]\n" - "sdot z25.s, z13.b, z6.b[1]\n" - "sdot z29.s, z13.b, z7.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z4.b[1]\n" - "sdot 
z22.s, z14.b, z5.b[1]\n" - "sdot z26.s, z14.b, z6.b[1]\n" - "sdot z30.s, z14.b, z7.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z4.b[1]\n" - "sdot z23.s, z15.b, z5.b[1]\n" - "sdot z27.s, z15.b, z6.b[1]\n" - "sdot z31.s, z15.b, z7.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "sdot z20.s, z8.b, z5.b[2]\n" - "sdot z24.s, z8.b, z6.b[2]\n" - "sdot z28.s, z8.b, z7.b[2]\n" - "sdot z17.s, z9.b, z4.b[2]\n" - "sdot z21.s, z9.b, z5.b[2]\n" - "sdot z25.s, z9.b, z6.b[2]\n" - "sdot z29.s, z9.b, z7.b[2]\n" - "sdot z18.s, z10.b, z4.b[2]\n" - "sdot z22.s, z10.b, z5.b[2]\n" - "sdot z26.s, z10.b, z6.b[2]\n" - "sdot z30.s, z10.b, z7.b[2]\n" - "sdot z19.s, z11.b, z4.b[2]\n" - "sdot z23.s, z11.b, z5.b[2]\n" - "sdot z27.s, z11.b, z6.b[2]\n" - "sdot z31.s, z11.b, z7.b[2]\n" - "sdot z16.s, z12.b, z4.b[3]\n" - "sdot z20.s, z12.b, z5.b[3]\n" - "sdot z24.s, z12.b, z6.b[3]\n" - "sdot z28.s, z12.b, z7.b[3]\n" - "sdot z17.s, z13.b, z4.b[3]\n" - "sdot z21.s, z13.b, z5.b[3]\n" - "sdot z25.s, z13.b, z6.b[3]\n" - "sdot z29.s, z13.b, z7.b[3]\n" - "sdot z18.s, z14.b, z4.b[3]\n" - "sdot z22.s, z14.b, z5.b[3]\n" - "sdot z26.s, z14.b, z6.b[3]\n" - "sdot z30.s, z14.b, z7.b[3]\n" - "sdot z19.s, z15.b, z4.b[3]\n" - "sdot z23.s, z15.b, z5.b[3]\n" - "sdot z27.s, z15.b, z6.b[3]\n" - "sdot z31.s, z15.b, z7.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[0]\n" - "sdot z24.s, z8.b, z2.b[0]\n" - "sdot z28.s, z8.b, z3.b[0]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "sdot z25.s, z9.b, z2.b[0]\n" - "sdot z29.s, z9.b, z3.b[0]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "sdot z26.s, z10.b, z2.b[0]\n" - "sdot z30.s, z10.b, z3.b[0]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "sdot z23.s, z11.b, z1.b[0]\n" - "sdot z27.s, z11.b, z2.b[0]\n" - "sdot z31.s, z11.b, z3.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z20.s, z12.b, z1.b[1]\n" - "sdot z24.s, z12.b, z2.b[1]\n" - "sdot z28.s, z12.b, z3.b[1]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "sdot z21.s, z13.b, z1.b[1]\n" - "sdot z25.s, z13.b, z2.b[1]\n" - "sdot z29.s, z13.b, z3.b[1]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z22.s, z14.b, z1.b[1]\n" - "sdot z26.s, z14.b, z2.b[1]\n" - "sdot z30.s, z14.b, z3.b[1]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "sdot z23.s, z15.b, z1.b[1]\n" - "sdot z27.s, z15.b, z2.b[1]\n" - "sdot z31.s, z15.b, z3.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "sdot z20.s, z8.b, z1.b[2]\n" - "sdot z24.s, z8.b, z2.b[2]\n" - "sdot z28.s, z8.b, z3.b[2]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "sdot z21.s, z9.b, z1.b[2]\n" - "sdot z25.s, z9.b, z2.b[2]\n" - "sdot z29.s, z9.b, z3.b[2]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "sdot z22.s, z10.b, z1.b[2]\n" - "sdot 
z26.s, z10.b, z2.b[2]\n" - "sdot z30.s, z10.b, z3.b[2]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z23.s, z11.b, z1.b[2]\n" - "sdot z27.s, z11.b, z2.b[2]\n" - "sdot z31.s, z11.b, z3.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z20.s, z12.b, z1.b[3]\n" - "sdot z24.s, z12.b, z2.b[3]\n" - "sdot z28.s, z12.b, z3.b[3]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z21.s, z13.b, z1.b[3]\n" - "sdot z25.s, z13.b, z2.b[3]\n" - "sdot z29.s, z13.b, z3.b[3]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z22.s, z14.b, z1.b[3]\n" - "sdot z26.s, z14.b, z2.b[3]\n" - "sdot z30.s, z14.b, z3.b[3]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "sdot z23.s, z15.b, z1.b[3]\n" - "sdot z27.s, z15.b, z2.b[3]\n" - "sdot z31.s, z15.b, z3.b[3]\n" - "b 5f\n" - "4:\n" - "sdot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" - "sdot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr1]\n" - "sdot z28.s, z8.b, z3.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "sdot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z6.b, p6/z, [a_ptr2]\n" - "sdot z21.s, z9.b, z1.b[0]\n" - "ld1rqb z7.b, p6/z, [a_ptr3]\n" - "sdot z25.s, z9.b, z2.b[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "sdot z29.s, z9.b, z3.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z18.s, z10.b, z0.b[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" - "sdot z22.s, z10.b, z1.b[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" - "sdot z26.s, z10.b, z2.b[0]\n" - "addvl a_ptr3, a_ptr3, #1\n" - "sdot z30.s, z10.b, z3.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z19.s, z11.b, z0.b[0]\n" - "sdot z23.s, z11.b, z1.b[0]\n" - "sdot z27.s, z11.b, z2.b[0]\n" - "sdot z31.s, z11.b, z3.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z16.s, z12.b, z0.b[1]\n" - "sdot z20.s, z12.b, z1.b[1]\n" - "sdot z24.s, z12.b, z2.b[1]\n" - "sdot z28.s, z12.b, z3.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z17.s, z13.b, z0.b[1]\n" - "sdot z21.s, z13.b, z1.b[1]\n" - "sdot z25.s, z13.b, z2.b[1]\n" - "sdot z29.s, z13.b, z3.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "sdot z18.s, z14.b, z0.b[1]\n" - "sdot z22.s, z14.b, z1.b[1]\n" - "sdot z26.s, z14.b, z2.b[1]\n" - "sdot z30.s, z14.b, z3.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z19.s, z15.b, z0.b[1]\n" - "sdot z23.s, z15.b, z1.b[1]\n" - "sdot z27.s, z15.b, z2.b[1]\n" - "sdot z31.s, z15.b, z3.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "sdot z20.s, z8.b, z1.b[2]\n" - "sdot z24.s, z8.b, z2.b[2]\n" - "sdot z28.s, z8.b, z3.b[2]\n" - "sdot z17.s, z9.b, z0.b[2]\n" - "sdot z21.s, z9.b, z1.b[2]\n" - "sdot z25.s, z9.b, z2.b[2]\n" - "sdot z29.s, z9.b, z3.b[2]\n" - "sdot z18.s, z10.b, z0.b[2]\n" - "sdot z22.s, z10.b, z1.b[2]\n" - "sdot z26.s, z10.b, z2.b[2]\n" - "sdot z30.s, z10.b, z3.b[2]\n" - "sdot z19.s, z11.b, z0.b[2]\n" - "sdot z23.s, z11.b, z1.b[2]\n" - "sdot z27.s, z11.b, z2.b[2]\n" - "sdot z31.s, z11.b, z3.b[2]\n" - "sdot z16.s, z12.b, z0.b[3]\n" - "sdot z20.s, z12.b, z1.b[3]\n" - "sdot z24.s, z12.b, z2.b[3]\n" - "sdot z28.s, z12.b, z3.b[3]\n" - "sdot z17.s, z13.b, z0.b[3]\n" - "sdot z21.s, z13.b, z1.b[3]\n" - "sdot z25.s, z13.b, z2.b[3]\n" - "sdot z29.s, z13.b, z3.b[3]\n" - "sdot z18.s, z14.b, z0.b[3]\n" - "sdot z22.s, 
z14.b, z1.b[3]\n" - "sdot z26.s, z14.b, z2.b[3]\n" - "sdot z30.s, z14.b, z3.b[3]\n" - "sdot z19.s, z15.b, z0.b[3]\n" - "sdot z23.s, z15.b, z1.b[3]\n" - "sdot z27.s, z15.b, z2.b[3]\n" - "sdot z31.s, z15.b, z3.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z20.s, z8.b, z5.b[0]\n" - "sdot z24.s, z8.b, z6.b[0]\n" - "sdot z28.s, z8.b, z7.b[0]\n" - "sdot z17.s, z9.b, z4.b[0]\n" - "sdot z21.s, z9.b, z5.b[0]\n" - "sdot z25.s, z9.b, z6.b[0]\n" - "sdot z29.s, z9.b, z7.b[0]\n" - "sdot z18.s, z10.b, z4.b[0]\n" - "sdot z22.s, z10.b, z5.b[0]\n" - "sdot z26.s, z10.b, z6.b[0]\n" - "sdot z30.s, z10.b, z7.b[0]\n" - "sdot z19.s, z11.b, z4.b[0]\n" - "sdot z23.s, z11.b, z5.b[0]\n" - "sdot z27.s, z11.b, z6.b[0]\n" - "sdot z31.s, z11.b, z7.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "sdot z20.s, z12.b, z5.b[1]\n" - "sdot z24.s, z12.b, z6.b[1]\n" - "sdot z28.s, z12.b, z7.b[1]\n" - "sdot z17.s, z13.b, z4.b[1]\n" - "sdot z21.s, z13.b, z5.b[1]\n" - "sdot z25.s, z13.b, z6.b[1]\n" - "sdot z29.s, z13.b, z7.b[1]\n" - "sdot z18.s, z14.b, z4.b[1]\n" - "sdot z22.s, z14.b, z5.b[1]\n" - "sdot z26.s, z14.b, z6.b[1]\n" - "sdot z30.s, z14.b, z7.b[1]\n" - "sdot z19.s, z15.b, z4.b[1]\n" - "sdot z23.s, z15.b, z5.b[1]\n" - "sdot z27.s, z15.b, z6.b[1]\n" - "sdot z31.s, z15.b, z7.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "sdot z16.s, z8.b, z4.b[2]\n" - "sdot z20.s, z8.b, z5.b[2]\n" - "sdot z24.s, z8.b, z6.b[2]\n" - "sdot z28.s, z8.b, z7.b[2]\n" - "sdot z17.s, z9.b, z4.b[2]\n" - "sdot z21.s, z9.b, z5.b[2]\n" - "sdot z25.s, z9.b, z6.b[2]\n" - "sdot z29.s, z9.b, z7.b[2]\n" - "sdot z18.s, z10.b, z4.b[2]\n" - "sdot z22.s, z10.b, z5.b[2]\n" - "sdot z26.s, z10.b, z6.b[2]\n" - "sdot z30.s, z10.b, z7.b[2]\n" - "sdot z19.s, z11.b, z4.b[2]\n" - "sdot z23.s, z11.b, z5.b[2]\n" - "sdot z27.s, z11.b, z6.b[2]\n" - "sdot z31.s, z11.b, z7.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "sdot z16.s, z12.b, z4.b[3]\n" - "sdot z20.s, z12.b, z5.b[3]\n" - "sdot z24.s, z12.b, z6.b[3]\n" - "sdot z28.s, z12.b, z7.b[3]\n" - "sdot z17.s, z13.b, z4.b[3]\n" - "sdot z21.s, z13.b, z5.b[3]\n" - "sdot z25.s, z13.b, z6.b[3]\n" - "sdot z29.s, z13.b, z7.b[3]\n" - "sdot z18.s, z14.b, z4.b[3]\n" - "sdot z22.s, z14.b, z5.b[3]\n" - "sdot z26.s, z14.b, z6.b[3]\n" - "sdot z30.s, z14.b, z7.b[3]\n" - "sdot z19.s, z15.b, z4.b[3]\n" - "sdot z23.s, z15.b, z5.b[3]\n" - "sdot z27.s, z15.b, z6.b[3]\n" - "sdot z31.s, z15.b, z7.b[3]\n" - "5:\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "st1w z20.s, p0, [c_ptr1]\n" - "st1w z21.s, 
p1, [c_ptr1, #1, MUL VL]\n"
-                "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                "st1w z24.s, p0, [c_ptr2]\n"
-                "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
-                "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
-                "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
-                "st1w z28.s, p0, [c_ptr3]\n"
-                "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
-                "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
-                "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
-                ".unreq a_ptr1\n"
-                ".unreq a_ptr2\n"
-                ".unreq a_ptr3\n"
-                ".unreq c_ptr1\n"
-                ".unreq c_ptr2\n"
-                ".unreq c_ptr3\n"
-                : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
-                : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-            );
-            break;
-        }
-
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
new file mode 100644
index 0000000000..1aebedb861
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __ARM_FEATURE_SVE
+
+#include "../std_transforms_sve.hpp"
+
+#define ARGLIST  \
+    unsigned int, const unsigned int *, \
+    IndirectInputArg<int8_t>, \
+    size_t, size_t, \
+    const int8_t *, \
+    IndirectOutputArg<int32_t>, \
+    const int32_t *, Activation, bool
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_hybrid_s8s32_dot_6x4VL( ARGLIST );
+
+class cls_sve_hybrid_s8s32_dot_6x4VL
+{
+public:
+    typedef int8_t operand_type;
+    typedef int32_t result_type;
+
+    typedef void (*kern_type)( ARGLIST );
+
+    /* Kernel blocking parameters */
+    static constexpr unsigned int out_height()
+    {
+        return 6;
+    }
+
+    static unsigned int out_width()
+    {
+        return get_vector_length<int32_t>() * 4;
+    }
+
+    static constexpr unsigned int k_unroll()
+    {
+        return 4;
+    }
+
+    static constexpr bool supports_accumulate()
+    {
+        return true;
+    }
+
+    StdTransformsSVE<operand_type, result_type, 6, 4, 4> transforms = {};
+
+    // Default to the generic kernel
+    kern_type kernel=sve_hybrid_s8s32_dot_6x4VL;
+
+    cls_sve_hybrid_s8s32_dot_6x4VL(const CPUInfo *)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __ARM_FEATURE_SVE
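The blocking accessors in this header drive how the GEMM dispatcher tiles the output: each kernel call covers 6 rows by 4 SVE vectors of int32 results, consuming K four bytes at a time (one `sdot` step). A minimal sketch of the resulting tile arithmetic, assuming a 256-bit SVE implementation so that the runtime query `get_vector_length<int32_t>()` would return 8 (the constant below stands in for it):

```cpp
#include <cstdio>

int main() {
    // Stand-in for get_vector_length<int32_t>() on a 256-bit SVE machine.
    const unsigned int s32_lanes  = 8;
    const unsigned int out_height = 6;              // rows of A per kernel call
    const unsigned int out_width  = s32_lanes * 4;  // 4 vectors of int32 accumulators
    const unsigned int k_unroll   = 4;              // bytes of K per sdot step

    // 6 rows x 4 vectors = 24 accumulator registers (z8..z31 in the
    // full-height path of the assembly below), i.e. 6x32 int32 results at VL=256.
    std::printf("tile: %ux%u, K unroll %u\n", out_height, out_width, k_unroll);
    return 0;
}
```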
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
new file mode 100644
index 0000000000..cae9bf329f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
@@ -0,0 +1,1904 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_s8s32_dot_6x4VL (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+    size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int32_t> output_arg,
+    const int32_t *, Activation, bool accumulate
+)
+{
+    struct KernelArgs {
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const int8_t *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    if (accumulate) {
+        flags |= 0x1;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
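Before entering the assembly, the wrapper above packs the dispatch mode into `flags`: bit 0 marks accumulation (the kernel then reloads the existing C tile instead of zeroing), bit 2 an indirect output pointer array, and bit 3 indirect input. The assembly tests these with `tbz %x[flags], #0/#2/#3`. A minimal sketch of that bit layout; the constant names are illustrative, not from the source:

```cpp
#include <cstdint>
#include <cstdio>

// Bit positions follow the flag setup above; names are invented for clarity.
constexpr uint64_t FLAG_ACCUMULATE      = 0x1;  // tbz #0 clear -> zero accumulators
constexpr uint64_t FLAG_INDIRECT_OUTPUT = 0x4;  // tbz #2 clear -> direct output setup
constexpr uint64_t FLAG_INDIRECT_INPUT  = 0x8;  // tbz #3 clear -> direct input setup

int main() {
    const uint64_t flags = FLAG_ACCUMULATE | FLAG_INDIRECT_INPUT;
    std::printf("accumulate=%d indirect_out=%d indirect_in=%d\n",
                (flags & FLAG_ACCUMULATE)      != 0,
                (flags & FLAG_INDIRECT_OUTPUT) != 0,
                (flags & FLAG_INDIRECT_INPUT)  != 0);
    return 0;
}
```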
+    __asm__ __volatile__(
+      "ptrue p5.b\n"
+      "1:"  // Row loop
+      "cmp %x[M], #0x6\n"
+      "bge 61f\n"
+      "cmp %x[M], #0x4\n"
+      "bgt 49f\n"
+      "beq 37f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 25f\n"
+      "beq 13f\n"
+      "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+      "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "tbz %x[flags], #2, 2f\n"
+      "ldr x13, [%x[output_ptr], #0x0]\n"
+      "add x13, x13, x19, LSL #2\n"
+      "b 3f\n"
+      "2:"  // Height 1: setup direct output
+      "mov x13, %x[output_ptr]\n"
+      "3:"  // Height 1: Column loop
+      "mov x19, #0x0\n"
+      "whilelt p4.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p3.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p2.s, x19, x15\n"
+      "incw x19\n"
+      "whilelt p1.s, x19, x15\n"
+      "tbz %x[flags], #0, 4f\n"
+      "ld1w { z8.s }, p4/Z, [x13]\n"
+      "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+      "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+      "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+      "b 5f\n"
+      "4:"  // Height 1: no accumulate
+      "mov z8.s, #0x0\n"
+      "mov z9.s, #0x0\n"
+      "mov z10.s, #0x0\n"
+      "mov z11.s, #0x0\n"
+      "5:"  // Height 1: setup done
+      "mov x12, #0x0\n"
+      "6:"  // Height 1: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w11, [x20, x12, LSL #0x2]\n"
+      "tbz %x[flags], #3, 7f\n"
+      "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x10, [x20, #0x0]\n"
+      "cbnz x12, 8f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x10, x10, x19\n"
+      "b 8f\n"
+      "7:"  // Height 1: setup direct input
+      "mov x10, %x[input_ptr]\n"
+      "8:"  // Height 1: input setup done
+      "cmp x11, #0x10\n"
+      "ble 10f\n"
+      "9:"  // Height 1: Multiply loop: Main loop head
+      "ld1b { z6.b }, p5/Z, [x14]\n"
+      "whilelt p0.b, XZR, x11\n"
+      "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+      "sub x11, x11, #0x10\n"
+      "ld1rqb { z0.b }, p0/Z, [x10]\n"
+      "sdot z8.s, z6.b, z0.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+      "add x10, x10, #0x10\n"
+      "sdot z9.s, z7.b, z0.b[0]\n"
+      "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+      "cmp x11, #0x10\n"
+      "sdot z10.s, z6.b, z0.b[0]\n"
+      "ld1b { z6.b }, p5/Z, [x14, #4, MUL
VL]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "bgt 9b\n" + "10:" // Height 1: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "ble 11f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "addvl x14, x14, #4\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "ble 11f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "addvl x14, x14, #4\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "ble 11f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "11:" // Height 1: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 6b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "12:" // Height 1: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 3b\n" + "b 74f\n" + "13:" // Height 2 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 14f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19, LSL #2\n" + "b 15f\n" + "14:" // Height 2: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "15:" // 
Height 2: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x15\n" + "incw x19\n" + "whilelt p3.s, x19, x15\n" + "incw x19\n" + "whilelt p2.s, x19, x15\n" + "incw x19\n" + "whilelt p1.s, x19, x15\n" + "tbz %x[flags], #0, 16f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "b 17f\n" + "16:" // Height 2: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "17:" // Height 2: setup done + "mov x12, #0x0\n" + "18:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 19f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x12, 20f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "b 20f\n" + "19:" // Height 2: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "20:" // Height 2: input setup done + "cmp x11, #0x10\n" + "ble 22f\n" + "21:" // Height 2: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "cmp x11, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, 
z7.b, z1.b[3]\n" + "bgt 21b\n" + "22:" // Height 2: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "ble 23f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "ble 23f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "ble 23f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "23:" // Height 2: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 18b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "24:" // Height 2: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 15b\n" + "b 74f\n" + "25:" // Height 3 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 26f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "b 27f\n" + "26:" // Height 3: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "27:" // Height 3: Column loop + "mov x19, 
#0x0\n" + "whilelt p4.s, x19, x15\n" + "incw x19\n" + "whilelt p3.s, x19, x15\n" + "incw x19\n" + "whilelt p2.s, x19, x15\n" + "incw x19\n" + "whilelt p1.s, x19, x15\n" + "tbz %x[flags], #0, 28f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "b 29f\n" + "28:" // Height 3: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "29:" // Height 3: setup done + "mov x12, #0x0\n" + "30:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 31f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x12, 32f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "b 32f\n" + "31:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "32:" // Height 3: input setup done + "cmp x11, #0x10\n" + "ble 34f\n" + "33:" // Height 3: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "cmp x11, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "ld1b { z6.b }, 
p5/Z, [x14, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "bgt 33b\n" + "34:" // Height 3: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "add x26, x26, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "ble 35f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "ble 35f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "ble 35f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z11.s, 
z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "35:" // Height 3: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 30b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "36:" // Height 3: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 27b\n" + "b 74f\n" + "37:" // Height 4 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 38f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "b 39f\n" + "38:" // Height 4: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "39:" // Height 4: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x15\n" + "incw x19\n" + "whilelt p3.s, x19, x15\n" + "incw x19\n" + "whilelt p2.s, x19, x15\n" + "incw x19\n" + "whilelt p1.s, x19, x15\n" + "tbz %x[flags], #0, 40f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" + "b 41f\n" + "40:" // Height 4: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "41:" // Height 4: setup done + "mov x12, #0x0\n" + "42:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 43f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr 
x24, [x20, #0x18]\n" + "cbnz x12, 44f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "b 44f\n" + "43:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "44:" // Height 4: input setup done + "cmp x11, #0x10\n" + "ble 46f\n" + "45:" // Height 4: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x10\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, 
z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "bgt 45b\n" + "46:" // Height 4: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "ble 47f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "ble 47f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "ble 47f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "47:" // Height 4: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, 
#0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 42b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1w { z20.s }, p4, [x25]\n" + "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "48:" // Height 4: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 39b\n" + "b 74f\n" + "49:" // Height 5 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 50f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 51f\n" + "50:" // Height 5: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "51:" // Height 5: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x15\n" + "incw x19\n" + "whilelt p3.s, x19, x15\n" + "incw x19\n" + "whilelt p2.s, x19, x15\n" + "incw x19\n" + "whilelt p1.s, x19, x15\n" + "tbz %x[flags], #0, 52f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x23]\n" + "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n" + "b 53f\n" + "52:" // Height 5: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, 
#0x0\n" + "53:" // Height 5: setup done + "mov x12, #0x0\n" + "54:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 55f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x12, 56f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "b 56f\n" + "55:" // Height 5: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "56:" // Height 5: input setup done + "cmp x11, #0x10\n" + "ble 58f\n" + "57:" // Height 5: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x22, x22, #0x10\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x10\n" + "sdot z24.s, z6.b, z4.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sdot z25.s, z7.b, z4.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z26.s, z6.b, z4.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "sdot z27.s, z7.b, z4.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "sdot z24.s, z6.b, z4.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "sdot z25.s, z7.b, z4.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z26.s, z6.b, z4.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "sdot z27.s, z7.b, z4.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "sdot z24.s, z6.b, z4.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, 
z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "sdot z25.s, z7.b, z4.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z26.s, z6.b, z4.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "sdot z27.s, z7.b, z4.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "sdot z24.s, z6.b, z4.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "sdot z25.s, z7.b, z4.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z26.s, z6.b, z4.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "sdot z27.s, z7.b, z4.b[3]\n" + "bgt 57b\n" + "58:" // Height 5: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "add x22, x22, #0x10\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "sdot z24.s, z6.b, z4.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "sdot z25.s, z7.b, z4.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z26.s, z6.b, z4.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "sdot z27.s, z7.b, z4.b[0]\n" + "ble 59f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "sdot z24.s, z6.b, z4.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "sdot z25.s, z7.b, z4.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z26.s, z6.b, z4.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "sdot z27.s, z7.b, z4.b[1]\n" + "ble 59f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + 
"subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "sdot z24.s, z6.b, z4.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "sdot z25.s, z7.b, z4.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z26.s, z6.b, z4.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "sdot z27.s, z7.b, z4.b[2]\n" + "ble 59f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "sdot z24.s, z6.b, z4.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "sdot z25.s, z7.b, z4.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z26.s, z6.b, z4.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "sdot z27.s, z7.b, z4.b[3]\n" + "59:" // Height 5: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 54b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1w { z20.s }, p4, [x25]\n" + "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "st1w { z24.s }, p4, [x23]\n" + "st1w { z25.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x23, #3, MUL VL]\n" + "addvl x23, x23, #4\n" + "60:" // Height 5: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 51b\n" + "b 74f\n" + "61:" // Height 6 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 62f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr 
x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "ldr x21, [%x[output_ptr], #0x28]\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 63f\n" + "62:" // Height 6: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "add x21, x23, x19, LSL #2\n" + "add %x[output_ptr], x21, x19, LSL #2\n" + "63:" // Height 6: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x15\n" + "incw x19\n" + "whilelt p3.s, x19, x15\n" + "incw x19\n" + "whilelt p2.s, x19, x15\n" + "incw x19\n" + "whilelt p1.s, x19, x15\n" + "tbz %x[flags], #0, 64f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x23]\n" + "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z28.s }, p4/Z, [x21]\n" + "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n" + "b 65f\n" + "64:" // Height 6: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z28.s, #0x0\n" + "mov z29.s, #0x0\n" + "mov z30.s, #0x0\n" + "mov z31.s, #0x0\n" + "65:" // Height 6: setup done + "mov x12, #0x0\n" + "66:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 67f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x12, 68f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 68f\n" + "67:" // Height 6: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "add x20, x22, x19\n" + "68:" // Height 6: input setup done + "cmp x11, #0x10\n" + "ble 70f\n" + "69:" // Height 6: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, 
p5/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "ld1rqb { z5.b }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x20, x20, #0x10\n" + "sdot z24.s, z6.b, z4.b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x10\n" + "sdot z28.s, z6.b, z5.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sdot z25.s, z7.b, z4.b[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "sdot z29.s, z7.b, z5.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z26.s, z6.b, z4.b[0]\n" + "sdot z30.s, z6.b, z5.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "sdot z27.s, z7.b, z4.b[0]\n" + "sdot z31.s, z7.b, z5.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "sdot z24.s, z6.b, z4.b[1]\n" + "sdot z28.s, z6.b, z5.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "sdot z25.s, z7.b, z4.b[1]\n" + "sdot z29.s, z7.b, z5.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z26.s, z6.b, z4.b[1]\n" + "sdot z30.s, z6.b, z5.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "sdot z27.s, z7.b, z4.b[1]\n" + "sdot z31.s, z7.b, z5.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "sdot z24.s, z6.b, z4.b[2]\n" + "sdot z28.s, z6.b, z5.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "sdot z25.s, z7.b, z4.b[2]\n" + "sdot z29.s, z7.b, z5.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z26.s, z6.b, z4.b[2]\n" + "sdot z30.s, z6.b, z5.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "sdot z27.s, z7.b, z4.b[2]\n" + "sdot z31.s, z7.b, z5.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + 
"sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "sdot z24.s, z6.b, z4.b[3]\n" + "sdot z28.s, z6.b, z5.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "sdot z25.s, z7.b, z4.b[3]\n" + "sdot z29.s, z7.b, z5.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z26.s, z6.b, z4.b[3]\n" + "sdot z30.s, z6.b, z5.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "sdot z27.s, z7.b, z4.b[3]\n" + "sdot z31.s, z7.b, z5.b[3]\n" + "bgt 69b\n" + "70:" // Height 6: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "sdot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "sdot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "sdot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "sdot z16.s, z6.b, z2.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "sdot z13.s, z7.b, z1.b[0]\n" + "ld1rqb { z5.b }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "sdot z20.s, z6.b, z3.b[0]\n" + "add x20, x20, #0x10\n" + "sdot z17.s, z7.b, z2.b[0]\n" + "sdot z24.s, z6.b, z4.b[0]\n" + "sdot z28.s, z6.b, z5.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z21.s, z7.b, z3.b[0]\n" + "sdot z25.s, z7.b, z4.b[0]\n" + "sdot z29.s, z7.b, z5.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[0]\n" + "sdot z14.s, z6.b, z1.b[0]\n" + "sdot z18.s, z6.b, z2.b[0]\n" + "sdot z22.s, z6.b, z3.b[0]\n" + "sdot z26.s, z6.b, z4.b[0]\n" + "sdot z30.s, z6.b, z5.b[0]\n" + "sdot z11.s, z7.b, z0.b[0]\n" + "sdot z15.s, z7.b, z1.b[0]\n" + "sdot z19.s, z7.b, z2.b[0]\n" + "sdot z23.s, z7.b, z3.b[0]\n" + "sdot z27.s, z7.b, z4.b[0]\n" + "sdot z31.s, z7.b, z5.b[0]\n" + "ble 71f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[1]\n" + "sdot z16.s, z6.b, z2.b[1]\n" + "sdot z20.s, z6.b, z3.b[1]\n" + "sdot z24.s, z6.b, z4.b[1]\n" + "sdot z28.s, z6.b, z5.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[1]\n" + "sdot z13.s, z7.b, z1.b[1]\n" + "sdot z17.s, z7.b, z2.b[1]\n" + "sdot z21.s, z7.b, z3.b[1]\n" + "sdot z25.s, z7.b, z4.b[1]\n" + "sdot z29.s, z7.b, z5.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[1]\n" + "sdot z14.s, z6.b, z1.b[1]\n" + "sdot z18.s, z6.b, z2.b[1]\n" + "sdot z22.s, z6.b, z3.b[1]\n" + "sdot z26.s, z6.b, z4.b[1]\n" + "sdot z30.s, z6.b, z5.b[1]\n" + "sdot z11.s, z7.b, z0.b[1]\n" + "sdot z15.s, z7.b, z1.b[1]\n" + "sdot z19.s, z7.b, z2.b[1]\n" + "sdot z23.s, z7.b, z3.b[1]\n" + "sdot z27.s, z7.b, z4.b[1]\n" + "sdot z31.s, z7.b, z5.b[1]\n" + "ble 71f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "sdot z12.s, z6.b, z1.b[2]\n" + "sdot z16.s, z6.b, z2.b[2]\n" + "sdot z20.s, z6.b, z3.b[2]\n" + "sdot z24.s, z6.b, z4.b[2]\n" + 
"sdot z28.s, z6.b, z5.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[2]\n" + "sdot z13.s, z7.b, z1.b[2]\n" + "sdot z17.s, z7.b, z2.b[2]\n" + "sdot z21.s, z7.b, z3.b[2]\n" + "sdot z25.s, z7.b, z4.b[2]\n" + "sdot z29.s, z7.b, z5.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[2]\n" + "sdot z14.s, z6.b, z1.b[2]\n" + "sdot z18.s, z6.b, z2.b[2]\n" + "sdot z22.s, z6.b, z3.b[2]\n" + "sdot z26.s, z6.b, z4.b[2]\n" + "sdot z30.s, z6.b, z5.b[2]\n" + "sdot z11.s, z7.b, z0.b[2]\n" + "sdot z15.s, z7.b, z1.b[2]\n" + "sdot z19.s, z7.b, z2.b[2]\n" + "sdot z23.s, z7.b, z3.b[2]\n" + "sdot z27.s, z7.b, z4.b[2]\n" + "sdot z31.s, z7.b, z5.b[2]\n" + "ble 71f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "sdot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sdot z12.s, z6.b, z1.b[3]\n" + "sdot z16.s, z6.b, z2.b[3]\n" + "sdot z20.s, z6.b, z3.b[3]\n" + "sdot z24.s, z6.b, z4.b[3]\n" + "sdot z28.s, z6.b, z5.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "sdot z9.s, z7.b, z0.b[3]\n" + "sdot z13.s, z7.b, z1.b[3]\n" + "sdot z17.s, z7.b, z2.b[3]\n" + "sdot z21.s, z7.b, z3.b[3]\n" + "sdot z25.s, z7.b, z4.b[3]\n" + "sdot z29.s, z7.b, z5.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "sdot z10.s, z6.b, z0.b[3]\n" + "sdot z14.s, z6.b, z1.b[3]\n" + "sdot z18.s, z6.b, z2.b[3]\n" + "sdot z22.s, z6.b, z3.b[3]\n" + "sdot z26.s, z6.b, z4.b[3]\n" + "sdot z30.s, z6.b, z5.b[3]\n" + "sdot z11.s, z7.b, z0.b[3]\n" + "sdot z15.s, z7.b, z1.b[3]\n" + "sdot z19.s, z7.b, z2.b[3]\n" + "sdot z23.s, z7.b, z3.b[3]\n" + "sdot z27.s, z7.b, z4.b[3]\n" + "sdot z31.s, z7.b, z5.b[3]\n" + "71:" // Height 6: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 66b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1w { z20.s }, p4, [x25]\n" + "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "st1w { z24.s }, p4, [x23]\n" + "st1w { z25.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x23, #3, MUL VL]\n" + "addvl x23, x23, #4\n" + "st1w { z28.s }, p4, [x21]\n" + "st1w { z29.s }, p3, [x21, #1, MUL VL]\n" + "st1w { z30.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z31.s }, p1, [x21, #3, MUL VL]\n" + "addvl x21, x21, #4\n" + "72:" // Height 6: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 63b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 
74f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 73f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "73:" // Update direct input + "mov x19, #0x6\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "74:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp similarity index 66% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp index c325e522d7..964f7cc2c1 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -10,37 +10,43 @@ * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
  */
 #pragma once
-
 #ifdef __ARM_FEATURE_SVE
-#include <cstdint>
 #include "../std_transforms_sve.hpp"
+#define ARGLIST \
+    unsigned int, const unsigned int *, \
+    IndirectInputArg<uint8_t>, \
+    size_t, size_t, \
+    const uint8_t *, \
+    IndirectOutputArg<uint8_t>, \
+    const Requantize32 *, const int32_t *, unsigned int
+
 namespace arm_gemm
 {

 // Actual kernel implementations
-void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+void sve_hybrid_u8qa_dot_4x4VL( ARGLIST );

-class hybrid_u8u32_dot_4VLx4
+class cls_sve_hybrid_u8qa_dot_4x4VL
 {
 public:
     typedef uint8_t operand_type;
-    typedef uint32_t result_type;
+    typedef uint8_t result_type;

-    typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+    typedef void (*kern_type)( ARGLIST );

     /* Kernel blocking parameters */
     static constexpr unsigned int out_height()
@@ -59,16 +65,6 @@ class hybrid_u8u32_dot_4VLx4
     }

     static constexpr bool supports_accumulate()
-    {
-        return true;
-    }
-
-    static constexpr bool supports_bias()
-    {
-        return false;
-    }
-
-    static constexpr bool supports_activation()
     {
         return false;
     }
@@ -76,14 +72,14 @@
     StdTransformsSVE transforms = {};

     // Default to the generic kernel
-    kern_type kernel=sve_hybrid_u8u32_dot_4VLx4;
+    kern_type kernel=sve_hybrid_u8qa_dot_4x4VL;

-    hybrid_u8u32_dot_4VLx4(const CPUInfo *)
+    cls_sve_hybrid_u8qa_dot_4x4VL(const CPUInfo *)
     {
-
     }
 };

 } // namespace arm_gemm
+#undef ARGLIST

 #endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
new file mode 100644
index 0000000000..0a6546b78a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
@@ -0,0 +1,1602 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
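The `u8qa` variant whose source follows fuses the quantized output stage into the GEMM. Alongside the `udot` accumulation it keeps per-row byte sums (the `udot z11.s, z0.b, z15.b` against a register of ones, computed only on the first column strip, since bit 31 of `flags` is set once the fixup has run), then folds in the B zero-point and the per-column bias, scales with `sqrdmulh` plus a rounding right shift (`srshl` by a negative amount), adds the C zero-point, clamps, and narrows to `uint8_t` with `uzp1`. A scalar sketch of that output stage for one accumulator, using the `Requantize32` operand names visible in the asm (`b_offset`, `c_offset`, `minval`, `maxval`, the per-layer multiplier and right shift); the helper names here are illustrative, and the flag-gated `sqadd` sign correction the asm applies before the shift is omitted:

    #include <algorithm>
    #include <cstdint>

    // ~sqrdmulh: rounding doubling multiply-high (saturation case omitted).
    static int32_t rdmulh(int32_t a, int32_t b)
    {
        return int32_t((int64_t(a) * b + (int64_t(1) << 30)) >> 31);
    }

    // ~srshl with a negative shift operand: rounding arithmetic shift right.
    static int32_t rshr(int32_t x, int shift)
    {
        return shift > 0 ? (x + (int32_t(1) << (shift - 1))) >> shift : x;
    }

    uint8_t requantize_one(int32_t acc, int32_t row_sum, int32_t col_bias,
                           int32_t b_offset, int32_t mul, int shift,
                           int32_t c_offset, int32_t minval, int32_t maxval)
    {
        // The row sum folds the B zero-point out of the raw u8 dot products.
        int32_t v = acc - b_offset * row_sum + col_bias;
        v = rshr(rdmulh(v, mul), shift) + c_offset;
        return uint8_t(std::clamp(v, minval, maxval));
    }

Fusing this stage into the kernel is the design point of the `qa` kernels: the int32 accumulators never round-trip through memory, which is why the kernel takes `Requantize32` and `col_bias` directly instead of writing `uint32_t` output like the `u8u32` kernel it replaces.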
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sve_hybrid_u8qa_dot_4x4VL (
+    unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+    size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint8_t> output_arg,
+    const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+    struct KernelArgs {
+        unsigned int num_strings = {};
+        const unsigned int *string_lengths = {};
+        size_t N = {};
+        const uint8_t *B_ptr = {};
+        size_t output_offset = {};
+        size_t input_initial_col = {};
+        size_t input_offset = {};
+    } ka;
+
+    unsigned long flags=0;
+    void *output_ptr;
+    void *input_ptr;
+
+    if (output_arg.is_indirect) {
+        output_ptr=(void *)(output_arg.indirect.ptr);
+        ka.output_offset=output_arg.indirect.offset;
+        flags |= 0x4;
+    } else {
+        output_ptr=(void *)(output_arg.direct.base);
+        ka.output_offset=output_arg.direct.stride;
+    }
+
+    if (A_arg.is_indirect) {
+        input_ptr=(void *)(A_arg.indirect.ptr);
+        ka.input_offset=A_arg.indirect.start_row;
+        ka.input_initial_col=A_arg.indirect.start_col;
+        flags |= 0x8;
+    } else {
+        assert(num_strings==1);
+        input_ptr=(void *)(A_arg.direct.base);
+        ka.input_offset=A_arg.direct.stride;
+    }
+    ka.num_strings = num_strings;
+    ka.string_lengths = string_lengths;
+    ka.N = N;
+    ka.B_ptr = B_ptr;
+    if (qp->c_offset > qp->minval) {
+        flags |= 0x20;
+    }
+    __asm__ __volatile__(
+      "ptrue p2.b\n"
+      "1:" // Row loop
+      "cmp %x[M], #0x4\n"
+      "bge 46f\n"
+      "cmp %x[M], #0x2\n"
+      "bgt 31f\n"
+      "beq 16f\n"
+      "mov z11.s, #0x0\n"
+      "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+      "mov z12.s, #0x0\n"
+      "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+      "mov x10, %x[col_bias]\n"
+      "mov z13.s, #0x0\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+      "bic %x[flags], %x[flags], #0x80000000\n"
+      "mov z14.s, #0x0\n"
+      "mov z15.b, #0x1\n"
+      "tbz %x[flags], #2, 2f\n"
+      "ldr x9, [%x[output_ptr], #0x0]\n"
+      "add x9, x9, x19\n"
+      "b 3f\n"
+      "2:" // Height 1: setup direct output
+      "mov x9, %x[output_ptr]\n"
+      "3:" // Height 1: Column loop
+      "mov z16.s, #0x0\n"
+      "mov x19, #0x0\n"
+      "mov z17.s, #0x0\n"
+      "whilelt p1.b, x19, x12\n"
+      "mov z18.s, #0x0\n"
+      "mov z19.s, #0x0\n"
+      "4:" // Height 1: setup done
+      "mov x28, #0x0\n"
+      "5:" // Height 1: String loop
+      "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+      "ldr w27, [x20, x28, LSL #0x2]\n"
+      "tbz %x[flags], #3, 6f\n"
+      "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+      "add x20, x20, x19, LSL #3\n"
+      "ldr x26, [x20, #0x0]\n"
+      "cbnz x28, 7f\n"
+      "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+      "add x26, x26, x19\n"
+      "b 7f\n"
+      "6:" // Height 1: setup direct input
+      "mov x26, %x[input_ptr]\n"
+      "7:" // Height 1: input setup done
+      "cmp x27, #0x10\n"
+      "ble 10f\n"
+      "8:" // Height 1: Multiply loop: Main loop head
+      "ld1b { z4.b }, p2/Z, [x11]\n"
+      "whilelt p0.b, XZR, x27\n"
+      "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+      "ld1rqb { z0.b }, p0/Z, [x26]\n"
+      "udot z16.s, z4.b, z0.b[0]\n"
+      "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+      "add x26, x26, #0x10\n"
+      "udot z17.s, z5.b, z0.b[0]\n"
+      "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+      "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+      "udot z18.s, z6.b, z0.b[0]\n"
+      "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+      "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+      "udot z19.s, z7.b, z0.b[0]\n"
+      "udot z16.s, z8.b, z0.b[1]\n"
+      "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+      "addvl x11, x11, #16\n"
+      "udot z17.s, z9.b,
z0.b[1]\n" + "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n" + "udot z18.s, z10.b, z0.b[1]\n" + "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n" + "udot z19.s, z4.b, z0.b[1]\n" + "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n" + "udot z16.s, z5.b, z0.b[2]\n" + "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n" + "udot z17.s, z6.b, z0.b[2]\n" + "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n" + "udot z18.s, z7.b, z0.b[2]\n" + "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n" + "udot z19.s, z8.b, z0.b[2]\n" + "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n" + "udot z16.s, z9.b, z0.b[3]\n" + "udot z17.s, z10.b, z0.b[3]\n" + "udot z18.s, z4.b, z0.b[3]\n" + "udot z19.s, z5.b, z0.b[3]\n" + "tbnz %x[flags], #31, 9f\n" + "udot z11.s, z0.b, z15.b\n" + "9:" // Height 1: Multiply loop: unique 1: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x10\n" + "bgt 8b\n" + "10:" // Height 1: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "udot z16.s, z6.b, z0.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x26, x26, #0x10\n" + "udot z17.s, z7.b, z0.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "udot z18.s, z8.b, z0.b[0]\n" + "udot z19.s, z9.b, z0.b[0]\n" + "ble 11f\n" + "ld1b { z10.b }, p2/Z, [x11]\n" + "udot z16.s, z10.b, z0.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "udot z17.s, z4.b, z0.b[1]\n" + "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n" + "udot z18.s, z5.b, z0.b[1]\n" + "addvl x11, x11, #4\n" + "udot z19.s, z6.b, z0.b[1]\n" + "ble 11f\n" + "ld1b { z7.b }, p2/Z, [x11]\n" + "udot z16.s, z7.b, z0.b[2]\n" + "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "udot z17.s, z8.b, z0.b[2]\n" + "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n" + "udot z18.s, z9.b, z0.b[2]\n" + "addvl x11, x11, #4\n" + "udot z19.s, z10.b, z0.b[2]\n" + "ble 11f\n" + "ld1b { z4.b }, p2/Z, [x11]\n" + "udot z16.s, z4.b, z0.b[3]\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "udot z17.s, z5.b, z0.b[3]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "udot z18.s, z6.b, z0.b[3]\n" + "udot z19.s, z7.b, z0.b[3]\n" + "11:" // Height 1: Multiply loop: multiply skip + "tbnz %x[flags], #31, 12f\n" + "udot z11.s, z0.b, z15.b\n" + "12:" // Height 1: Multiply loop: unique 2: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "add x28, x28, #0x1\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x28, x19\n" + "bne 5b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "tbnz %x[flags], #31, 13f\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1rw { z1.s }, p2/Z, [x19]\n" + "neg z1.s, p2/M, z1.s\n" + "mov x19, #0x4\n" + "whilelt p0.s, XZR, x19\n" + "uaddv d11, p0, z11.s\n" + "mov z11.s, z11.s[0]\n" + "mul z11.s, p2/M, z11.s, z1.s\n" + "13:" // Height 1: skip row sum fixup + "add z16.s, z16.s, z11.s\n" + "ld1w { z0.s }, p2/Z, [x10]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z17.s, z17.s, z11.s\n" + "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add z18.s, z18.s, z11.s\n" + "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "add z19.s, z19.s, z11.s\n" + "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "addvl 
x10, x10, #4\n" + "add z16.s, z16.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "add z17.s, z17.s, z1.s\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + "tbz %x[flags], #5, 14f\n" + "and z4.d, z16.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z17.d, z0.d\n" + "and z6.d, z18.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z19.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z17.s, z17.s, z5.s\n" + "sqadd z18.s, z18.s, z6.s\n" + "sqadd z19.s, z19.s, z7.s\n" + "14:" // Height 1: no shift correction + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + "uzp1 z17.h, z18.h, z19.h\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x9]\n" + "addvl x9, x9, #1\n" + "15:" // Height 1: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x12, x12, x19\n" + "bgt 3b\n" + "b 62f\n" + "16:" // Height 2 + "mov z11.s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov x10, %x[col_bias]\n" + "mov z12.s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov z13.s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov z14.s, #0x0\n" + "mov z15.b, #0x1\n" + "tbz %x[flags], #2, 17f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "ldr x25, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "add x25, x25, x19\n" + "b 18f\n" + "17:" // Height 2: setup direct output + "mov x9, %x[output_ptr]\n" + "add x25, x9, x19\n" + "18:" // Height 2: Column loop + "mov z16.s, #0x0\n" + "mov x19, #0x0\n" + "mov z17.s, #0x0\n" + "whilelt p1.b, x19, x12\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "19:" // Height 2: setup done + "mov x28, #0x0\n" + "20:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 21f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "cbnz x28, 22f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "b 22f\n" + "21:" // Height 2: setup direct input + "mov x26, %x[input_ptr]\n" + "add x24, x26, x19\n" + "22:" // Height 2: input setup done + "cmp x27, #0x10\n" + "ble 25f\n" + "23:" // Height 2: Multiply loop: 
Main loop head + "ld1b { z4.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "udot z16.s, z4.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "udot z17.s, z5.b, z0.b[0]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x24, x24, #0x10\n" + "udot z20.s, z4.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "udot z21.s, z5.b, z1.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n" + "udot z18.s, z6.b, z0.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n" + "udot z22.s, z6.b, z1.b[0]\n" + "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n" + "udot z19.s, z7.b, z0.b[0]\n" + "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n" + "addvl x11, x11, #16\n" + "udot z23.s, z7.b, z1.b[0]\n" + "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n" + "udot z16.s, z8.b, z0.b[1]\n" + "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n" + "udot z20.s, z8.b, z1.b[1]\n" + "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n" + "udot z17.s, z9.b, z0.b[1]\n" + "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n" + "udot z21.s, z9.b, z1.b[1]\n" + "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n" + "udot z18.s, z10.b, z0.b[1]\n" + "udot z22.s, z10.b, z1.b[1]\n" + "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n" + "udot z19.s, z4.b, z0.b[1]\n" + "udot z23.s, z4.b, z1.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n" + "udot z16.s, z5.b, z0.b[2]\n" + "udot z20.s, z5.b, z1.b[2]\n" + "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n" + "udot z17.s, z6.b, z0.b[2]\n" + "udot z21.s, z6.b, z1.b[2]\n" + "udot z18.s, z7.b, z0.b[2]\n" + "udot z22.s, z7.b, z1.b[2]\n" + "udot z19.s, z8.b, z0.b[2]\n" + "udot z23.s, z8.b, z1.b[2]\n" + "udot z16.s, z9.b, z0.b[3]\n" + "udot z20.s, z9.b, z1.b[3]\n" + "udot z17.s, z10.b, z0.b[3]\n" + "udot z21.s, z10.b, z1.b[3]\n" + "udot z18.s, z4.b, z0.b[3]\n" + "udot z22.s, z4.b, z1.b[3]\n" + "udot z19.s, z5.b, z0.b[3]\n" + "udot z23.s, z5.b, z1.b[3]\n" + "tbnz %x[flags], #31, 24f\n" + "udot z11.s, z0.b, z15.b\n" + "udot z12.s, z1.b, z15.b\n" + "24:" // Height 2: Multiply loop: unique 3: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x27, #0x10\n" + "bgt 23b\n" + "25:" // Height 2: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "udot z16.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "udot z17.s, z7.b, z0.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x24, x24, #0x10\n" + "udot z20.s, z6.b, z1.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "udot z21.s, z7.b, z1.b[0]\n" + "udot z18.s, z8.b, z0.b[0]\n" + "udot z22.s, z8.b, z1.b[0]\n" + "udot z19.s, z9.b, z0.b[0]\n" + "udot z23.s, z9.b, z1.b[0]\n" + "ble 26f\n" + "ld1b { z10.b }, p2/Z, [x11]\n" + "udot z16.s, z10.b, z0.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "udot z20.s, z10.b, z1.b[1]\n" + "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n" + "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n" + "udot z17.s, z4.b, z0.b[1]\n" + "addvl x11, x11, #4\n" + "udot z21.s, z4.b, z1.b[1]\n" + "udot z18.s, z5.b, z0.b[1]\n" + "udot z22.s, z5.b, z1.b[1]\n" + "udot z19.s, z6.b, z0.b[1]\n" + "udot z23.s, z6.b, z1.b[1]\n" + "ble 26f\n" + "ld1b { z7.b }, p2/Z, [x11]\n" + "udot z16.s, z7.b, z0.b[2]\n" + "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "udot 
z20.s, z7.b, z1.b[2]\n" + "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n" + "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n" + "udot z17.s, z8.b, z0.b[2]\n" + "addvl x11, x11, #4\n" + "udot z21.s, z8.b, z1.b[2]\n" + "udot z18.s, z9.b, z0.b[2]\n" + "udot z22.s, z9.b, z1.b[2]\n" + "udot z19.s, z10.b, z0.b[2]\n" + "udot z23.s, z10.b, z1.b[2]\n" + "ble 26f\n" + "ld1b { z4.b }, p2/Z, [x11]\n" + "udot z16.s, z4.b, z0.b[3]\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "udot z20.s, z4.b, z1.b[3]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "udot z17.s, z5.b, z0.b[3]\n" + "addvl x11, x11, #4\n" + "udot z21.s, z5.b, z1.b[3]\n" + "udot z18.s, z6.b, z0.b[3]\n" + "udot z22.s, z6.b, z1.b[3]\n" + "udot z19.s, z7.b, z0.b[3]\n" + "udot z23.s, z7.b, z1.b[3]\n" + "26:" // Height 2: Multiply loop: multiply skip + "tbnz %x[flags], #31, 27f\n" + "udot z11.s, z0.b, z15.b\n" + "udot z12.s, z1.b, z15.b\n" + "27:" // Height 2: Multiply loop: unique 4: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "add x28, x28, #0x1\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x28, x19\n" + "bne 20b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbnz %x[flags], #31, 28f\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1rw { z2.s }, p2/Z, [x19]\n" + "neg z2.s, p2/M, z2.s\n" + "mov x20, #0x4\n" + "mov x19, #0x4\n" + "whilelt p0.s, XZR, x20\n" + "uaddv d11, p0, z11.s\n" + "whilelt p0.s, XZR, x19\n" + "uaddv d12, p0, z12.s\n" + "mov z11.s, z11.s[0]\n" + "mov z12.s, z12.s[0]\n" + "mul z11.s, p2/M, z11.s, z2.s\n" + "mul z12.s, p2/M, z12.s, z2.s\n" + "28:" // Height 2: skip row sum fixup + "add z16.s, z16.s, z11.s\n" + "ld1w { z0.s }, p2/Z, [x10]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z17.s, z17.s, z11.s\n" + "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add z18.s, z18.s, z11.s\n" + "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "add z19.s, z19.s, z11.s\n" + "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" + "add z20.s, z20.s, z12.s\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "add z21.s, z21.s, z12.s\n" + "add z22.s, z22.s, z12.s\n" + "add z23.s, z23.s, z12.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z20.s, z20.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" + "tbz %x[flags], #5, 29f\n" + "and z4.d, z16.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z17.d, z0.d\n" + "and z6.d, z18.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z19.d, z0.d\n" + "and z8.d, z20.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "and z9.d, z21.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "and z10.d, z22.d, z0.d\n" + "asr z8.s, z8.s, #0x1f\n" + "and z4.d, z23.d, z0.d\n" + "asr z9.s, z9.s, #0x1f\n" + "sqadd z17.s, z17.s, z5.s\n" + "asr z10.s, z10.s, #0x1f\n" + "sqadd z18.s, z18.s, z6.s\n" + "asr z4.s, z4.s, 
#0x1f\n" + "sqadd z19.s, z19.s, z7.s\n" + "sqadd z20.s, z20.s, z8.s\n" + "sqadd z21.s, z21.s, z9.s\n" + "sqadd z22.s, z22.s, z10.s\n" + "sqadd z23.s, z23.s, z4.s\n" + "29:" // Height 2: no shift correction + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x9]\n" + "add z21.s, z21.s, z4.s\n" + "addvl x9, x9, #1\n" + ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" + ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + "add z22.s, z22.s, z4.s\n" + "add z23.s, z23.s, z4.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "uzp1 z20.h, z20.h, z21.h\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "uzp1 z21.h, z22.h, z23.h\n" + "uzp1 z20.b, z20.b, z21.b\n" + "st1b { z20.b }, p1, [x25]\n" + "addvl x25, x25, #1\n" + "30:" // Height 2: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x12, x12, x19\n" + "bgt 18b\n" + "b 62f\n" + "31:" // Height 3 + "mov z11.s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov x10, %x[col_bias]\n" + "mov z12.s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov z13.s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov z14.s, #0x0\n" + "mov z15.b, #0x1\n" + "tbz %x[flags], #2, 32f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "ldr x25, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "ldr x23, [%x[output_ptr], #0x10]\n" + "add x25, x25, x19\n" + "add x23, x23, x19\n" + "b 33f\n" + "32:" // Height 3: setup direct output + "mov x9, %x[output_ptr]\n" + "add x25, x9, x19\n" + "add x23, x25, x19\n" + "33:" // Height 3: Column loop + "mov z16.s, #0x0\n" + "mov x19, #0x0\n" + "mov z17.s, #0x0\n" + "whilelt p1.b, x19, x12\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "34:" // Height 3: setup done + "mov x28, #0x0\n" + "35:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 36f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x22, 
[x20, #0x10]\n" + "cbnz x28, 37f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "b 37f\n" + "36:" // Height 3: setup direct input + "mov x26, %x[input_ptr]\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "37:" // Height 3: input setup done + "cmp x27, #0x10\n" + "ble 40f\n" + "38:" // Height 3: Multiply loop: Main loop head + "ld1b { z4.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "udot z16.s, z4.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "udot z17.s, z5.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "udot z20.s, z4.b, z1.b[0]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x22, x22, #0x10\n" + "udot z24.s, z4.b, z2.b[0]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "udot z21.s, z5.b, z1.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n" + "udot z25.s, z5.b, z2.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n" + "udot z18.s, z6.b, z0.b[0]\n" + "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n" + "udot z22.s, z6.b, z1.b[0]\n" + "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n" + "addvl x11, x11, #16\n" + "udot z26.s, z6.b, z2.b[0]\n" + "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n" + "udot z19.s, z7.b, z0.b[0]\n" + "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n" + "udot z23.s, z7.b, z1.b[0]\n" + "udot z27.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n" + "udot z16.s, z8.b, z0.b[1]\n" + "udot z20.s, z8.b, z1.b[1]\n" + "udot z24.s, z8.b, z2.b[1]\n" + "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n" + "udot z17.s, z9.b, z0.b[1]\n" + "udot z21.s, z9.b, z1.b[1]\n" + "udot z25.s, z9.b, z2.b[1]\n" + "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n" + "udot z18.s, z10.b, z0.b[1]\n" + "udot z22.s, z10.b, z1.b[1]\n" + "udot z26.s, z10.b, z2.b[1]\n" + "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n" + "udot z19.s, z4.b, z0.b[1]\n" + "udot z23.s, z4.b, z1.b[1]\n" + "udot z27.s, z4.b, z2.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n" + "udot z16.s, z5.b, z0.b[2]\n" + "udot z20.s, z5.b, z1.b[2]\n" + "udot z24.s, z5.b, z2.b[2]\n" + "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n" + "udot z17.s, z6.b, z0.b[2]\n" + "udot z21.s, z6.b, z1.b[2]\n" + "udot z25.s, z6.b, z2.b[2]\n" + "udot z18.s, z7.b, z0.b[2]\n" + "udot z22.s, z7.b, z1.b[2]\n" + "udot z26.s, z7.b, z2.b[2]\n" + "udot z19.s, z8.b, z0.b[2]\n" + "udot z23.s, z8.b, z1.b[2]\n" + "udot z27.s, z8.b, z2.b[2]\n" + "udot z16.s, z9.b, z0.b[3]\n" + "udot z20.s, z9.b, z1.b[3]\n" + "udot z24.s, z9.b, z2.b[3]\n" + "udot z17.s, z10.b, z0.b[3]\n" + "udot z21.s, z10.b, z1.b[3]\n" + "udot z25.s, z10.b, z2.b[3]\n" + "udot z18.s, z4.b, z0.b[3]\n" + "udot z22.s, z4.b, z1.b[3]\n" + "udot z26.s, z4.b, z2.b[3]\n" + "udot z19.s, z5.b, z0.b[3]\n" + "udot z23.s, z5.b, z1.b[3]\n" + "udot z27.s, z5.b, z2.b[3]\n" + "tbnz %x[flags], #31, 39f\n" + "udot z11.s, z0.b, z15.b\n" + "udot z12.s, z1.b, z15.b\n" + "udot z13.s, z2.b, z15.b\n" + "39:" // Height 3: Multiply loop: unique 5: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x27, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "bgt 38b\n" + "40:" // Height 3: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "udot z16.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, 
[x24]\n" + "add x26, x26, #0x10\n" + "udot z17.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "udot z20.s, z6.b, z1.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x22, x22, #0x10\n" + "udot z24.s, z6.b, z2.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "udot z21.s, z7.b, z1.b[0]\n" + "udot z25.s, z7.b, z2.b[0]\n" + "udot z18.s, z8.b, z0.b[0]\n" + "udot z22.s, z8.b, z1.b[0]\n" + "udot z26.s, z8.b, z2.b[0]\n" + "udot z19.s, z9.b, z0.b[0]\n" + "udot z23.s, z9.b, z1.b[0]\n" + "udot z27.s, z9.b, z2.b[0]\n" + "ble 41f\n" + "ld1b { z10.b }, p2/Z, [x11]\n" + "udot z16.s, z10.b, z0.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "udot z20.s, z10.b, z1.b[1]\n" + "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n" + "udot z24.s, z10.b, z2.b[1]\n" + "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "udot z17.s, z4.b, z0.b[1]\n" + "udot z21.s, z4.b, z1.b[1]\n" + "udot z25.s, z4.b, z2.b[1]\n" + "udot z18.s, z5.b, z0.b[1]\n" + "udot z22.s, z5.b, z1.b[1]\n" + "udot z26.s, z5.b, z2.b[1]\n" + "udot z19.s, z6.b, z0.b[1]\n" + "udot z23.s, z6.b, z1.b[1]\n" + "udot z27.s, z6.b, z2.b[1]\n" + "ble 41f\n" + "ld1b { z7.b }, p2/Z, [x11]\n" + "udot z16.s, z7.b, z0.b[2]\n" + "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "udot z20.s, z7.b, z1.b[2]\n" + "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n" + "udot z24.s, z7.b, z2.b[2]\n" + "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "udot z17.s, z8.b, z0.b[2]\n" + "udot z21.s, z8.b, z1.b[2]\n" + "udot z25.s, z8.b, z2.b[2]\n" + "udot z18.s, z9.b, z0.b[2]\n" + "udot z22.s, z9.b, z1.b[2]\n" + "udot z26.s, z9.b, z2.b[2]\n" + "udot z19.s, z10.b, z0.b[2]\n" + "udot z23.s, z10.b, z1.b[2]\n" + "udot z27.s, z10.b, z2.b[2]\n" + "ble 41f\n" + "ld1b { z4.b }, p2/Z, [x11]\n" + "udot z16.s, z4.b, z0.b[3]\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "udot z20.s, z4.b, z1.b[3]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "udot z24.s, z4.b, z2.b[3]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "udot z17.s, z5.b, z0.b[3]\n" + "udot z21.s, z5.b, z1.b[3]\n" + "udot z25.s, z5.b, z2.b[3]\n" + "udot z18.s, z6.b, z0.b[3]\n" + "udot z22.s, z6.b, z1.b[3]\n" + "udot z26.s, z6.b, z2.b[3]\n" + "udot z19.s, z7.b, z0.b[3]\n" + "udot z23.s, z7.b, z1.b[3]\n" + "udot z27.s, z7.b, z2.b[3]\n" + "41:" // Height 3: Multiply loop: multiply skip + "tbnz %x[flags], #31, 42f\n" + "udot z11.s, z0.b, z15.b\n" + "udot z12.s, z1.b, z15.b\n" + "udot z13.s, z2.b, z15.b\n" + "42:" // Height 3: Multiply loop: unique 6: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "add x28, x28, #0x1\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x28, x19\n" + "bne 35b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbnz %x[flags], #31, 43f\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1rw { z3.s }, p2/Z, [x19]\n" + "neg z3.s, p2/M, z3.s\n" + "mov x20, #0x4\n" + "mov x19, #0x4\n" + "whilelt p0.s, XZR, x20\n" + "uaddv d11, p0, z11.s\n" + "whilelt p0.s, XZR, x19\n" + "uaddv d12, p0, z12.s\n" + "mov x19, #0x4\n" + "mov z11.s, z11.s[0]\n" + "whilelt p0.s, XZR, x19\n" + "mov z12.s, z12.s[0]\n" + "uaddv d13, p0, z13.s\n" + "mul z11.s, p2/M, z11.s, z3.s\n" + "mul z12.s, p2/M, z12.s, z3.s\n" + "mov z13.s, z13.s[0]\n" + "mul z13.s, p2/M, z13.s, z3.s\n" + "43:" // Height 3: skip row sum fixup + 
"add z16.s, z16.s, z11.s\n" + "ld1w { z0.s }, p2/Z, [x10]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z17.s, z17.s, z11.s\n" + "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add z18.s, z18.s, z11.s\n" + "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "add z19.s, z19.s, z11.s\n" + "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" + "add z20.s, z20.s, z12.s\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "add z21.s, z21.s, z12.s\n" + "add z22.s, z22.s, z12.s\n" + "add z23.s, z23.s, z12.s\n" + "add z24.s, z24.s, z13.s\n" + "add z25.s, z25.s, z13.s\n" + "add z26.s, z26.s, z13.s\n" + "add z27.s, z27.s, z13.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z20.s, z20.s, z0.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n" + ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n" + ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n" + "tbz %x[flags], #5, 44f\n" + "and z4.d, z16.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z17.d, z0.d\n" + "and z6.d, z18.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z19.d, z0.d\n" + "and z8.d, z20.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "and z9.d, z21.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "and z10.d, z22.d, z0.d\n" + "asr z8.s, z8.s, #0x1f\n" + "and z4.d, z23.d, z0.d\n" + "asr z9.s, z9.s, #0x1f\n" + "sqadd z17.s, z17.s, z5.s\n" + "asr z10.s, z10.s, #0x1f\n" + "sqadd z18.s, z18.s, z6.s\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z24.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z19.s, z19.s, z7.s\n" + "sqadd z20.s, z20.s, z8.s\n" + "sqadd z21.s, z21.s, z9.s\n" + "sqadd z22.s, z22.s, z10.s\n" + "sqadd z23.s, z23.s, z4.s\n" + "and z6.d, z25.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z24.s, z24.s, z5.s\n" + "and z7.d, z26.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "and z8.d, z27.d, z0.d\n" + "sqadd z25.s, z25.s, z6.s\n" + "asr z8.s, z8.s, #0x1f\n" + "sqadd z26.s, z26.s, z7.s\n" + "sqadd z27.s, z27.s, z8.s\n" + "44:" // Height 3: no shift correction + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, 
z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x9]\n" + "add z21.s, z21.s, z4.s\n" + "addvl x9, x9, #1\n" + ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" + ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" + ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" + "add z22.s, z22.s, z4.s\n" + "add z23.s, z23.s, z4.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "uzp1 z20.h, z20.h, z21.h\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" + "uzp1 z21.h, z22.h, z23.h\n" + ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" + "uzp1 z20.b, z20.b, z21.b\n" + "st1b { z20.b }, p1, [x25]\n" + "add z26.s, z26.s, z4.s\n" + "addvl x25, x25, #1\n" + "add z27.s, z27.s, z4.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "uzp1 z24.h, z24.h, z25.h\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" + "uzp1 z25.h, z26.h, z27.h\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z24.b }, p1, [x23]\n" + "addvl x23, x23, #1\n" + "45:" // Height 3: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x12, x12, x19\n" + "bgt 33b\n" + "b 62f\n" + "46:" // Height 4 + "mov z11.s, #0x0\n" + "ldr x12, [%x[args_ptr], %[offsetof_N]]\n" + "mov x10, %x[col_bias]\n" + "mov z12.s, #0x0\n" + "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "bic %x[flags], %x[flags], #0x80000000\n" + "mov z13.s, #0x0\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "mov z14.s, #0x0\n" + "mov z15.b, #0x1\n" + "tbz %x[flags], #2, 47f\n" + "ldr x9, [%x[output_ptr], #0x0]\n" + "ldr x25, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19\n" + "ldr x23, [%x[output_ptr], #0x10]\n" + "ldr x21, [%x[output_ptr], #0x18]\n" + "add x25, x25, x19\n" + "add %x[output_ptr], %x[output_ptr], #0x20\n" + "add x23, x23, x19\n" + "add x21, x21, x19\n" + "b 48f\n" + "47:" // Height 4: setup direct output + "mov x9, %x[output_ptr]\n" + "add x25, x9, x19\n" + "add x23, x25, x19\n" + "add x21, x23, x19\n" + "add %x[output_ptr], x21, x19\n" + "48:" // Height 4: Column loop + "mov z16.s, #0x0\n" + "mov x19, #0x0\n" + "mov z17.s, #0x0\n" + "whilelt p1.b, x19, x12\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z28.s, #0x0\n" + "mov z29.s, #0x0\n" + "mov z30.s, #0x0\n" + "mov z31.s, #0x0\n" + "49:" // Height 4: setup done + "mov x28, #0x0\n" + "50:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 51f\n" 
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x24, [x20, #0x8]\n" + "ldr x22, [x20, #0x10]\n" + "ldr x20, [x20, #0x18]\n" + "cbnz x28, 52f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 52f\n" + "51:" // Height 4: setup direct input + "mov x26, %x[input_ptr]\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "add x20, x22, x19\n" + "52:" // Height 4: input setup done + "cmp x27, #0x10\n" + "ble 55f\n" + "53:" // Height 4: Multiply loop: Main loop head + "ld1b { z4.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "udot z16.s, z4.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "udot z17.s, z5.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "udot z20.s, z4.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "udot z24.s, z4.b, z2.b[0]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "udot z21.s, z5.b, z1.b[0]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "udot z25.s, z5.b, z2.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n" + "udot z28.s, z4.b, z3.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n" + "udot z29.s, z5.b, z3.b[0]\n" + "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n" + "udot z18.s, z6.b, z0.b[0]\n" + "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n" + "addvl x11, x11, #16\n" + "udot z22.s, z6.b, z1.b[0]\n" + "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n" + "udot z26.s, z6.b, z2.b[0]\n" + "udot z30.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n" + "udot z19.s, z7.b, z0.b[0]\n" + "udot z23.s, z7.b, z1.b[0]\n" + "udot z27.s, z7.b, z2.b[0]\n" + "udot z31.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n" + "udot z16.s, z8.b, z0.b[1]\n" + "udot z20.s, z8.b, z1.b[1]\n" + "udot z24.s, z8.b, z2.b[1]\n" + "udot z28.s, z8.b, z3.b[1]\n" + "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n" + "udot z17.s, z9.b, z0.b[1]\n" + "udot z21.s, z9.b, z1.b[1]\n" + "udot z25.s, z9.b, z2.b[1]\n" + "udot z29.s, z9.b, z3.b[1]\n" + "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n" + "udot z18.s, z10.b, z0.b[1]\n" + "udot z22.s, z10.b, z1.b[1]\n" + "udot z26.s, z10.b, z2.b[1]\n" + "udot z30.s, z10.b, z3.b[1]\n" + "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n" + "udot z19.s, z4.b, z0.b[1]\n" + "udot z23.s, z4.b, z1.b[1]\n" + "udot z27.s, z4.b, z2.b[1]\n" + "udot z31.s, z4.b, z3.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n" + "udot z16.s, z5.b, z0.b[2]\n" + "udot z20.s, z5.b, z1.b[2]\n" + "udot z24.s, z5.b, z2.b[2]\n" + "udot z28.s, z5.b, z3.b[2]\n" + "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n" + "udot z17.s, z6.b, z0.b[2]\n" + "udot z21.s, z6.b, z1.b[2]\n" + "udot z25.s, z6.b, z2.b[2]\n" + "udot z29.s, z6.b, z3.b[2]\n" + "udot z18.s, z7.b, z0.b[2]\n" + "udot z22.s, z7.b, z1.b[2]\n" + "udot z26.s, z7.b, z2.b[2]\n" + "udot z30.s, z7.b, z3.b[2]\n" + "udot z19.s, z8.b, z0.b[2]\n" + "udot z23.s, z8.b, z1.b[2]\n" + "udot z27.s, z8.b, z2.b[2]\n" + "udot z31.s, z8.b, z3.b[2]\n" + "udot z16.s, z9.b, z0.b[3]\n" + "udot z20.s, z9.b, z1.b[3]\n" + "udot z24.s, z9.b, z2.b[3]\n" + "udot z28.s, z9.b, z3.b[3]\n" + "udot z17.s, z10.b, z0.b[3]\n" + "udot z21.s, z10.b, z1.b[3]\n" + "udot z25.s, z10.b, z2.b[3]\n" + "udot z29.s, z10.b, z3.b[3]\n" + "udot z18.s, z4.b, z0.b[3]\n" + "udot z22.s, z4.b, z1.b[3]\n" + "udot z26.s, z4.b, z2.b[3]\n" 
+ "udot z30.s, z4.b, z3.b[3]\n" + "udot z19.s, z5.b, z0.b[3]\n" + "udot z23.s, z5.b, z1.b[3]\n" + "udot z27.s, z5.b, z2.b[3]\n" + "udot z31.s, z5.b, z3.b[3]\n" + "tbnz %x[flags], #31, 54f\n" + "udot z11.s, z0.b, z15.b\n" + "udot z12.s, z1.b, z15.b\n" + "udot z13.s, z2.b, z15.b\n" + "udot z14.s, z3.b, z15.b\n" + "54:" // Height 4: Multiply loop: unique 7: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "cmp x27, #0x10\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "bgt 53b\n" + "55:" // Height 4: Multiply loop: Single iteration only + "ld1b { z6.b }, p2/Z, [x11]\n" + "whilelt p0.b, XZR, x27\n" + "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x26]\n" + "udot z16.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "udot z17.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "udot z20.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "udot z24.s, z6.b, z2.b[0]\n" + "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n" + "add x20, x20, #0x10\n" + "udot z21.s, z7.b, z1.b[0]\n" + "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "udot z28.s, z6.b, z3.b[0]\n" + "udot z25.s, z7.b, z2.b[0]\n" + "udot z29.s, z7.b, z3.b[0]\n" + "udot z18.s, z8.b, z0.b[0]\n" + "udot z22.s, z8.b, z1.b[0]\n" + "udot z26.s, z8.b, z2.b[0]\n" + "udot z30.s, z8.b, z3.b[0]\n" + "udot z19.s, z9.b, z0.b[0]\n" + "udot z23.s, z9.b, z1.b[0]\n" + "udot z27.s, z9.b, z2.b[0]\n" + "udot z31.s, z9.b, z3.b[0]\n" + "ble 56f\n" + "ld1b { z10.b }, p2/Z, [x11]\n" + "udot z16.s, z10.b, z0.b[1]\n" + "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "udot z20.s, z10.b, z1.b[1]\n" + "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n" + "udot z24.s, z10.b, z2.b[1]\n" + "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "udot z28.s, z10.b, z3.b[1]\n" + "udot z17.s, z4.b, z0.b[1]\n" + "udot z21.s, z4.b, z1.b[1]\n" + "udot z25.s, z4.b, z2.b[1]\n" + "udot z29.s, z4.b, z3.b[1]\n" + "udot z18.s, z5.b, z0.b[1]\n" + "udot z22.s, z5.b, z1.b[1]\n" + "udot z26.s, z5.b, z2.b[1]\n" + "udot z30.s, z5.b, z3.b[1]\n" + "udot z19.s, z6.b, z0.b[1]\n" + "udot z23.s, z6.b, z1.b[1]\n" + "udot z27.s, z6.b, z2.b[1]\n" + "udot z31.s, z6.b, z3.b[1]\n" + "ble 56f\n" + "ld1b { z7.b }, p2/Z, [x11]\n" + "udot z16.s, z7.b, z0.b[2]\n" + "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n" + "subs x27, x27, #0x4\n" + "udot z20.s, z7.b, z1.b[2]\n" + "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n" + "udot z24.s, z7.b, z2.b[2]\n" + "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "udot z28.s, z7.b, z3.b[2]\n" + "udot z17.s, z8.b, z0.b[2]\n" + "udot z21.s, z8.b, z1.b[2]\n" + "udot z25.s, z8.b, z2.b[2]\n" + "udot z29.s, z8.b, z3.b[2]\n" + "udot z18.s, z9.b, z0.b[2]\n" + "udot z22.s, z9.b, z1.b[2]\n" + "udot z26.s, z9.b, z2.b[2]\n" + "udot z30.s, z9.b, z3.b[2]\n" + "udot z19.s, z10.b, z0.b[2]\n" + "udot z23.s, z10.b, z1.b[2]\n" + "udot z27.s, z10.b, z2.b[2]\n" + "udot z31.s, z10.b, z3.b[2]\n" + "ble 56f\n" + "ld1b { z4.b }, p2/Z, [x11]\n" + "udot z16.s, z4.b, z0.b[3]\n" + "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n" + "udot z20.s, z4.b, z1.b[3]\n" + "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n" + "udot z24.s, z4.b, z2.b[3]\n" + "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n" + "addvl x11, x11, #4\n" + "udot z28.s, z4.b, z3.b[3]\n" + "udot z17.s, z5.b, z0.b[3]\n" + "udot z21.s, z5.b, z1.b[3]\n" + "udot z25.s, z5.b, 
z2.b[3]\n" + "udot z29.s, z5.b, z3.b[3]\n" + "udot z18.s, z6.b, z0.b[3]\n" + "udot z22.s, z6.b, z1.b[3]\n" + "udot z26.s, z6.b, z2.b[3]\n" + "udot z30.s, z6.b, z3.b[3]\n" + "udot z19.s, z7.b, z0.b[3]\n" + "udot z23.s, z7.b, z1.b[3]\n" + "udot z27.s, z7.b, z2.b[3]\n" + "udot z31.s, z7.b, z3.b[3]\n" + "56:" // Height 4: Multiply loop: multiply skip + "tbnz %x[flags], #31, 57f\n" + "udot z11.s, z0.b, z15.b\n" + "udot z12.s, z1.b, z15.b\n" + "udot z13.s, z2.b, z15.b\n" + "udot z14.s, z3.b, z15.b\n" + "57:" // Height 4: Multiply loop: unique 8: skip row sum + "prfm pldl1keep, [x26, #0x80]\n" + "add x28, x28, #0x1\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x28, x19\n" + "bne 50b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "tbnz %x[flags], #31, 58f\n" + "add x19, %x[qp], %[b_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "neg z4.s, p2/M, z4.s\n" + "mov x20, #0x4\n" + "mov x19, #0x4\n" + "whilelt p0.s, XZR, x20\n" + "uaddv d11, p0, z11.s\n" + "whilelt p0.s, XZR, x19\n" + "uaddv d12, p0, z12.s\n" + "mov x19, #0x4\n" + "mov z11.s, z11.s[0]\n" + "whilelt p0.s, XZR, x19\n" + "mov x19, #0x4\n" + "mov z12.s, z12.s[0]\n" + "uaddv d13, p0, z13.s\n" + "whilelt p0.s, XZR, x19\n" + "mul z11.s, p2/M, z11.s, z4.s\n" + "uaddv d14, p0, z14.s\n" + "mul z12.s, p2/M, z12.s, z4.s\n" + "mov z13.s, z13.s[0]\n" + "mul z13.s, p2/M, z13.s, z4.s\n" + "mov z14.s, z14.s[0]\n" + "mul z14.s, p2/M, z14.s, z4.s\n" + "58:" // Height 4: skip row sum fixup + "add z16.s, z16.s, z11.s\n" + "ld1w { z0.s }, p2/Z, [x10]\n" + "orr %x[flags], %x[flags], #0x80000000\n" + "add z17.s, z17.s, z11.s\n" + "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n" + "add x20, %x[qp], %[per_layer_right_shift]\n" + "add z18.s, z18.s, z11.s\n" + "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n" + "add x19, %x[qp], %[per_layer_mul]\n" + "add z19.s, z19.s, z11.s\n" + "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n" + "addvl x10, x10, #4\n" + "add z20.s, z20.s, z12.s\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + "add z21.s, z21.s, z12.s\n" + "add z22.s, z22.s, z12.s\n" + "add z23.s, z23.s, z12.s\n" + "add z24.s, z24.s, z13.s\n" + "add z25.s, z25.s, z13.s\n" + "add z26.s, z26.s, z13.s\n" + "add z27.s, z27.s, z13.s\n" + "add z28.s, z28.s, z14.s\n" + "add z29.s, z29.s, z14.s\n" + "add z30.s, z30.s, z14.s\n" + "add z31.s, z31.s, z14.s\n" + "add z16.s, z16.s, z0.s\n" + "add z17.s, z17.s, z1.s\n" + "add z18.s, z18.s, z2.s\n" + "add z19.s, z19.s, z3.s\n" + "add z20.s, z20.s, z0.s\n" + "add z21.s, z21.s, z1.s\n" + "add z22.s, z22.s, z2.s\n" + "add z23.s, z23.s, z3.s\n" + "add z24.s, z24.s, z0.s\n" + "add z25.s, z25.s, z1.s\n" + "add z26.s, z26.s, z2.s\n" + "add z27.s, z27.s, z3.s\n" + "add z28.s, z28.s, z0.s\n" + "ld1rw { z0.s }, p2/Z, [x20]\n" + "add z29.s, z29.s, z1.s\n" + "add z30.s, z30.s, z2.s\n" + "add z31.s, z31.s, z3.s\n" + ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n" + ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n" + ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n" + ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n" + ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n" + ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n" + ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n" + ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n" + ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n" + ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n" + ".inst 
0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n" + ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n" + ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n" + ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n" + ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n" + ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n" + "tbz %x[flags], #5, 59f\n" + "and z4.d, z16.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z17.d, z0.d\n" + "and z6.d, z18.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "and z7.d, z19.d, z0.d\n" + "and z8.d, z20.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "and z9.d, z21.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "sqadd z16.s, z16.s, z4.s\n" + "and z10.d, z22.d, z0.d\n" + "asr z8.s, z8.s, #0x1f\n" + "and z4.d, z23.d, z0.d\n" + "asr z9.s, z9.s, #0x1f\n" + "sqadd z17.s, z17.s, z5.s\n" + "asr z10.s, z10.s, #0x1f\n" + "sqadd z18.s, z18.s, z6.s\n" + "asr z4.s, z4.s, #0x1f\n" + "and z5.d, z24.d, z0.d\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z19.s, z19.s, z7.s\n" + "sqadd z20.s, z20.s, z8.s\n" + "sqadd z21.s, z21.s, z9.s\n" + "sqadd z22.s, z22.s, z10.s\n" + "sqadd z23.s, z23.s, z4.s\n" + "and z6.d, z25.d, z0.d\n" + "asr z6.s, z6.s, #0x1f\n" + "sqadd z24.s, z24.s, z5.s\n" + "and z7.d, z26.d, z0.d\n" + "asr z7.s, z7.s, #0x1f\n" + "and z8.d, z27.d, z0.d\n" + "and z9.d, z28.d, z0.d\n" + "asr z8.s, z8.s, #0x1f\n" + "sqadd z25.s, z25.s, z6.s\n" + "and z10.d, z29.d, z0.d\n" + "asr z9.s, z9.s, #0x1f\n" + "and z4.d, z30.d, z0.d\n" + "asr z10.s, z10.s, #0x1f\n" + "sqadd z26.s, z26.s, z7.s\n" + "and z5.d, z31.d, z0.d\n" + "asr z4.s, z4.s, #0x1f\n" + "sqadd z27.s, z27.s, z8.s\n" + "asr z5.s, z5.s, #0x1f\n" + "sqadd z28.s, z28.s, z9.s\n" + "sqadd z29.s, z29.s, z10.s\n" + "sqadd z30.s, z30.s, z4.s\n" + "sqadd z31.s, z31.s, z5.s\n" + "59:" // Height 4: no shift correction + ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n" + "add x19, %x[qp], %[c_offset]\n" + "ld1rw { z4.s }, p2/Z, [x19]\n" + ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n" + "add x19, %x[qp], %[minval]\n" + ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n" + "ld1rw { z5.s }, p2/Z, [x19]\n" + "add x19, %x[qp], %[maxval]\n" + ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n" + "ld1rw { z6.s }, p2/Z, [x19]\n" + ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n" + "add z16.s, z16.s, z4.s\n" + "add z17.s, z17.s, z4.s\n" + "add z18.s, z18.s, z4.s\n" + "add z19.s, z19.s, z4.s\n" + "add z20.s, z20.s, z4.s\n" + "smin z16.s, p2/M, z16.s, z6.s\n" + "smin z17.s, p2/M, z17.s, z6.s\n" + "smin z18.s, p2/M, z18.s, z6.s\n" + "smin z19.s, p2/M, z19.s, z6.s\n" + "smax z16.s, p2/M, z16.s, z5.s\n" + "smax z17.s, p2/M, z17.s, z5.s\n" + "smax z18.s, p2/M, z18.s, z5.s\n" + "smax z19.s, p2/M, z19.s, z5.s\n" + "smin z20.s, p2/M, z20.s, z6.s\n" + "uzp1 z16.h, z16.h, z17.h\n" + ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n" + "uzp1 z17.h, z18.h, z19.h\n" + "smax z20.s, p2/M, z20.s, z5.s\n" + "uzp1 z16.b, z16.b, z17.b\n" + "st1b { z16.b }, p1, [x9]\n" + "add z21.s, z21.s, z4.s\n" + "addvl x9, x9, #1\n" + ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n" + ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n" + ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n" + "smin z21.s, p2/M, z21.s, z6.s\n" + ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n" + "add z22.s, z22.s, z4.s\n" + "add z23.s, z23.s, z4.s\n" + "add z24.s, z24.s, z4.s\n" + "add z25.s, z25.s, z4.s\n" + "smax z21.s, p2/M, z21.s, z5.s\n" + "smin z22.s, p2/M, z22.s, z6.s\n" + "smin z23.s, p2/M, z23.s, z6.s\n" + "smin z24.s, p2/M, z24.s, z6.s\n" + "uzp1 z20.h, 
z20.h, z21.h\n" + "smax z22.s, p2/M, z22.s, z5.s\n" + "smax z23.s, p2/M, z23.s, z5.s\n" + "smax z24.s, p2/M, z24.s, z5.s\n" + "smin z25.s, p2/M, z25.s, z6.s\n" + ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n" + "uzp1 z21.h, z22.h, z23.h\n" + ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n" + "uzp1 z20.b, z20.b, z21.b\n" + "st1b { z20.b }, p1, [x25]\n" + "add z26.s, z26.s, z4.s\n" + "addvl x25, x25, #1\n" + "add z27.s, z27.s, z4.s\n" + "smax z25.s, p2/M, z25.s, z5.s\n" + ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n" + "smin z26.s, p2/M, z26.s, z6.s\n" + "smin z27.s, p2/M, z27.s, z6.s\n" + "uzp1 z24.h, z24.h, z25.h\n" + "add z28.s, z28.s, z4.s\n" + "smax z26.s, p2/M, z26.s, z5.s\n" + "smax z27.s, p2/M, z27.s, z5.s\n" + "smin z28.s, p2/M, z28.s, z6.s\n" + ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n" + ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n" + "uzp1 z25.h, z26.h, z27.h\n" + "smax z28.s, p2/M, z28.s, z5.s\n" + "add z29.s, z29.s, z4.s\n" + "add z30.s, z30.s, z4.s\n" + "uzp1 z24.b, z24.b, z25.b\n" + "st1b { z24.b }, p1, [x23]\n" + "smin z29.s, p2/M, z29.s, z6.s\n" + "addvl x23, x23, #1\n" + "smin z30.s, p2/M, z30.s, z6.s\n" + ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n" + "smax z29.s, p2/M, z29.s, z5.s\n" + "add z31.s, z31.s, z4.s\n" + "smax z30.s, p2/M, z30.s, z5.s\n" + "uzp1 z28.h, z28.h, z29.h\n" + "smin z31.s, p2/M, z31.s, z6.s\n" + "smax z31.s, p2/M, z31.s, z5.s\n" + "uzp1 z29.h, z30.h, z31.h\n" + "uzp1 z28.b, z28.b, z29.b\n" + "st1b { z28.b }, p1, [x21]\n" + "addvl x21, x21, #1\n" + "60:" // Height 4: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x12, x12, x19\n" + "bgt 48b\n" + "subs %x[M], %x[M], #0x4\n" + "beq 62f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 61f\n" + "add x20, x20, #0x4\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "61:" // Update direct input + "mov x19, #0x4\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "62:" // Exit + + : [M] "+r" (M), [flags] "+r" (flags), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp) + : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp 
b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp deleted file mode 100644 index 565832e8de..0000000000 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp +++ /dev/null @@ -1,2137 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef __ARM_FEATURE_SVE - -#include <algorithm> - -#include "arm_gemm.hpp" -#include <cstdint> -#include "../../asmlib.hpp" -#include "../../utils.hpp" - -namespace arm_gemm { - -void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool accumulate) { - const int K_stride = ((K + 3) / 4) * 4; - const long loops_count = ((K + 16) / 32) - 1; - K -= loops_count * 32; - const long regs_count = (K / 16) - 1; - K -= (regs_count + 1) * 16; - const long leftovers = K; - const long blocks_count = (K + 3) / 4; - - int rows_to_compute; - - for (int y=0; y<M; y+=rows_to_compute) { - const uint8_t * const a_ptr0_base = A + (y * lda); - const unsigned long ldab = lda * sizeof(uint8_t); - - uint32_t *c_ptr0 = C + (y * ldc); - - rows_to_compute = M-y; - if (rows_to_compute > 4) { - if (rows_to_compute % 4) { - rows_to_compute = 4 - 1; - } else { - rows_to_compute = 4; - } - } - - for (int x0=0; x0<N; x0+=(4 * get_vector_length<uint32_t>())) { - const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<uint32_t>())); - long loops = loops_count; - long regs = regs_count; - long temp = 0; - long blocks = blocks_count; - const uint8_t *a_ptr0 = a_ptr0_base; - const uint8_t *b_ptr0 = B + (K_stride * x0); - const unsigned long ldcb = ldc * sizeof(uint32_t); - - switch(rows_to_compute) { - case 1: - __asm __volatile ( - "whilelt p6.b, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.b\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z16.s, #0\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "mov z17.s, #0\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "mov z18.s, #0\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z19.s, #0\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w
z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "subs %[loops], %[loops], #0x1\n" - "udot z19.s, z11.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "udot z16.s, z12.b, z0.b[3]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "udot z17.s, z13.b, z0.b[3]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z8.b, z4.b[0]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "udot z17.s, z9.b, z4.b[0]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "udot z18.s, z10.b, z4.b[0]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "udot z19.s, z11.b, z4.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z12.b, z4.b[1]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" - "udot z17.s, z13.b, z4.b[1]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z18.s, z14.b, z4.b[1]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z19.s, z15.b, z4.b[1]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z8.b, z4.b[2]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z9.b, z4.b[2]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z10.b, z4.b[2]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z11.b, z4.b[2]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z12.b, z4.b[3]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "udot z17.s, z13.b, z4.b[3]\n" - "udot z18.s, z14.b, z4.b[3]\n" - "udot z19.s, z15.b, z4.b[3]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - 
"udot z18.s, z10.b, z0.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z8.b, z4.b[0]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "udot z17.s, z9.b, z4.b[0]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "udot z18.s, z10.b, z4.b[0]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "udot z19.s, z11.b, z4.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z12.b, z4.b[1]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z17.s, z13.b, z4.b[1]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z14.b, z4.b[1]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z15.b, z4.b[1]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z8.b, z4.b[2]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z9.b, z4.b[2]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z10.b, z4.b[2]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z11.b, z4.b[2]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z12.b, z4.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" - "udot z17.s, z13.b, z4.b[3]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "udot z18.s, z14.b, z4.b[3]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "udot z19.s, z15.b, z4.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, 
p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "b 5f\n" - "4:\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "udot z19.s, z11.b, z0.b[2]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z16.s, z8.b, z4.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z17.s, z9.b, z4.b[0]\n" - "udot z18.s, z10.b, z4.b[0]\n" - "udot z19.s, z11.b, z4.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z16.s, z12.b, z4.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z17.s, z13.b, z4.b[1]\n" - "udot z18.s, z14.b, z4.b[1]\n" - "udot z19.s, z15.b, z4.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z8.b, z4.b[2]\n" - "udot z17.s, z9.b, z4.b[2]\n" - "udot z18.s, z10.b, z4.b[2]\n" - "udot z19.s, z11.b, z4.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z12.b, z4.b[3]\n" - "udot z17.s, z13.b, z4.b[3]\n" - "udot z18.s, z14.b, z4.b[3]\n" - "udot z19.s, z15.b, z4.b[3]\n" - "5:\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; -
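// Editor's sketch (not part of the patch): functionally, the "case 1" path just
// deleted computes one u8 row of A against a 4*VL-column strip of packed B into
// u32 C, four K-bytes per udot; loops/regs/blocks carve K into 32-byte main-loop
// iterations, a 16-byte epilogue, and 4-byte remainder blocks, with
// K_stride = ((K + 3) / 4) * 4 as the padded panel depth. A reference with a
// plain row-major B panel (the real panel is packed/interleaved by the library):
#include <cstdint>
static void row_dot_u8u32_ref(const uint8_t *a, const uint8_t *b_panel, uint32_t *c,
                              int width, int K, bool accumulate) {
    for (int x = 0; x < width; x++) {
        uint32_t sum = accumulate ? c[x] : 0;  // the "cbnz %[accumulate], 1f" reload path
        for (int k = 0; k < K; k++) {
            sum += (uint32_t)a[k] * (uint32_t)b_panel[x * K + k];
        }
        c[x] = sum;
    }
}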
case 2: - __asm __volatile ( - "a_ptr1 .req X0\n" - "c_ptr1 .req X1\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "whilelt p6.b, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.b\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z16.s, #0\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "mov z17.s, #0\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "mov z18.s, #0\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "mov z19.s, #0\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z20.s, #0\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z21.s, #0\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z22.s, #0\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z23.s, #0\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "add a_ptr1, a_ptr1, #0x10\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p0/z, [c_ptr1]\n" - "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z22.s, z10.b, z1.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "subs %[loops], %[loops], #0x1\n" - "udot z23.s, z11.b, z1.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "udot z20.s, z12.b, z1.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "udot z21.s, z13.b, z1.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z22.s, z14.b, z1.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "udot z23.s, z15.b, z1.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "udot z20.s, z8.b, z1.b[2]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "udot z21.s, z9.b, z1.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "udot z22.s, z10.b, z1.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL 
VL]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z23.s, z11.b, z1.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z20.s, z12.b, z1.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z21.s, z13.b, z1.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z22.s, z14.b, z1.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" - "udot z23.s, z15.b, z1.b[3]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z8.b, z4.b[0]\n" - "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" - "udot z20.s, z8.b, z5.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z17.s, z9.b, z4.b[0]\n" - "udot z21.s, z9.b, z5.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z4.b[0]\n" - "udot z22.s, z10.b, z5.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z4.b[0]\n" - "udot z23.s, z11.b, z5.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z4.b[1]\n" - "udot z20.s, z12.b, z5.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z4.b[1]\n" - "udot z21.s, z13.b, z5.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z4.b[1]\n" - "udot z22.s, z14.b, z5.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z4.b[1]\n" - "udot z23.s, z15.b, z5.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z4.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "udot z20.s, z8.b, z5.b[2]\n" - "udot z17.s, z9.b, z4.b[2]\n" - "udot z21.s, z9.b, z5.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "udot z18.s, z10.b, z4.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "udot z22.s, z10.b, z5.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "udot z19.s, z11.b, z4.b[2]\n" - "udot z23.s, z11.b, z5.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z12.b, z4.b[3]\n" - "udot z20.s, z12.b, z5.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "udot z17.s, z13.b, z4.b[3]\n" - "udot z21.s, z13.b, z5.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "udot z18.s, z14.b, z4.b[3]\n" - "udot z22.s, z14.b, z5.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "udot z19.s, z15.b, z4.b[3]\n" - "udot z23.s, z15.b, z5.b[3]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z22.s, z10.b, z1.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "udot z23.s, z11.b, z1.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "udot z20.s, z12.b, z1.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "udot z21.s, z13.b, z1.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z22.s, z14.b, z1.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "udot z23.s, z15.b, z1.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, 
z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "udot z20.s, z8.b, z1.b[2]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "udot z21.s, z9.b, z1.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "udot z22.s, z10.b, z1.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z23.s, z11.b, z1.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z20.s, z12.b, z1.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z21.s, z13.b, z1.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z22.s, z14.b, z1.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" - "udot z23.s, z15.b, z1.b[3]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z8.b, z4.b[0]\n" - "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" - "udot z20.s, z8.b, z5.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z17.s, z9.b, z4.b[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "udot z21.s, z9.b, z5.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z4.b[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - "udot z22.s, z10.b, z5.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z4.b[0]\n" - "udot z23.s, z11.b, z5.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z4.b[1]\n" - "udot z20.s, z12.b, z5.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z4.b[1]\n" - "udot z21.s, z13.b, z5.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z4.b[1]\n" - "udot z22.s, z14.b, z5.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z4.b[1]\n" - "udot z23.s, z15.b, z5.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z4.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "udot z20.s, z8.b, z5.b[2]\n" - "udot z17.s, z9.b, z4.b[2]\n" - "udot z21.s, z9.b, z5.b[2]\n" - "udot z18.s, z10.b, z4.b[2]\n" - "udot z22.s, z10.b, z5.b[2]\n" - "udot z19.s, z11.b, z4.b[2]\n" - "udot z23.s, z11.b, z5.b[2]\n" - "udot z16.s, z12.b, z4.b[3]\n" - "udot z20.s, z12.b, z5.b[3]\n" - "udot z17.s, z13.b, z4.b[3]\n" - "udot z21.s, z13.b, z5.b[3]\n" - "udot z18.s, z14.b, z4.b[3]\n" - "udot z22.s, z14.b, z5.b[3]\n" - "udot z19.s, z15.b, z4.b[3]\n" - "udot z23.s, z15.b, z5.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z20.s, z8.b, z1.b[0]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "udot z22.s, z10.b, z1.b[0]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "udot z23.s, z11.b, z1.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z20.s, z12.b, z1.b[1]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "udot z21.s, z13.b, z1.b[1]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z22.s, z14.b, z1.b[1]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "udot z23.s, z15.b, z1.b[1]\n" - "b.eq 
5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "udot z20.s, z8.b, z1.b[2]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "udot z21.s, z9.b, z1.b[2]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "udot z22.s, z10.b, z1.b[2]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z23.s, z11.b, z1.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z20.s, z12.b, z1.b[3]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z21.s, z13.b, z1.b[3]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z22.s, z14.b, z1.b[3]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "udot z23.s, z15.b, z1.b[3]\n" - "b 5f\n" - "4:\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z20.s, z8.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr1]\n" - "udot z22.s, z10.b, z1.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "udot z23.s, z11.b, z1.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "addvl a_ptr1, a_ptr1, #1\n" - "udot z20.s, z12.b, z1.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "udot z21.s, z13.b, z1.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z22.s, z14.b, z1.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "udot z23.s, z15.b, z1.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "udot z20.s, z8.b, z1.b[2]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "udot z21.s, z9.b, z1.b[2]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "udot z22.s, z10.b, z1.b[2]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z23.s, z11.b, z1.b[2]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z20.s, z12.b, z1.b[3]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z21.s, z13.b, z1.b[3]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z22.s, z14.b, z1.b[3]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "udot z23.s, z15.b, z1.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z16.s, z8.b, z4.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z20.s, z8.b, z5.b[0]\n" - "udot z17.s, z9.b, z4.b[0]\n" - "udot z21.s, z9.b, z5.b[0]\n" - "udot z18.s, z10.b, z4.b[0]\n" - "udot z22.s, z10.b, z5.b[0]\n" - "udot z19.s, z11.b, z4.b[0]\n" - "udot z23.s, z11.b, z5.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z16.s, z12.b, z4.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z20.s, z12.b, z5.b[1]\n" - "udot z17.s, z13.b, z4.b[1]\n" - "udot z21.s, z13.b, z5.b[1]\n" - 
"udot z18.s, z14.b, z4.b[1]\n" - "udot z22.s, z14.b, z5.b[1]\n" - "udot z19.s, z15.b, z4.b[1]\n" - "udot z23.s, z15.b, z5.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z8.b, z4.b[2]\n" - "udot z20.s, z8.b, z5.b[2]\n" - "udot z17.s, z9.b, z4.b[2]\n" - "udot z21.s, z9.b, z5.b[2]\n" - "udot z18.s, z10.b, z4.b[2]\n" - "udot z22.s, z10.b, z5.b[2]\n" - "udot z19.s, z11.b, z4.b[2]\n" - "udot z23.s, z11.b, z5.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z12.b, z4.b[3]\n" - "udot z20.s, z12.b, z5.b[3]\n" - "udot z17.s, z13.b, z4.b[3]\n" - "udot z21.s, z13.b, z5.b[3]\n" - "udot z18.s, z14.b, z4.b[3]\n" - "udot z22.s, z14.b, z5.b[3]\n" - "udot z19.s, z15.b, z4.b[3]\n" - "udot z23.s, z15.b, z5.b[3]\n" - "5:\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "st1w z20.s, p0, [c_ptr1]\n" - "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" - "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" - "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq c_ptr1\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory" - ); - break; - case 3: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "c_ptr1 .req X2\n" - "c_ptr2 .req X3\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "whilelt p6.b, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.b\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z16.s, #0\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "mov z17.s, #0\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "mov z18.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "mov z19.s, #0\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "mov z20.s, #0\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z21.s, #0\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z22.s, #0\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z23.s, #0\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z24.s, #0\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z25.s, #0\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "mov z26.s, #0\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "mov z27.s, #0\n" - "add a_ptr1, a_ptr1, #0x10\n" - "add a_ptr2, a_ptr2, #0x10\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - 
"1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p0/z, [c_ptr1]\n" - "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1w z24.s, p0/z, [c_ptr2]\n" - "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" - "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "udot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr2]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z25.s, z9.b, z2.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "subs %[loops], %[loops], #0x1\n" - "udot z22.s, z10.b, z1.b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "udot z26.s, z10.b, z2.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "udot z23.s, z11.b, z1.b[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "udot z27.s, z11.b, z2.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "udot z20.s, z12.b, z1.b[1]\n" - "udot z24.s, z12.b, z2.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "udot z21.s, z13.b, z1.b[1]\n" - "udot z25.s, z13.b, z2.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z22.s, z14.b, z1.b[1]\n" - "udot z26.s, z14.b, z2.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "udot z23.s, z15.b, z1.b[1]\n" - "udot z27.s, z15.b, z2.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "udot z20.s, z8.b, z1.b[2]\n" - "udot z24.s, z8.b, z2.b[2]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "udot z21.s, z9.b, z1.b[2]\n" - "udot z25.s, z9.b, z2.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "udot z22.s, z10.b, z1.b[2]\n" - "udot z26.s, z10.b, z2.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z23.s, z11.b, z1.b[2]\n" - "udot z27.s, z11.b, z2.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z20.s, z12.b, z1.b[3]\n" - "udot z24.s, z12.b, z2.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z21.s, z13.b, z1.b[3]\n" - "udot z25.s, z13.b, z2.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z22.s, z14.b, z1.b[3]\n" - 
"udot z26.s, z14.b, z2.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" - "udot z23.s, z15.b, z1.b[3]\n" - "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" - "udot z27.s, z15.b, z2.b[3]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z8.b, z4.b[0]\n" - "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n" - "udot z20.s, z8.b, z5.b[0]\n" - "udot z24.s, z8.b, z6.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z17.s, z9.b, z4.b[0]\n" - "udot z21.s, z9.b, z5.b[0]\n" - "udot z25.s, z9.b, z6.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z4.b[0]\n" - "udot z22.s, z10.b, z5.b[0]\n" - "udot z26.s, z10.b, z6.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z4.b[0]\n" - "udot z23.s, z11.b, z5.b[0]\n" - "udot z27.s, z11.b, z6.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z4.b[1]\n" - "udot z20.s, z12.b, z5.b[1]\n" - "udot z24.s, z12.b, z6.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z4.b[1]\n" - "udot z21.s, z13.b, z5.b[1]\n" - "udot z25.s, z13.b, z6.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z4.b[1]\n" - "udot z22.s, z14.b, z5.b[1]\n" - "udot z26.s, z14.b, z6.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z4.b[1]\n" - "udot z23.s, z15.b, z5.b[1]\n" - "udot z27.s, z15.b, z6.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z4.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "udot z20.s, z8.b, z5.b[2]\n" - "udot z24.s, z8.b, z6.b[2]\n" - "udot z17.s, z9.b, z4.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "udot z21.s, z9.b, z5.b[2]\n" - "udot z25.s, z9.b, z6.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "udot z18.s, z10.b, z4.b[2]\n" - "udot z22.s, z10.b, z5.b[2]\n" - "udot z26.s, z10.b, z6.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "udot z19.s, z11.b, z4.b[2]\n" - "udot z23.s, z11.b, z5.b[2]\n" - "udot z27.s, z11.b, z6.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z12.b, z4.b[3]\n" - "udot z20.s, z12.b, z5.b[3]\n" - "udot z24.s, z12.b, z6.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "udot z17.s, z13.b, z4.b[3]\n" - "udot z21.s, z13.b, z5.b[3]\n" - "udot z25.s, z13.b, z6.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "udot z18.s, z14.b, z4.b[3]\n" - "udot z22.s, z14.b, z5.b[3]\n" - "udot z26.s, z14.b, z6.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "udot z19.s, z15.b, z4.b[3]\n" - "udot z23.s, z15.b, z5.b[3]\n" - "udot z27.s, z15.b, z6.b[3]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "udot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr2]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z25.s, z9.b, z2.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "udot z22.s, z10.b, z1.b[0]\n" - "udot z26.s, z10.b, z2.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "udot z23.s, z11.b, z1.b[0]\n" - "udot z27.s, z11.b, z2.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "udot z20.s, z12.b, z1.b[1]\n" - "udot z24.s, z12.b, z2.b[1]\n" - "ld1b 
z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "udot z21.s, z13.b, z1.b[1]\n" - "udot z25.s, z13.b, z2.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z22.s, z14.b, z1.b[1]\n" - "udot z26.s, z14.b, z2.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "udot z23.s, z15.b, z1.b[1]\n" - "udot z27.s, z15.b, z2.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "udot z20.s, z8.b, z1.b[2]\n" - "udot z24.s, z8.b, z2.b[2]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "udot z21.s, z9.b, z1.b[2]\n" - "udot z25.s, z9.b, z2.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "udot z22.s, z10.b, z1.b[2]\n" - "udot z26.s, z10.b, z2.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z23.s, z11.b, z1.b[2]\n" - "udot z27.s, z11.b, z2.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z20.s, z12.b, z1.b[3]\n" - "udot z24.s, z12.b, z2.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z21.s, z13.b, z1.b[3]\n" - "udot z25.s, z13.b, z2.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z22.s, z14.b, z1.b[3]\n" - "udot z26.s, z14.b, z2.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" - "udot z23.s, z15.b, z1.b[3]\n" - "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" - "udot z27.s, z15.b, z2.b[3]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z8.b, z4.b[0]\n" - "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n" - "udot z20.s, z8.b, z5.b[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "udot z24.s, z8.b, z6.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z17.s, z9.b, z4.b[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - "udot z21.s, z9.b, z5.b[0]\n" - "addvl a_ptr2, a_ptr2, #2\n" - "udot z25.s, z9.b, z6.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z4.b[0]\n" - "udot z22.s, z10.b, z5.b[0]\n" - "udot z26.s, z10.b, z6.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z4.b[0]\n" - "udot z23.s, z11.b, z5.b[0]\n" - "udot z27.s, z11.b, z6.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z4.b[1]\n" - "udot z20.s, z12.b, z5.b[1]\n" - "udot z24.s, z12.b, z6.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z4.b[1]\n" - "udot z21.s, z13.b, z5.b[1]\n" - "udot z25.s, z13.b, z6.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z4.b[1]\n" - "udot z22.s, z14.b, z5.b[1]\n" - "udot z26.s, z14.b, z6.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z4.b[1]\n" - "udot z23.s, z15.b, z5.b[1]\n" - "udot z27.s, z15.b, z6.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z4.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "udot z20.s, z8.b, z5.b[2]\n" - "udot z24.s, z8.b, z6.b[2]\n" - "udot z17.s, z9.b, z4.b[2]\n" - "udot z21.s, z9.b, z5.b[2]\n" - "udot z25.s, z9.b, z6.b[2]\n" - "udot z18.s, z10.b, z4.b[2]\n" - "udot z22.s, z10.b, z5.b[2]\n" - "udot z26.s, z10.b, z6.b[2]\n" - "udot z19.s, z11.b, z4.b[2]\n" - "udot z23.s, z11.b, z5.b[2]\n" - "udot z27.s, z11.b, z6.b[2]\n" - "udot z16.s, z12.b, z4.b[3]\n" - "udot z20.s, 
z12.b, z5.b[3]\n" - "udot z24.s, z12.b, z6.b[3]\n" - "udot z17.s, z13.b, z4.b[3]\n" - "udot z21.s, z13.b, z5.b[3]\n" - "udot z25.s, z13.b, z6.b[3]\n" - "udot z18.s, z14.b, z4.b[3]\n" - "udot z22.s, z14.b, z5.b[3]\n" - "udot z26.s, z14.b, z6.b[3]\n" - "udot z19.s, z15.b, z4.b[3]\n" - "udot z23.s, z15.b, z5.b[3]\n" - "udot z27.s, z15.b, z6.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z20.s, z8.b, z1.b[0]\n" - "udot z24.s, z8.b, z2.b[0]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "udot z25.s, z9.b, z2.b[0]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "udot z22.s, z10.b, z1.b[0]\n" - "udot z26.s, z10.b, z2.b[0]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "udot z23.s, z11.b, z1.b[0]\n" - "udot z27.s, z11.b, z2.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z20.s, z12.b, z1.b[1]\n" - "udot z24.s, z12.b, z2.b[1]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "udot z21.s, z13.b, z1.b[1]\n" - "udot z25.s, z13.b, z2.b[1]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z22.s, z14.b, z1.b[1]\n" - "udot z26.s, z14.b, z2.b[1]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "udot z23.s, z15.b, z1.b[1]\n" - "udot z27.s, z15.b, z2.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "udot z20.s, z8.b, z1.b[2]\n" - "udot z24.s, z8.b, z2.b[2]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "udot z21.s, z9.b, z1.b[2]\n" - "udot z25.s, z9.b, z2.b[2]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "udot z22.s, z10.b, z1.b[2]\n" - "udot z26.s, z10.b, z2.b[2]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z23.s, z11.b, z1.b[2]\n" - "udot z27.s, z11.b, z2.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z20.s, z12.b, z1.b[3]\n" - "udot z24.s, z12.b, z2.b[3]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z21.s, z13.b, z1.b[3]\n" - "udot z25.s, z13.b, z2.b[3]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z22.s, z14.b, z1.b[3]\n" - "udot z26.s, z14.b, z2.b[3]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "udot z23.s, z15.b, z1.b[3]\n" - "udot z27.s, z15.b, z2.b[3]\n" - "b 5f\n" - "4:\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" - "udot z24.s, z8.b, z2.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr1]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "ld1rqb z6.b, p6/z, [a_ptr2]\n" - "udot z25.s, z9.b, z2.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "udot z22.s, z10.b, z1.b[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" - "udot z26.s, z10.b, z2.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL 
VL]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" - "udot z23.s, z11.b, z1.b[0]\n" - "udot z27.s, z11.b, z2.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "udot z20.s, z12.b, z1.b[1]\n" - "udot z24.s, z12.b, z2.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "udot z21.s, z13.b, z1.b[1]\n" - "udot z25.s, z13.b, z2.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z22.s, z14.b, z1.b[1]\n" - "udot z26.s, z14.b, z2.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "udot z23.s, z15.b, z1.b[1]\n" - "udot z27.s, z15.b, z2.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "udot z20.s, z8.b, z1.b[2]\n" - "udot z24.s, z8.b, z2.b[2]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "udot z21.s, z9.b, z1.b[2]\n" - "udot z25.s, z9.b, z2.b[2]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "udot z22.s, z10.b, z1.b[2]\n" - "udot z26.s, z10.b, z2.b[2]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z23.s, z11.b, z1.b[2]\n" - "udot z27.s, z11.b, z2.b[2]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z20.s, z12.b, z1.b[3]\n" - "udot z24.s, z12.b, z2.b[3]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z21.s, z13.b, z1.b[3]\n" - "udot z25.s, z13.b, z2.b[3]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z22.s, z14.b, z1.b[3]\n" - "udot z26.s, z14.b, z2.b[3]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "udot z23.s, z15.b, z1.b[3]\n" - "udot z27.s, z15.b, z2.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z16.s, z8.b, z4.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z20.s, z8.b, z5.b[0]\n" - "udot z24.s, z8.b, z6.b[0]\n" - "udot z17.s, z9.b, z4.b[0]\n" - "udot z21.s, z9.b, z5.b[0]\n" - "udot z25.s, z9.b, z6.b[0]\n" - "udot z18.s, z10.b, z4.b[0]\n" - "udot z22.s, z10.b, z5.b[0]\n" - "udot z26.s, z10.b, z6.b[0]\n" - "udot z19.s, z11.b, z4.b[0]\n" - "udot z23.s, z11.b, z5.b[0]\n" - "udot z27.s, z11.b, z6.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z16.s, z12.b, z4.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z20.s, z12.b, z5.b[1]\n" - "udot z24.s, z12.b, z6.b[1]\n" - "udot z17.s, z13.b, z4.b[1]\n" - "udot z21.s, z13.b, z5.b[1]\n" - "udot z25.s, z13.b, z6.b[1]\n" - "udot z18.s, z14.b, z4.b[1]\n" - "udot z22.s, z14.b, z5.b[1]\n" - "udot z26.s, z14.b, z6.b[1]\n" - "udot z19.s, z15.b, z4.b[1]\n" - "udot z23.s, z15.b, z5.b[1]\n" - "udot z27.s, z15.b, z6.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z8.b, z4.b[2]\n" - "udot z20.s, z8.b, z5.b[2]\n" - "udot z24.s, z8.b, z6.b[2]\n" - "udot z17.s, z9.b, z4.b[2]\n" - "udot z21.s, z9.b, z5.b[2]\n" - "udot z25.s, z9.b, z6.b[2]\n" - "udot z18.s, z10.b, z4.b[2]\n" - "udot z22.s, z10.b, z5.b[2]\n" - "udot z26.s, z10.b, z6.b[2]\n" - "udot z19.s, z11.b, z4.b[2]\n" - "udot z23.s, z11.b, z5.b[2]\n" - "udot z27.s, z11.b, z6.b[2]\n" - "b.eq 5f\n" 
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z12.b, z4.b[3]\n" - "udot z20.s, z12.b, z5.b[3]\n" - "udot z24.s, z12.b, z6.b[3]\n" - "udot z17.s, z13.b, z4.b[3]\n" - "udot z21.s, z13.b, z5.b[3]\n" - "udot z25.s, z13.b, z6.b[3]\n" - "udot z18.s, z14.b, z4.b[3]\n" - "udot z22.s, z14.b, z5.b[3]\n" - "udot z26.s, z14.b, z6.b[3]\n" - "udot z19.s, z15.b, z4.b[3]\n" - "udot z23.s, z15.b, z5.b[3]\n" - "udot z27.s, z15.b, z6.b[3]\n" - "5:\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "st1w z20.s, p0, [c_ptr1]\n" - "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" - "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" - "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" - "st1w z24.s, p0, [c_ptr2]\n" - "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" - "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" - "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory" - ); - break; - default: - case 4: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "c_ptr1 .req X3\n" - "c_ptr2 .req X4\n" - "c_ptr3 .req X5\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "whilelt p6.b, %[temp], %[leftovers]\n" - "whilelt p0.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "ptrue p7.b\n" - "whilelt p1.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p2.s, %[temp], %[width]\n" - "incw %[temp], all, mul #1\n" - "whilelt p3.s, %[temp], %[width]\n" - "cbnz %[accumulate], 1f\n" - "mov z16.s, #0\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "mov z17.s, #0\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "mov z18.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "mov z19.s, #0\n" - "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "mov z20.s, #0\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "mov z21.s, #0\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z22.s, #0\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z23.s, #0\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z24.s, #0\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z25.s, #0\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "mov z26.s, #0\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "mov z27.s, #0\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "mov z28.s, #0\n" - "add a_ptr1, a_ptr1, #0x10\n" - "mov z29.s, #0\n" - "add a_ptr2, a_ptr2, #0x10\n" - "mov z30.s, #0\n" - "add a_ptr3, a_ptr3, #0x10\n" - "mov z31.s, #0\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "b 3f\n" - "1:\n" - "ld1w z16.s, p0/z, [%[c_ptr0]]\n" - "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL 
VL]\n" - "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n" - "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n" - "ld1w z20.s, p0/z, [c_ptr1]\n" - "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n" - "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n" - "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n" - "ld1w z24.s, p0/z, [c_ptr2]\n" - "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n" - "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n" - "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n" - "ld1w z28.s, p0/z, [c_ptr3]\n" - "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n" - "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n" - "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "add %[a_ptr0], %[a_ptr0], #0x10\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "add a_ptr1, a_ptr1, #0x10\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "add a_ptr2, a_ptr2, #0x10\n" - "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "add a_ptr3, a_ptr3, #0x10\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "cbz %[loops], 2f\n" - "3:\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "udot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "udot z28.s, z8.b, z3.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr2]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z7.b, p7/z, [a_ptr3]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z25.s, z9.b, z2.b[0]\n" - "subs %[loops], %[loops], #0x1\n" - "udot z29.s, z9.b, z3.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "add %[a_ptr0], %[a_ptr0], #0x20\n" - "udot z22.s, z10.b, z1.b[0]\n" - "add a_ptr1, a_ptr1, #0x20\n" - "udot z26.s, z10.b, z2.b[0]\n" - "add a_ptr2, a_ptr2, #0x20\n" - "udot z30.s, z10.b, z3.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "add a_ptr3, a_ptr3, #0x20\n" - "udot z23.s, z11.b, z1.b[0]\n" - "udot z27.s, z11.b, z2.b[0]\n" - "udot z31.s, z11.b, z3.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "udot z20.s, z12.b, z1.b[1]\n" - "udot z24.s, z12.b, z2.b[1]\n" - "udot z28.s, z12.b, z3.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "udot z21.s, z13.b, z1.b[1]\n" - "udot z25.s, z13.b, z2.b[1]\n" - "udot z29.s, z13.b, z3.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z22.s, z14.b, z1.b[1]\n" - "udot z26.s, z14.b, z2.b[1]\n" - "udot z30.s, z14.b, z3.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "udot z23.s, z15.b, z1.b[1]\n" - "udot z27.s, z15.b, z2.b[1]\n" - "udot z31.s, z15.b, z3.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "udot z20.s, z8.b, z1.b[2]\n" - "udot z24.s, z8.b, z2.b[2]\n" - "udot z28.s, z8.b, z3.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "udot z21.s, z9.b, z1.b[2]\n" - "udot z25.s, z9.b, z2.b[2]\n" - "udot z29.s, z9.b, z3.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "udot z22.s, z10.b, z1.b[2]\n" - "udot z26.s, z10.b, z2.b[2]\n" - "udot z30.s, 
z10.b, z3.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z23.s, z11.b, z1.b[2]\n" - "udot z27.s, z11.b, z2.b[2]\n" - "udot z31.s, z11.b, z3.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z20.s, z12.b, z1.b[3]\n" - "udot z24.s, z12.b, z2.b[3]\n" - "udot z28.s, z12.b, z3.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z21.s, z13.b, z1.b[3]\n" - "udot z25.s, z13.b, z2.b[3]\n" - "udot z29.s, z13.b, z3.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z22.s, z14.b, z1.b[3]\n" - "udot z26.s, z14.b, z2.b[3]\n" - "udot z30.s, z14.b, z3.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n" - "udot z23.s, z15.b, z1.b[3]\n" - "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n" - "udot z27.s, z15.b, z2.b[3]\n" - "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n" - "udot z31.s, z15.b, z3.b[3]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z8.b, z4.b[0]\n" - "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n" - "udot z20.s, z8.b, z5.b[0]\n" - "udot z24.s, z8.b, z6.b[0]\n" - "udot z28.s, z8.b, z7.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z17.s, z9.b, z4.b[0]\n" - "udot z21.s, z9.b, z5.b[0]\n" - "udot z25.s, z9.b, z6.b[0]\n" - "udot z29.s, z9.b, z7.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z4.b[0]\n" - "udot z22.s, z10.b, z5.b[0]\n" - "udot z26.s, z10.b, z6.b[0]\n" - "udot z30.s, z10.b, z7.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z4.b[0]\n" - "udot z23.s, z11.b, z5.b[0]\n" - "udot z27.s, z11.b, z6.b[0]\n" - "udot z31.s, z11.b, z7.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z4.b[1]\n" - "udot z20.s, z12.b, z5.b[1]\n" - "udot z24.s, z12.b, z6.b[1]\n" - "udot z28.s, z12.b, z7.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z4.b[1]\n" - "udot z21.s, z13.b, z5.b[1]\n" - "udot z25.s, z13.b, z6.b[1]\n" - "udot z29.s, z13.b, z7.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z4.b[1]\n" - "udot z22.s, z14.b, z5.b[1]\n" - "udot z26.s, z14.b, z6.b[1]\n" - "udot z30.s, z14.b, z7.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z4.b[1]\n" - "udot z23.s, z15.b, z5.b[1]\n" - "udot z27.s, z15.b, z6.b[1]\n" - "udot z31.s, z15.b, z7.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z4.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "udot z20.s, z8.b, z5.b[2]\n" - "udot z24.s, z8.b, z6.b[2]\n" - "udot z28.s, z8.b, z7.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "udot z17.s, z9.b, z4.b[2]\n" - "udot z21.s, z9.b, z5.b[2]\n" - "udot z25.s, z9.b, z6.b[2]\n" - "udot z29.s, z9.b, z7.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "udot z18.s, z10.b, z4.b[2]\n" - "udot z22.s, z10.b, z5.b[2]\n" - "udot z26.s, z10.b, z6.b[2]\n" - "udot z30.s, z10.b, z7.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "udot z19.s, z11.b, z4.b[2]\n" - "udot z23.s, z11.b, z5.b[2]\n" - "udot z27.s, z11.b, z6.b[2]\n" - "udot z31.s, z11.b, z7.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z12.b, z4.b[3]\n" - "udot z20.s, z12.b, z5.b[3]\n" - "udot z24.s, z12.b, z6.b[3]\n" - "udot z28.s, z12.b, z7.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "udot z17.s, z13.b, z4.b[3]\n" - "udot z21.s, z13.b, 
z5.b[3]\n" - "udot z25.s, z13.b, z6.b[3]\n" - "udot z29.s, z13.b, z7.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "udot z18.s, z14.b, z4.b[3]\n" - "udot z22.s, z14.b, z5.b[3]\n" - "udot z26.s, z14.b, z6.b[3]\n" - "udot z30.s, z14.b, z7.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "udot z19.s, z15.b, z4.b[3]\n" - "udot z23.s, z15.b, z5.b[3]\n" - "udot z27.s, z15.b, z6.b[3]\n" - "udot z31.s, z15.b, z7.b[3]\n" - "b.ne 3b\n" - "2:\n" - "cbz %[regs], 4f\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n" - "udot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr1]\n" - "udot z28.s, z8.b, z3.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr2]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z7.b, p7/z, [a_ptr3]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z25.s, z9.b, z2.b[0]\n" - "udot z29.s, z9.b, z3.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "udot z22.s, z10.b, z1.b[0]\n" - "udot z26.s, z10.b, z2.b[0]\n" - "udot z30.s, z10.b, z3.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "udot z23.s, z11.b, z1.b[0]\n" - "udot z27.s, z11.b, z2.b[0]\n" - "udot z31.s, z11.b, z3.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "udot z20.s, z12.b, z1.b[1]\n" - "udot z24.s, z12.b, z2.b[1]\n" - "udot z28.s, z12.b, z3.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "udot z21.s, z13.b, z1.b[1]\n" - "udot z25.s, z13.b, z2.b[1]\n" - "udot z29.s, z13.b, z3.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z22.s, z14.b, z1.b[1]\n" - "udot z26.s, z14.b, z2.b[1]\n" - "udot z30.s, z14.b, z3.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "udot z23.s, z15.b, z1.b[1]\n" - "udot z27.s, z15.b, z2.b[1]\n" - "udot z31.s, z15.b, z3.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "udot z20.s, z8.b, z1.b[2]\n" - "udot z24.s, z8.b, z2.b[2]\n" - "udot z28.s, z8.b, z3.b[2]\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "udot z21.s, z9.b, z1.b[2]\n" - "udot z25.s, z9.b, z2.b[2]\n" - "udot z29.s, z9.b, z3.b[2]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "udot z22.s, z10.b, z1.b[2]\n" - "udot z26.s, z10.b, z2.b[2]\n" - "udot z30.s, z10.b, z3.b[2]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z23.s, z11.b, z1.b[2]\n" - "udot z27.s, z11.b, z2.b[2]\n" - "udot z31.s, z11.b, z3.b[2]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z20.s, z12.b, z1.b[3]\n" - "udot z24.s, z12.b, z2.b[3]\n" - "udot z28.s, z12.b, z3.b[3]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z21.s, z13.b, z1.b[3]\n" - "udot z25.s, z13.b, z2.b[3]\n" - "udot z29.s, z13.b, z3.b[3]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z22.s, z14.b, z1.b[3]\n" - "udot z26.s, z14.b, z2.b[3]\n" - "udot z30.s, z14.b, z3.b[3]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" - "udot z23.s, z15.b, z1.b[3]\n" - "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" - "udot z27.s, 
z15.b, z2.b[3]\n" - "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n" - "udot z31.s, z15.b, z3.b[3]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z8.b, z4.b[0]\n" - "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n" - "udot z20.s, z8.b, z5.b[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #2\n" - "udot z24.s, z8.b, z6.b[0]\n" - "addvl a_ptr1, a_ptr1, #2\n" - "udot z28.s, z8.b, z7.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z17.s, z9.b, z4.b[0]\n" - "addvl a_ptr2, a_ptr2, #2\n" - "udot z21.s, z9.b, z5.b[0]\n" - "addvl a_ptr3, a_ptr3, #2\n" - "udot z25.s, z9.b, z6.b[0]\n" - "udot z29.s, z9.b, z7.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z4.b[0]\n" - "udot z22.s, z10.b, z5.b[0]\n" - "udot z26.s, z10.b, z6.b[0]\n" - "udot z30.s, z10.b, z7.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z4.b[0]\n" - "udot z23.s, z11.b, z5.b[0]\n" - "udot z27.s, z11.b, z6.b[0]\n" - "udot z31.s, z11.b, z7.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z4.b[1]\n" - "udot z20.s, z12.b, z5.b[1]\n" - "udot z24.s, z12.b, z6.b[1]\n" - "udot z28.s, z12.b, z7.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z17.s, z13.b, z4.b[1]\n" - "udot z21.s, z13.b, z5.b[1]\n" - "udot z25.s, z13.b, z6.b[1]\n" - "udot z29.s, z13.b, z7.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z4.b[1]\n" - "udot z22.s, z14.b, z5.b[1]\n" - "udot z26.s, z14.b, z6.b[1]\n" - "udot z30.s, z14.b, z7.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z4.b[1]\n" - "udot z23.s, z15.b, z5.b[1]\n" - "udot z27.s, z15.b, z6.b[1]\n" - "udot z31.s, z15.b, z7.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z4.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "udot z20.s, z8.b, z5.b[2]\n" - "udot z24.s, z8.b, z6.b[2]\n" - "udot z28.s, z8.b, z7.b[2]\n" - "udot z17.s, z9.b, z4.b[2]\n" - "udot z21.s, z9.b, z5.b[2]\n" - "udot z25.s, z9.b, z6.b[2]\n" - "udot z29.s, z9.b, z7.b[2]\n" - "udot z18.s, z10.b, z4.b[2]\n" - "udot z22.s, z10.b, z5.b[2]\n" - "udot z26.s, z10.b, z6.b[2]\n" - "udot z30.s, z10.b, z7.b[2]\n" - "udot z19.s, z11.b, z4.b[2]\n" - "udot z23.s, z11.b, z5.b[2]\n" - "udot z27.s, z11.b, z6.b[2]\n" - "udot z31.s, z11.b, z7.b[2]\n" - "udot z16.s, z12.b, z4.b[3]\n" - "udot z20.s, z12.b, z5.b[3]\n" - "udot z24.s, z12.b, z6.b[3]\n" - "udot z28.s, z12.b, z7.b[3]\n" - "udot z17.s, z13.b, z4.b[3]\n" - "udot z21.s, z13.b, z5.b[3]\n" - "udot z25.s, z13.b, z6.b[3]\n" - "udot z29.s, z13.b, z7.b[3]\n" - "udot z18.s, z14.b, z4.b[3]\n" - "udot z22.s, z14.b, z5.b[3]\n" - "udot z26.s, z14.b, z6.b[3]\n" - "udot z30.s, z14.b, z7.b[3]\n" - "udot z19.s, z15.b, z4.b[3]\n" - "udot z23.s, z15.b, z5.b[3]\n" - "udot z27.s, z15.b, z6.b[3]\n" - "udot z31.s, z15.b, z7.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z20.s, z8.b, z1.b[0]\n" - "udot z24.s, z8.b, z2.b[0]\n" - "udot z28.s, z8.b, z3.b[0]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "udot z25.s, z9.b, z2.b[0]\n" - "udot z29.s, z9.b, z3.b[0]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "udot z22.s, z10.b, z1.b[0]\n" - "udot z26.s, z10.b, z2.b[0]\n" - "udot z30.s, z10.b, z3.b[0]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "udot z23.s, z11.b, z1.b[0]\n" - "udot z27.s, z11.b, z2.b[0]\n" - "udot 
z31.s, z11.b, z3.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z20.s, z12.b, z1.b[1]\n" - "udot z24.s, z12.b, z2.b[1]\n" - "udot z28.s, z12.b, z3.b[1]\n" - "udot z17.s, z13.b, z0.b[1]\n" - "udot z21.s, z13.b, z1.b[1]\n" - "udot z25.s, z13.b, z2.b[1]\n" - "udot z29.s, z13.b, z3.b[1]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z22.s, z14.b, z1.b[1]\n" - "udot z26.s, z14.b, z2.b[1]\n" - "udot z30.s, z14.b, z3.b[1]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "udot z23.s, z15.b, z1.b[1]\n" - "udot z27.s, z15.b, z2.b[1]\n" - "udot z31.s, z15.b, z3.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "udot z20.s, z8.b, z1.b[2]\n" - "udot z24.s, z8.b, z2.b[2]\n" - "udot z28.s, z8.b, z3.b[2]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "udot z21.s, z9.b, z1.b[2]\n" - "udot z25.s, z9.b, z2.b[2]\n" - "udot z29.s, z9.b, z3.b[2]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "udot z22.s, z10.b, z1.b[2]\n" - "udot z26.s, z10.b, z2.b[2]\n" - "udot z30.s, z10.b, z3.b[2]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z23.s, z11.b, z1.b[2]\n" - "udot z27.s, z11.b, z2.b[2]\n" - "udot z31.s, z11.b, z3.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z20.s, z12.b, z1.b[3]\n" - "udot z24.s, z12.b, z2.b[3]\n" - "udot z28.s, z12.b, z3.b[3]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z21.s, z13.b, z1.b[3]\n" - "udot z25.s, z13.b, z2.b[3]\n" - "udot z29.s, z13.b, z3.b[3]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z22.s, z14.b, z1.b[3]\n" - "udot z26.s, z14.b, z2.b[3]\n" - "udot z30.s, z14.b, z3.b[3]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "udot z23.s, z15.b, z1.b[3]\n" - "udot z27.s, z15.b, z2.b[3]\n" - "udot z31.s, z15.b, z3.b[3]\n" - "b 5f\n" - "4:\n" - "udot z16.s, z8.b, z0.b[0]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z20.s, z8.b, z1.b[0]\n" - "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n" - "udot z24.s, z8.b, z2.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr1]\n" - "udot z28.s, z8.b, z3.b[0]\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "udot z17.s, z9.b, z0.b[0]\n" - "ld1rqb z6.b, p6/z, [a_ptr2]\n" - "udot z21.s, z9.b, z1.b[0]\n" - "ld1rqb z7.b, p6/z, [a_ptr3]\n" - "udot z25.s, z9.b, z2.b[0]\n" - "addvl %[a_ptr0], %[a_ptr0], #1\n" - "udot z29.s, z9.b, z3.b[0]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z18.s, z10.b, z0.b[0]\n" - "addvl a_ptr1, a_ptr1, #1\n" - "udot z22.s, z10.b, z1.b[0]\n" - "addvl a_ptr2, a_ptr2, #1\n" - "udot z26.s, z10.b, z2.b[0]\n" - "addvl a_ptr3, a_ptr3, #1\n" - "udot z30.s, z10.b, z3.b[0]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z19.s, z11.b, z0.b[0]\n" - "udot z23.s, z11.b, z1.b[0]\n" - "udot z27.s, z11.b, z2.b[0]\n" - "udot z31.s, z11.b, z3.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z16.s, z12.b, z0.b[1]\n" - "udot z20.s, z12.b, z1.b[1]\n" - "udot z24.s, z12.b, z2.b[1]\n" - "udot z28.s, z12.b, z3.b[1]\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" 
- "udot z17.s, z13.b, z0.b[1]\n" - "udot z21.s, z13.b, z1.b[1]\n" - "udot z25.s, z13.b, z2.b[1]\n" - "udot z29.s, z13.b, z3.b[1]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "udot z18.s, z14.b, z0.b[1]\n" - "udot z22.s, z14.b, z1.b[1]\n" - "udot z26.s, z14.b, z2.b[1]\n" - "udot z30.s, z14.b, z3.b[1]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z19.s, z15.b, z0.b[1]\n" - "udot z23.s, z15.b, z1.b[1]\n" - "udot z27.s, z15.b, z2.b[1]\n" - "udot z31.s, z15.b, z3.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z16.s, z8.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "udot z20.s, z8.b, z1.b[2]\n" - "udot z24.s, z8.b, z2.b[2]\n" - "udot z28.s, z8.b, z3.b[2]\n" - "udot z17.s, z9.b, z0.b[2]\n" - "udot z21.s, z9.b, z1.b[2]\n" - "udot z25.s, z9.b, z2.b[2]\n" - "udot z29.s, z9.b, z3.b[2]\n" - "udot z18.s, z10.b, z0.b[2]\n" - "udot z22.s, z10.b, z1.b[2]\n" - "udot z26.s, z10.b, z2.b[2]\n" - "udot z30.s, z10.b, z3.b[2]\n" - "udot z19.s, z11.b, z0.b[2]\n" - "udot z23.s, z11.b, z1.b[2]\n" - "udot z27.s, z11.b, z2.b[2]\n" - "udot z31.s, z11.b, z3.b[2]\n" - "udot z16.s, z12.b, z0.b[3]\n" - "udot z20.s, z12.b, z1.b[3]\n" - "udot z24.s, z12.b, z2.b[3]\n" - "udot z28.s, z12.b, z3.b[3]\n" - "udot z17.s, z13.b, z0.b[3]\n" - "udot z21.s, z13.b, z1.b[3]\n" - "udot z25.s, z13.b, z2.b[3]\n" - "udot z29.s, z13.b, z3.b[3]\n" - "udot z18.s, z14.b, z0.b[3]\n" - "udot z22.s, z14.b, z1.b[3]\n" - "udot z26.s, z14.b, z2.b[3]\n" - "udot z30.s, z14.b, z3.b[3]\n" - "udot z19.s, z15.b, z0.b[3]\n" - "udot z23.s, z15.b, z1.b[3]\n" - "udot z27.s, z15.b, z2.b[3]\n" - "udot z31.s, z15.b, z3.b[3]\n" - "cbz %[blocks], 5f\n" - "ld1b z8.b, p7/z, [%[b_ptr0]]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z16.s, z8.b, z4.b[0]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z20.s, z8.b, z5.b[0]\n" - "udot z24.s, z8.b, z6.b[0]\n" - "udot z28.s, z8.b, z7.b[0]\n" - "udot z17.s, z9.b, z4.b[0]\n" - "udot z21.s, z9.b, z5.b[0]\n" - "udot z25.s, z9.b, z6.b[0]\n" - "udot z29.s, z9.b, z7.b[0]\n" - "udot z18.s, z10.b, z4.b[0]\n" - "udot z22.s, z10.b, z5.b[0]\n" - "udot z26.s, z10.b, z6.b[0]\n" - "udot z30.s, z10.b, z7.b[0]\n" - "udot z19.s, z11.b, z4.b[0]\n" - "udot z23.s, z11.b, z5.b[0]\n" - "udot z27.s, z11.b, z6.b[0]\n" - "udot z31.s, z11.b, z7.b[0]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "udot z16.s, z12.b, z4.b[1]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "udot z20.s, z12.b, z5.b[1]\n" - "udot z24.s, z12.b, z6.b[1]\n" - "udot z28.s, z12.b, z7.b[1]\n" - "udot z17.s, z13.b, z4.b[1]\n" - "udot z21.s, z13.b, z5.b[1]\n" - "udot z25.s, z13.b, z6.b[1]\n" - "udot z29.s, z13.b, z7.b[1]\n" - "udot z18.s, z14.b, z4.b[1]\n" - "udot z22.s, z14.b, z5.b[1]\n" - "udot z26.s, z14.b, z6.b[1]\n" - "udot z30.s, z14.b, z7.b[1]\n" - "udot z19.s, z15.b, z4.b[1]\n" - "udot z23.s, z15.b, z5.b[1]\n" - "udot z27.s, z15.b, z6.b[1]\n" - "udot z31.s, z15.b, z7.b[1]\n" - "b.eq 5f\n" - "addvl %[b_ptr0], %[b_ptr0], #16\n" - "subs %[blocks], %[blocks], #0x1\n" - "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n" - "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n" - "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n" - "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n" - "udot z16.s, z8.b, z4.b[2]\n" - "udot z20.s, z8.b, z5.b[2]\n" - "udot z24.s, z8.b, z6.b[2]\n" - "udot z28.s, 
z8.b, z7.b[2]\n" - "udot z17.s, z9.b, z4.b[2]\n" - "udot z21.s, z9.b, z5.b[2]\n" - "udot z25.s, z9.b, z6.b[2]\n" - "udot z29.s, z9.b, z7.b[2]\n" - "udot z18.s, z10.b, z4.b[2]\n" - "udot z22.s, z10.b, z5.b[2]\n" - "udot z26.s, z10.b, z6.b[2]\n" - "udot z30.s, z10.b, z7.b[2]\n" - "udot z19.s, z11.b, z4.b[2]\n" - "udot z23.s, z11.b, z5.b[2]\n" - "udot z27.s, z11.b, z6.b[2]\n" - "udot z31.s, z11.b, z7.b[2]\n" - "b.eq 5f\n" - "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n" - "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n" - "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n" - "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n" - "udot z16.s, z12.b, z4.b[3]\n" - "udot z20.s, z12.b, z5.b[3]\n" - "udot z24.s, z12.b, z6.b[3]\n" - "udot z28.s, z12.b, z7.b[3]\n" - "udot z17.s, z13.b, z4.b[3]\n" - "udot z21.s, z13.b, z5.b[3]\n" - "udot z25.s, z13.b, z6.b[3]\n" - "udot z29.s, z13.b, z7.b[3]\n" - "udot z18.s, z14.b, z4.b[3]\n" - "udot z22.s, z14.b, z5.b[3]\n" - "udot z26.s, z14.b, z6.b[3]\n" - "udot z30.s, z14.b, z7.b[3]\n" - "udot z19.s, z15.b, z4.b[3]\n" - "udot z23.s, z15.b, z5.b[3]\n" - "udot z27.s, z15.b, z6.b[3]\n" - "udot z31.s, z15.b, z7.b[3]\n" - "5:\n" - "st1w z16.s, p0, [%[c_ptr0]]\n" - "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n" - "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n" - "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n" - "addvl %[c_ptr0], %[c_ptr0], #4\n" - "st1w z20.s, p0, [c_ptr1]\n" - "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n" - "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n" - "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n" - "st1w z24.s, p0, [c_ptr2]\n" - "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n" - "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n" - "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n" - "st1w z28.s, p0, [c_ptr3]\n" - "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n" - "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n" - "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) - : [width] "r" (width), [accumulate] "r" (static_cast(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers) - : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" - ); - break; - } - - } - } -} - -} // namespace arm_gemm - -#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp new file mode 100644 index 0000000000..af9de4a6eb --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __ARM_FEATURE_SVE
+
+#include "../std_transforms_sve.hpp"
+
+#define ARGLIST \
+    unsigned int, const unsigned int *, \
+    IndirectInputArg<uint8_t>, \
+    size_t, size_t, \
+    const uint8_t *, \
+    IndirectOutputArg<uint32_t>, \
+    const uint32_t *, Activation, bool
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_hybrid_u8u32_dot_6x4VL( ARGLIST );
+
+class cls_sve_hybrid_u8u32_dot_6x4VL
+{
+public:
+    typedef uint8_t operand_type;
+    typedef uint32_t result_type;
+
+    typedef void (*kern_type)( ARGLIST );
+
+    /* Kernel blocking parameters */
+    static constexpr unsigned int out_height()
+    {
+        return 6;
+    }
+
+    static unsigned int out_width()
+    {
+        return get_vector_length<uint32_t>() * 4;
+    }
+
+    static constexpr unsigned int k_unroll()
+    {
+        return 4;
+    }
+
+    static constexpr bool supports_accumulate()
+    {
+        return true;
+    }
+
+    StdTransformsSVE<operand_type, result_type, 6, 4, 4> transforms = {};
+
+    // Default to the generic kernel
+    kern_type kernel=sve_hybrid_u8u32_dot_6x4VL;
+
+    cls_sve_hybrid_u8u32_dot_6x4VL(const CPUInfo *)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
new file mode 100644
index 0000000000..fc8ce636dd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
@@ -0,0 +1,1904 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifdef __ARM_FEATURE_SVE + +#include "arm_gemm.hpp" +#include "../../utils.hpp" + +#include <cassert> + +namespace arm_gemm { + +void sve_hybrid_u8u32_dot_6x4VL ( + unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg, + size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint32_t> output_arg, + const uint32_t *, Activation, bool accumulate +) +{ + struct KernelArgs { + unsigned int num_strings = {}; + const unsigned int *string_lengths = {}; + size_t N = {}; + const uint8_t *B_ptr = {}; + size_t output_offset = {}; + size_t input_initial_col = {}; + size_t input_offset = {}; + } ka; + + unsigned long flags=0; + void *output_ptr; + void *input_ptr; + + if (output_arg.is_indirect) { + output_ptr=(void *)(output_arg.indirect.ptr); + ka.output_offset=output_arg.indirect.offset; + flags |= 0x4; + } else { + output_ptr=(void *)(output_arg.direct.base); + ka.output_offset=output_arg.direct.stride; + } + + if (A_arg.is_indirect) { + input_ptr=(void *)(A_arg.indirect.ptr); + ka.input_offset=A_arg.indirect.start_row; + ka.input_initial_col=A_arg.indirect.start_col; + flags |= 0x8; + } else { + assert(num_strings==1); + input_ptr=(void *)(A_arg.direct.base); + ka.input_offset=A_arg.direct.stride; + } + if (accumulate) { + flags |= 0x1; + } + ka.num_strings = num_strings; + ka.string_lengths = string_lengths; + ka.N = N; + ka.B_ptr = B_ptr; + __asm__ __volatile__( + "ptrue p5.b\n" + "1:" // Row loop + "cmp %x[M], #0x6\n" + "bge 61f\n" + "cmp %x[M], #0x4\n" + "bgt 49f\n" + "beq 37f\n" + "cmp %x[M], #0x2\n" + "bgt 25f\n" + "beq 13f\n" + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 2f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "b 3f\n" + "2:" // Height 1: setup direct output + "mov x13, %x[output_ptr]\n" + "3:" // Height 1: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x15\n" + "incw x19\n" + "whilelt p3.s, x19, x15\n" + "incw x19\n" + "whilelt p2.s, x19, x15\n" + "incw x19\n" + "whilelt p1.s, x19, x15\n" + "tbz %x[flags], #0, 4f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "b 5f\n" + "4:" // Height 1: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "5:" // Height 1: setup done + "mov x12, #0x0\n" + "6:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 7f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "cbnz x12, 8f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "b 8f\n" + "7:" // Height 1: setup direct input + "mov x10, %x[input_ptr]\n" + "8:" // Height 1: input setup done + "cmp x11, #0x10\n" + "ble 10f\n" + "9:" // Height 1: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, 
[x10]\n" + "udot z8.s, z6.b, z0.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + "udot z9.s, z7.b, z0.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "cmp x11, #0x10\n" + "udot z10.s, z6.b, z0.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "udot z11.s, z7.b, z0.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "udot z8.s, z6.b, z0.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "udot z9.s, z7.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "udot z10.s, z6.b, z0.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "udot z11.s, z7.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "udot z8.s, z6.b, z0.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "udot z9.s, z7.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "udot z10.s, z6.b, z0.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "udot z11.s, z7.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "udot z8.s, z6.b, z0.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "udot z10.s, z6.b, z0.b[3]\n" + "udot z11.s, z7.b, z0.b[3]\n" + "bgt 9b\n" + "10:" // Height 1: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "udot z8.s, z6.b, z0.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "add x10, x10, #0x10\n" + "udot z9.s, z7.b, z0.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[0]\n" + "udot z11.s, z7.b, z0.b[0]\n" + "ble 11f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "udot z9.s, z7.b, z0.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "udot z10.s, z6.b, z0.b[1]\n" + "addvl x14, x14, #4\n" + "udot z11.s, z7.b, z0.b[1]\n" + "ble 11f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "udot z9.s, z7.b, z0.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "udot z10.s, z6.b, z0.b[2]\n" + "addvl x14, x14, #4\n" + "udot z11.s, z7.b, z0.b[2]\n" + "ble 11f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[3]\n" + "udot z11.s, z7.b, z0.b[3]\n" + "11:" // Height 1: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 6b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "12:" // Height 1: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 3b\n" + "b 74f\n" + "13:" // Height 2 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz 
%x[flags], #2, 14f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "add x9, x9, x19, LSL #2\n" + "b 15f\n" + "14:" // Height 2: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "15:" // Height 2: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x15\n" + "incw x19\n" + "whilelt p3.s, x19, x15\n" + "incw x19\n" + "whilelt p2.s, x19, x15\n" + "incw x19\n" + "whilelt p1.s, x19, x15\n" + "tbz %x[flags], #0, 16f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "b 17f\n" + "16:" // Height 2: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "17:" // Height 2: setup done + "mov x12, #0x0\n" + "18:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 19f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "cbnz x12, 20f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "b 20f\n" + "19:" // Height 2: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "20:" // Height 2: input setup done + "cmp x11, #0x10\n" + "ble 22f\n" + "21:" // Height 2: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "udot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "udot z9.s, z7.b, z0.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x28, x28, #0x10\n" + "udot z12.s, z6.b, z1.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "cmp x11, #0x10\n" + "udot z13.s, z7.b, z1.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "udot z10.s, z6.b, z0.b[0]\n" + "udot z14.s, z6.b, z1.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "udot z11.s, z7.b, z0.b[0]\n" + "udot z15.s, z7.b, z1.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "udot z8.s, z6.b, z0.b[1]\n" + "udot z12.s, z6.b, z1.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "udot z9.s, z7.b, z0.b[1]\n" + "udot z13.s, z7.b, z1.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "udot z10.s, z6.b, z0.b[1]\n" + "udot z14.s, z6.b, z1.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "udot z11.s, z7.b, z0.b[1]\n" + "udot z15.s, z7.b, z1.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "udot z8.s, z6.b, z0.b[2]\n" + "udot z12.s, z6.b, z1.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "udot z9.s, z7.b, z0.b[2]\n" + "udot z13.s, z7.b, z1.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "udot z10.s, z6.b, z0.b[2]\n" + "udot z14.s, z6.b, z1.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "udot z11.s, z7.b, z0.b[2]\n" + "udot z15.s, z7.b, z1.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "udot z8.s, z6.b, z0.b[3]\n" + "udot z12.s, 
z6.b, z1.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[3]\n" + "udot z13.s, z7.b, z1.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "udot z10.s, z6.b, z0.b[3]\n" + "udot z14.s, z6.b, z1.b[3]\n" + "udot z11.s, z7.b, z0.b[3]\n" + "udot z15.s, z7.b, z1.b[3]\n" + "bgt 21b\n" + "22:" // Height 2: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "udot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "udot z9.s, z7.b, z0.b[0]\n" + "add x28, x28, #0x10\n" + "udot z12.s, z6.b, z1.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z13.s, z7.b, z1.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[0]\n" + "udot z14.s, z6.b, z1.b[0]\n" + "udot z11.s, z7.b, z0.b[0]\n" + "udot z15.s, z7.b, z1.b[0]\n" + "ble 23f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "udot z12.s, z6.b, z1.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[1]\n" + "udot z13.s, z7.b, z1.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[1]\n" + "udot z14.s, z6.b, z1.b[1]\n" + "udot z11.s, z7.b, z0.b[1]\n" + "udot z15.s, z7.b, z1.b[1]\n" + "ble 23f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "udot z12.s, z6.b, z1.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[2]\n" + "udot z13.s, z7.b, z1.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[2]\n" + "udot z14.s, z6.b, z1.b[2]\n" + "udot z11.s, z7.b, z0.b[2]\n" + "udot z15.s, z7.b, z1.b[2]\n" + "ble 23f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "udot z12.s, z6.b, z1.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[3]\n" + "udot z13.s, z7.b, z1.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[3]\n" + "udot z14.s, z6.b, z1.b[3]\n" + "udot z11.s, z7.b, z0.b[3]\n" + "udot z15.s, z7.b, z1.b[3]\n" + "23:" // Height 2: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 18b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "24:" // Height 2: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 15b\n" + "b 74f\n" + "25:" // Height 3 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 26f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, 
[%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "add x27, x27, x19, LSL #2\n" + "b 27f\n" + "26:" // Height 3: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "27:" // Height 3: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x15\n" + "incw x19\n" + "whilelt p3.s, x19, x15\n" + "incw x19\n" + "whilelt p2.s, x19, x15\n" + "incw x19\n" + "whilelt p1.s, x19, x15\n" + "tbz %x[flags], #0, 28f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "b 29f\n" + "28:" // Height 3: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "29:" // Height 3: setup done + "mov x12, #0x0\n" + "30:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 31f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "cbnz x12, 32f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "b 32f\n" + "31:" // Height 3: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "32:" // Height 3: input setup done + "cmp x11, #0x10\n" + "ble 34f\n" + "33:" // Height 3: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "udot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "udot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "udot z12.s, z6.b, z1.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x26, x26, #0x10\n" + "udot z16.s, z6.b, z2.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "cmp x11, #0x10\n" + "udot z13.s, z7.b, z1.b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "udot z17.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "udot z10.s, z6.b, z0.b[0]\n" + "udot z14.s, z6.b, z1.b[0]\n" + "udot z18.s, z6.b, z2.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "udot z11.s, z7.b, z0.b[0]\n" + "udot z15.s, z7.b, z1.b[0]\n" + "udot z19.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "udot z8.s, z6.b, z0.b[1]\n" + "udot z12.s, z6.b, z1.b[1]\n" + "udot z16.s, z6.b, z2.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "udot z9.s, z7.b, z0.b[1]\n" + "udot z13.s, z7.b, z1.b[1]\n" + "udot z17.s, z7.b, z2.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "udot z10.s, z6.b, z0.b[1]\n" + "udot z14.s, z6.b, z1.b[1]\n" + "udot z18.s, z6.b, z2.b[1]\n" + "ld1b { z6.b }, 
p5/Z, [x14, #-8, MUL VL]\n" + "udot z11.s, z7.b, z0.b[1]\n" + "udot z15.s, z7.b, z1.b[1]\n" + "udot z19.s, z7.b, z2.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "udot z8.s, z6.b, z0.b[2]\n" + "udot z12.s, z6.b, z1.b[2]\n" + "udot z16.s, z6.b, z2.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "udot z9.s, z7.b, z0.b[2]\n" + "udot z13.s, z7.b, z1.b[2]\n" + "udot z17.s, z7.b, z2.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "udot z10.s, z6.b, z0.b[2]\n" + "udot z14.s, z6.b, z1.b[2]\n" + "udot z18.s, z6.b, z2.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "udot z11.s, z7.b, z0.b[2]\n" + "udot z15.s, z7.b, z1.b[2]\n" + "udot z19.s, z7.b, z2.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "udot z8.s, z6.b, z0.b[3]\n" + "udot z12.s, z6.b, z1.b[3]\n" + "udot z16.s, z6.b, z2.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[3]\n" + "udot z13.s, z7.b, z1.b[3]\n" + "udot z17.s, z7.b, z2.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "udot z10.s, z6.b, z0.b[3]\n" + "udot z14.s, z6.b, z1.b[3]\n" + "udot z18.s, z6.b, z2.b[3]\n" + "udot z11.s, z7.b, z0.b[3]\n" + "udot z15.s, z7.b, z1.b[3]\n" + "udot z19.s, z7.b, z2.b[3]\n" + "bgt 33b\n" + "34:" // Height 3: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "udot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "udot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "udot z12.s, z6.b, z1.b[0]\n" + "add x26, x26, #0x10\n" + "udot z13.s, z7.b, z1.b[0]\n" + "udot z16.s, z6.b, z2.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z17.s, z7.b, z2.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[0]\n" + "udot z14.s, z6.b, z1.b[0]\n" + "udot z18.s, z6.b, z2.b[0]\n" + "udot z11.s, z7.b, z0.b[0]\n" + "udot z15.s, z7.b, z1.b[0]\n" + "udot z19.s, z7.b, z2.b[0]\n" + "ble 35f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "udot z12.s, z6.b, z1.b[1]\n" + "udot z16.s, z6.b, z2.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[1]\n" + "udot z13.s, z7.b, z1.b[1]\n" + "udot z17.s, z7.b, z2.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[1]\n" + "udot z14.s, z6.b, z1.b[1]\n" + "udot z18.s, z6.b, z2.b[1]\n" + "udot z11.s, z7.b, z0.b[1]\n" + "udot z15.s, z7.b, z1.b[1]\n" + "udot z19.s, z7.b, z2.b[1]\n" + "ble 35f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "udot z12.s, z6.b, z1.b[2]\n" + "udot z16.s, z6.b, z2.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[2]\n" + "udot z13.s, z7.b, z1.b[2]\n" + "udot z17.s, z7.b, z2.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[2]\n" + "udot z14.s, z6.b, z1.b[2]\n" + "udot z18.s, z6.b, z2.b[2]\n" + "udot z11.s, z7.b, z0.b[2]\n" + "udot z15.s, z7.b, z1.b[2]\n" + "udot z19.s, z7.b, z2.b[2]\n" + "ble 35f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "udot z12.s, z6.b, z1.b[3]\n" + "udot z16.s, z6.b, z2.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL 
VL]\n" + "udot z9.s, z7.b, z0.b[3]\n" + "udot z13.s, z7.b, z1.b[3]\n" + "udot z17.s, z7.b, z2.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[3]\n" + "udot z14.s, z6.b, z1.b[3]\n" + "udot z18.s, z6.b, z2.b[3]\n" + "udot z11.s, z7.b, z0.b[3]\n" + "udot z15.s, z7.b, z1.b[3]\n" + "udot z19.s, z7.b, z2.b[3]\n" + "35:" // Height 3: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 30b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "36:" // Height 3: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 27b\n" + "b 74f\n" + "37:" // Height 4 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 38f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "b 39f\n" + "38:" // Height 4: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "39:" // Height 4: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x15\n" + "incw x19\n" + "whilelt p3.s, x19, x15\n" + "incw x19\n" + "whilelt p2.s, x19, x15\n" + "incw x19\n" + "whilelt p1.s, x19, x15\n" + "tbz %x[flags], #0, 40f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" + "b 41f\n" + "40:" // Height 4: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "41:" // Height 4: setup done + "mov x12, #0x0\n" + "42:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr 
x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 43f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "cbnz x12, 44f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "b 44f\n" + "43:" // Height 4: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "44:" // Height 4: input setup done + "cmp x11, #0x10\n" + "ble 46f\n" + "45:" // Height 4: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "udot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "udot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "udot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "udot z16.s, z6.b, z2.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x24, x24, #0x10\n" + "udot z13.s, z7.b, z1.b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x10\n" + "udot z20.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z17.s, z7.b, z2.b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "udot z21.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "udot z10.s, z6.b, z0.b[0]\n" + "udot z14.s, z6.b, z1.b[0]\n" + "udot z18.s, z6.b, z2.b[0]\n" + "udot z22.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "udot z11.s, z7.b, z0.b[0]\n" + "udot z15.s, z7.b, z1.b[0]\n" + "udot z19.s, z7.b, z2.b[0]\n" + "udot z23.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "udot z8.s, z6.b, z0.b[1]\n" + "udot z12.s, z6.b, z1.b[1]\n" + "udot z16.s, z6.b, z2.b[1]\n" + "udot z20.s, z6.b, z3.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "udot z9.s, z7.b, z0.b[1]\n" + "udot z13.s, z7.b, z1.b[1]\n" + "udot z17.s, z7.b, z2.b[1]\n" + "udot z21.s, z7.b, z3.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "udot z10.s, z6.b, z0.b[1]\n" + "udot z14.s, z6.b, z1.b[1]\n" + "udot z18.s, z6.b, z2.b[1]\n" + "udot z22.s, z6.b, z3.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "udot z11.s, z7.b, z0.b[1]\n" + "udot z15.s, z7.b, z1.b[1]\n" + "udot z19.s, z7.b, z2.b[1]\n" + "udot z23.s, z7.b, z3.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "udot z8.s, z6.b, z0.b[2]\n" + "udot z12.s, z6.b, z1.b[2]\n" + "udot z16.s, z6.b, z2.b[2]\n" + "udot z20.s, z6.b, z3.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "udot z9.s, z7.b, z0.b[2]\n" + "udot z13.s, z7.b, z1.b[2]\n" + "udot z17.s, z7.b, z2.b[2]\n" + "udot z21.s, z7.b, z3.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "udot z10.s, z6.b, z0.b[2]\n" + "udot z14.s, z6.b, z1.b[2]\n" + "udot z18.s, z6.b, z2.b[2]\n" + "udot z22.s, z6.b, z3.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "udot z11.s, z7.b, z0.b[2]\n" + "udot z15.s, z7.b, z1.b[2]\n" + "udot z19.s, z7.b, z2.b[2]\n" + "udot z23.s, z7.b, z3.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "udot z8.s, z6.b, z0.b[3]\n" + "udot z12.s, z6.b, z1.b[3]\n" + "udot z16.s, z6.b, z2.b[3]\n" + "udot z20.s, z6.b, z3.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, 
#-2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[3]\n" + "udot z13.s, z7.b, z1.b[3]\n" + "udot z17.s, z7.b, z2.b[3]\n" + "udot z21.s, z7.b, z3.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "udot z10.s, z6.b, z0.b[3]\n" + "udot z14.s, z6.b, z1.b[3]\n" + "udot z18.s, z6.b, z2.b[3]\n" + "udot z22.s, z6.b, z3.b[3]\n" + "udot z11.s, z7.b, z0.b[3]\n" + "udot z15.s, z7.b, z1.b[3]\n" + "udot z19.s, z7.b, z2.b[3]\n" + "udot z23.s, z7.b, z3.b[3]\n" + "bgt 45b\n" + "46:" // Height 4: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "udot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "udot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "udot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "udot z16.s, z6.b, z2.b[0]\n" + "add x24, x24, #0x10\n" + "udot z13.s, z7.b, z1.b[0]\n" + "udot z17.s, z7.b, z2.b[0]\n" + "udot z20.s, z6.b, z3.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z21.s, z7.b, z3.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[0]\n" + "udot z14.s, z6.b, z1.b[0]\n" + "udot z18.s, z6.b, z2.b[0]\n" + "udot z22.s, z6.b, z3.b[0]\n" + "udot z11.s, z7.b, z0.b[0]\n" + "udot z15.s, z7.b, z1.b[0]\n" + "udot z19.s, z7.b, z2.b[0]\n" + "udot z23.s, z7.b, z3.b[0]\n" + "ble 47f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "udot z12.s, z6.b, z1.b[1]\n" + "udot z16.s, z6.b, z2.b[1]\n" + "udot z20.s, z6.b, z3.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[1]\n" + "udot z13.s, z7.b, z1.b[1]\n" + "udot z17.s, z7.b, z2.b[1]\n" + "udot z21.s, z7.b, z3.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[1]\n" + "udot z14.s, z6.b, z1.b[1]\n" + "udot z18.s, z6.b, z2.b[1]\n" + "udot z22.s, z6.b, z3.b[1]\n" + "udot z11.s, z7.b, z0.b[1]\n" + "udot z15.s, z7.b, z1.b[1]\n" + "udot z19.s, z7.b, z2.b[1]\n" + "udot z23.s, z7.b, z3.b[1]\n" + "ble 47f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "udot z12.s, z6.b, z1.b[2]\n" + "udot z16.s, z6.b, z2.b[2]\n" + "udot z20.s, z6.b, z3.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[2]\n" + "udot z13.s, z7.b, z1.b[2]\n" + "udot z17.s, z7.b, z2.b[2]\n" + "udot z21.s, z7.b, z3.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[2]\n" + "udot z14.s, z6.b, z1.b[2]\n" + "udot z18.s, z6.b, z2.b[2]\n" + "udot z22.s, z6.b, z3.b[2]\n" + "udot z11.s, z7.b, z0.b[2]\n" + "udot z15.s, z7.b, z1.b[2]\n" + "udot z19.s, z7.b, z2.b[2]\n" + "udot z23.s, z7.b, z3.b[2]\n" + "ble 47f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "udot z12.s, z6.b, z1.b[3]\n" + "udot z16.s, z6.b, z2.b[3]\n" + "udot z20.s, z6.b, z3.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[3]\n" + "udot z13.s, z7.b, z1.b[3]\n" + "udot z17.s, z7.b, z2.b[3]\n" + "udot z21.s, z7.b, z3.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[3]\n" + "udot z14.s, z6.b, z1.b[3]\n" + "udot z18.s, z6.b, 
z2.b[3]\n" + "udot z22.s, z6.b, z3.b[3]\n" + "udot z11.s, z7.b, z0.b[3]\n" + "udot z15.s, z7.b, z1.b[3]\n" + "udot z19.s, z7.b, z2.b[3]\n" + "udot z23.s, z7.b, z3.b[3]\n" + "47:" // Height 4: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 42b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1w { z20.s }, p4, [x25]\n" + "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "48:" // Height 4: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 39b\n" + "b 74f\n" + "49:" // Height 5 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 50f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "b 51f\n" + "50:" // Height 5: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "51:" // Height 5: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x15\n" + "incw x19\n" + "whilelt p3.s, x19, x15\n" + "incw x19\n" + "whilelt p2.s, x19, x15\n" + "incw x19\n" + "whilelt p1.s, x19, x15\n" + "tbz %x[flags], #0, 52f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x23]\n" + "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n" + "b 53f\n" + "52:" // Height 5: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov 
z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "53:" // Height 5: setup done + "mov x12, #0x0\n" + "54:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 55f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x12, 56f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "b 56f\n" + "55:" // Height 5: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, x19\n" + "add x22, x24, x19\n" + "56:" // Height 5: input setup done + "cmp x11, #0x10\n" + "ble 58f\n" + "57:" // Height 5: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "udot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "udot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "udot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "udot z16.s, z6.b, z2.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "udot z13.s, z7.b, z1.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x22, x22, #0x10\n" + "udot z20.s, z6.b, z3.b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x10\n" + "udot z24.s, z6.b, z4.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z17.s, z7.b, z2.b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "udot z21.s, z7.b, z3.b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "udot z25.s, z7.b, z4.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "udot z10.s, z6.b, z0.b[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "udot z14.s, z6.b, z1.b[0]\n" + "udot z18.s, z6.b, z2.b[0]\n" + "udot z22.s, z6.b, z3.b[0]\n" + "udot z26.s, z6.b, z4.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "udot z11.s, z7.b, z0.b[0]\n" + "udot z15.s, z7.b, z1.b[0]\n" + "udot z19.s, z7.b, z2.b[0]\n" + "udot z23.s, z7.b, z3.b[0]\n" + "udot z27.s, z7.b, z4.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "udot z8.s, z6.b, z0.b[1]\n" + "udot z12.s, z6.b, z1.b[1]\n" + "udot z16.s, z6.b, z2.b[1]\n" + "udot z20.s, z6.b, z3.b[1]\n" + "udot z24.s, z6.b, z4.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "udot z9.s, z7.b, z0.b[1]\n" + "udot z13.s, z7.b, z1.b[1]\n" + "udot z17.s, z7.b, z2.b[1]\n" + "udot z21.s, z7.b, z3.b[1]\n" + "udot z25.s, z7.b, z4.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "udot z10.s, z6.b, z0.b[1]\n" + "udot z14.s, z6.b, z1.b[1]\n" + "udot z18.s, z6.b, z2.b[1]\n" + "udot z22.s, z6.b, z3.b[1]\n" + "udot z26.s, z6.b, z4.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "udot z11.s, z7.b, z0.b[1]\n" + "udot z15.s, z7.b, z1.b[1]\n" + "udot z19.s, z7.b, z2.b[1]\n" + "udot z23.s, z7.b, z3.b[1]\n" + "udot z27.s, z7.b, z4.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #-7, MUL 
VL]\n" + "udot z8.s, z6.b, z0.b[2]\n" + "udot z12.s, z6.b, z1.b[2]\n" + "udot z16.s, z6.b, z2.b[2]\n" + "udot z20.s, z6.b, z3.b[2]\n" + "udot z24.s, z6.b, z4.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "udot z9.s, z7.b, z0.b[2]\n" + "udot z13.s, z7.b, z1.b[2]\n" + "udot z17.s, z7.b, z2.b[2]\n" + "udot z21.s, z7.b, z3.b[2]\n" + "udot z25.s, z7.b, z4.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "udot z10.s, z6.b, z0.b[2]\n" + "udot z14.s, z6.b, z1.b[2]\n" + "udot z18.s, z6.b, z2.b[2]\n" + "udot z22.s, z6.b, z3.b[2]\n" + "udot z26.s, z6.b, z4.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "udot z11.s, z7.b, z0.b[2]\n" + "udot z15.s, z7.b, z1.b[2]\n" + "udot z19.s, z7.b, z2.b[2]\n" + "udot z23.s, z7.b, z3.b[2]\n" + "udot z27.s, z7.b, z4.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "udot z8.s, z6.b, z0.b[3]\n" + "udot z12.s, z6.b, z1.b[3]\n" + "udot z16.s, z6.b, z2.b[3]\n" + "udot z20.s, z6.b, z3.b[3]\n" + "udot z24.s, z6.b, z4.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[3]\n" + "udot z13.s, z7.b, z1.b[3]\n" + "udot z17.s, z7.b, z2.b[3]\n" + "udot z21.s, z7.b, z3.b[3]\n" + "udot z25.s, z7.b, z4.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "udot z10.s, z6.b, z0.b[3]\n" + "udot z14.s, z6.b, z1.b[3]\n" + "udot z18.s, z6.b, z2.b[3]\n" + "udot z22.s, z6.b, z3.b[3]\n" + "udot z26.s, z6.b, z4.b[3]\n" + "udot z11.s, z7.b, z0.b[3]\n" + "udot z15.s, z7.b, z1.b[3]\n" + "udot z19.s, z7.b, z2.b[3]\n" + "udot z23.s, z7.b, z3.b[3]\n" + "udot z27.s, z7.b, z4.b[3]\n" + "bgt 57b\n" + "58:" // Height 5: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "udot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "udot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "udot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "udot z16.s, z6.b, z2.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "udot z13.s, z7.b, z1.b[0]\n" + "add x22, x22, #0x10\n" + "udot z17.s, z7.b, z2.b[0]\n" + "udot z20.s, z6.b, z3.b[0]\n" + "udot z24.s, z6.b, z4.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z21.s, z7.b, z3.b[0]\n" + "udot z25.s, z7.b, z4.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[0]\n" + "udot z14.s, z6.b, z1.b[0]\n" + "udot z18.s, z6.b, z2.b[0]\n" + "udot z22.s, z6.b, z3.b[0]\n" + "udot z26.s, z6.b, z4.b[0]\n" + "udot z11.s, z7.b, z0.b[0]\n" + "udot z15.s, z7.b, z1.b[0]\n" + "udot z19.s, z7.b, z2.b[0]\n" + "udot z23.s, z7.b, z3.b[0]\n" + "udot z27.s, z7.b, z4.b[0]\n" + "ble 59f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "udot z12.s, z6.b, z1.b[1]\n" + "udot z16.s, z6.b, z2.b[1]\n" + "udot z20.s, z6.b, z3.b[1]\n" + "udot z24.s, z6.b, z4.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[1]\n" + "udot z13.s, z7.b, z1.b[1]\n" + "udot z17.s, z7.b, z2.b[1]\n" + "udot z21.s, z7.b, z3.b[1]\n" + "udot z25.s, z7.b, z4.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[1]\n" + "udot z14.s, z6.b, z1.b[1]\n" + "udot z18.s, z6.b, z2.b[1]\n" + "udot z22.s, z6.b, z3.b[1]\n" + "udot z26.s, z6.b, z4.b[1]\n" + "udot z11.s, z7.b, 
z0.b[1]\n" + "udot z15.s, z7.b, z1.b[1]\n" + "udot z19.s, z7.b, z2.b[1]\n" + "udot z23.s, z7.b, z3.b[1]\n" + "udot z27.s, z7.b, z4.b[1]\n" + "ble 59f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "udot z12.s, z6.b, z1.b[2]\n" + "udot z16.s, z6.b, z2.b[2]\n" + "udot z20.s, z6.b, z3.b[2]\n" + "udot z24.s, z6.b, z4.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[2]\n" + "udot z13.s, z7.b, z1.b[2]\n" + "udot z17.s, z7.b, z2.b[2]\n" + "udot z21.s, z7.b, z3.b[2]\n" + "udot z25.s, z7.b, z4.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[2]\n" + "udot z14.s, z6.b, z1.b[2]\n" + "udot z18.s, z6.b, z2.b[2]\n" + "udot z22.s, z6.b, z3.b[2]\n" + "udot z26.s, z6.b, z4.b[2]\n" + "udot z11.s, z7.b, z0.b[2]\n" + "udot z15.s, z7.b, z1.b[2]\n" + "udot z19.s, z7.b, z2.b[2]\n" + "udot z23.s, z7.b, z3.b[2]\n" + "udot z27.s, z7.b, z4.b[2]\n" + "ble 59f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "udot z12.s, z6.b, z1.b[3]\n" + "udot z16.s, z6.b, z2.b[3]\n" + "udot z20.s, z6.b, z3.b[3]\n" + "udot z24.s, z6.b, z4.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[3]\n" + "udot z13.s, z7.b, z1.b[3]\n" + "udot z17.s, z7.b, z2.b[3]\n" + "udot z21.s, z7.b, z3.b[3]\n" + "udot z25.s, z7.b, z4.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[3]\n" + "udot z14.s, z6.b, z1.b[3]\n" + "udot z18.s, z6.b, z2.b[3]\n" + "udot z22.s, z6.b, z3.b[3]\n" + "udot z26.s, z6.b, z4.b[3]\n" + "udot z11.s, z7.b, z0.b[3]\n" + "udot z15.s, z7.b, z1.b[3]\n" + "udot z19.s, z7.b, z2.b[3]\n" + "udot z23.s, z7.b, z3.b[3]\n" + "udot z27.s, z7.b, z4.b[3]\n" + "59:" // Height 5: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 54b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1w { z20.s }, p4, [x25]\n" + "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "st1w { z24.s }, p4, [x23]\n" + "st1w { z25.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x23, #3, MUL VL]\n" + "addvl x23, x23, #4\n" + "60:" // Height 5: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 51b\n" + "b 74f\n" + "61:" // Height 6 + "ldr x15, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr 
x19, [%x[args_ptr], %[offsetof_output_offset]]\n" + "tbz %x[flags], #2, 62f\n" + "ldr x13, [%x[output_ptr], #0x0]\n" + "add x13, x13, x19, LSL #2\n" + "ldr x9, [%x[output_ptr], #0x8]\n" + "ldr x27, [%x[output_ptr], #0x10]\n" + "add x9, x9, x19, LSL #2\n" + "ldr x25, [%x[output_ptr], #0x18]\n" + "ldr x23, [%x[output_ptr], #0x20]\n" + "add x27, x27, x19, LSL #2\n" + "ldr x21, [%x[output_ptr], #0x28]\n" + "add %x[output_ptr], %x[output_ptr], #0x30\n" + "add x25, x25, x19, LSL #2\n" + "add x23, x23, x19, LSL #2\n" + "add x21, x21, x19, LSL #2\n" + "b 63f\n" + "62:" // Height 6: setup direct output + "mov x13, %x[output_ptr]\n" + "add x9, x13, x19, LSL #2\n" + "add x27, x9, x19, LSL #2\n" + "add x25, x27, x19, LSL #2\n" + "add x23, x25, x19, LSL #2\n" + "add x21, x23, x19, LSL #2\n" + "add %x[output_ptr], x21, x19, LSL #2\n" + "63:" // Height 6: Column loop + "mov x19, #0x0\n" + "whilelt p4.s, x19, x15\n" + "incw x19\n" + "whilelt p3.s, x19, x15\n" + "incw x19\n" + "whilelt p2.s, x19, x15\n" + "incw x19\n" + "whilelt p1.s, x19, x15\n" + "tbz %x[flags], #0, 64f\n" + "ld1w { z8.s }, p4/Z, [x13]\n" + "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n" + "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n" + "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n" + "ld1w { z12.s }, p4/Z, [x9]\n" + "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n" + "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n" + "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n" + "ld1w { z16.s }, p4/Z, [x27]\n" + "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n" + "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n" + "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n" + "ld1w { z20.s }, p4/Z, [x25]\n" + "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n" + "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n" + "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n" + "ld1w { z24.s }, p4/Z, [x23]\n" + "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n" + "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n" + "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n" + "ld1w { z28.s }, p4/Z, [x21]\n" + "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n" + "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n" + "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n" + "b 65f\n" + "64:" // Height 6: no accumulate + "mov z8.s, #0x0\n" + "mov z9.s, #0x0\n" + "mov z10.s, #0x0\n" + "mov z11.s, #0x0\n" + "mov z12.s, #0x0\n" + "mov z13.s, #0x0\n" + "mov z14.s, #0x0\n" + "mov z15.s, #0x0\n" + "mov z16.s, #0x0\n" + "mov z17.s, #0x0\n" + "mov z18.s, #0x0\n" + "mov z19.s, #0x0\n" + "mov z20.s, #0x0\n" + "mov z21.s, #0x0\n" + "mov z22.s, #0x0\n" + "mov z23.s, #0x0\n" + "mov z24.s, #0x0\n" + "mov z25.s, #0x0\n" + "mov z26.s, #0x0\n" + "mov z27.s, #0x0\n" + "mov z28.s, #0x0\n" + "mov z29.s, #0x0\n" + "mov z30.s, #0x0\n" + "mov z31.s, #0x0\n" + "65:" // Height 6: setup done + "mov x12, #0x0\n" + "66:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w11, [x20, x12, LSL #0x2]\n" + "tbz %x[flags], #3, 67f\n" + "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n" + "add x20, x20, x19, LSL #3\n" + "ldr x10, [x20, #0x0]\n" + "ldr x28, [x20, #0x8]\n" + "ldr x26, [x20, #0x10]\n" + "ldr x24, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x20, [x20, #0x28]\n" + "cbnz x12, 68f\n" + "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x10, x10, x19\n" + "add x28, x28, x19\n" + "add x26, x26, x19\n" + "add x24, x24, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 68f\n" + "67:" // Height 6: setup direct input + "mov x10, %x[input_ptr]\n" + "add x28, x10, x19\n" + "add x26, x28, x19\n" + "add x24, x26, 
x19\n" + "add x22, x24, x19\n" + "add x20, x22, x19\n" + "68:" // Height 6: input setup done + "cmp x11, #0x10\n" + "ble 70f\n" + "69:" // Height 6: Multiply loop: Main loop head + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "sub x11, x11, #0x10\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "udot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "udot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "udot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "udot z16.s, z6.b, z2.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "udot z13.s, z7.b, z1.b[0]\n" + "ld1rqb { z5.b }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "udot z20.s, z6.b, z3.b[0]\n" + "prfm pldl1keep, [x10, #0x80]\n" + "add x20, x20, #0x10\n" + "udot z24.s, z6.b, z4.b[0]\n" + "prfm pldl1keep, [x28, #0x80]\n" + "cmp x11, #0x10\n" + "udot z28.s, z6.b, z5.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z17.s, z7.b, z2.b[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "udot z21.s, z7.b, z3.b[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "udot z25.s, z7.b, z4.b[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "udot z29.s, z7.b, z5.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "udot z10.s, z6.b, z0.b[0]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "udot z14.s, z6.b, z1.b[0]\n" + "udot z18.s, z6.b, z2.b[0]\n" + "udot z22.s, z6.b, z3.b[0]\n" + "udot z26.s, z6.b, z4.b[0]\n" + "udot z30.s, z6.b, z5.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n" + "udot z11.s, z7.b, z0.b[0]\n" + "udot z15.s, z7.b, z1.b[0]\n" + "udot z19.s, z7.b, z2.b[0]\n" + "udot z23.s, z7.b, z3.b[0]\n" + "udot z27.s, z7.b, z4.b[0]\n" + "udot z31.s, z7.b, z5.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n" + "udot z8.s, z6.b, z0.b[1]\n" + "udot z12.s, z6.b, z1.b[1]\n" + "udot z16.s, z6.b, z2.b[1]\n" + "udot z20.s, z6.b, z3.b[1]\n" + "udot z24.s, z6.b, z4.b[1]\n" + "udot z28.s, z6.b, z5.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n" + "udot z9.s, z7.b, z0.b[1]\n" + "udot z13.s, z7.b, z1.b[1]\n" + "udot z17.s, z7.b, z2.b[1]\n" + "udot z21.s, z7.b, z3.b[1]\n" + "udot z25.s, z7.b, z4.b[1]\n" + "udot z29.s, z7.b, z5.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n" + "addvl x14, x14, #16\n" + "udot z10.s, z6.b, z0.b[1]\n" + "udot z14.s, z6.b, z1.b[1]\n" + "udot z18.s, z6.b, z2.b[1]\n" + "udot z22.s, z6.b, z3.b[1]\n" + "udot z26.s, z6.b, z4.b[1]\n" + "udot z30.s, z6.b, z5.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n" + "udot z11.s, z7.b, z0.b[1]\n" + "udot z15.s, z7.b, z1.b[1]\n" + "udot z19.s, z7.b, z2.b[1]\n" + "udot z23.s, z7.b, z3.b[1]\n" + "udot z27.s, z7.b, z4.b[1]\n" + "udot z31.s, z7.b, z5.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n" + "udot z8.s, z6.b, z0.b[2]\n" + "udot z12.s, z6.b, z1.b[2]\n" + "udot z16.s, z6.b, z2.b[2]\n" + "udot z20.s, z6.b, z3.b[2]\n" + "udot z24.s, z6.b, z4.b[2]\n" + "udot z28.s, z6.b, z5.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n" + "udot z9.s, z7.b, z0.b[2]\n" + "udot z13.s, z7.b, z1.b[2]\n" + "udot z17.s, z7.b, z2.b[2]\n" + "udot z21.s, z7.b, z3.b[2]\n" + "udot z25.s, z7.b, z4.b[2]\n" + "udot z29.s, z7.b, z5.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n" + "udot z10.s, z6.b, z0.b[2]\n" + "udot z14.s, z6.b, z1.b[2]\n" + "udot z18.s, z6.b, z2.b[2]\n" + "udot z22.s, z6.b, z3.b[2]\n" + "udot z26.s, z6.b, z4.b[2]\n" + "udot z30.s, z6.b, z5.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n" + "udot 
z11.s, z7.b, z0.b[2]\n" + "udot z15.s, z7.b, z1.b[2]\n" + "udot z19.s, z7.b, z2.b[2]\n" + "udot z23.s, z7.b, z3.b[2]\n" + "udot z27.s, z7.b, z4.b[2]\n" + "udot z31.s, z7.b, z5.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n" + "udot z8.s, z6.b, z0.b[3]\n" + "udot z12.s, z6.b, z1.b[3]\n" + "udot z16.s, z6.b, z2.b[3]\n" + "udot z20.s, z6.b, z3.b[3]\n" + "udot z24.s, z6.b, z4.b[3]\n" + "udot z28.s, z6.b, z5.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[3]\n" + "udot z13.s, z7.b, z1.b[3]\n" + "udot z17.s, z7.b, z2.b[3]\n" + "udot z21.s, z7.b, z3.b[3]\n" + "udot z25.s, z7.b, z4.b[3]\n" + "udot z29.s, z7.b, z5.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n" + "udot z10.s, z6.b, z0.b[3]\n" + "udot z14.s, z6.b, z1.b[3]\n" + "udot z18.s, z6.b, z2.b[3]\n" + "udot z22.s, z6.b, z3.b[3]\n" + "udot z26.s, z6.b, z4.b[3]\n" + "udot z30.s, z6.b, z5.b[3]\n" + "udot z11.s, z7.b, z0.b[3]\n" + "udot z15.s, z7.b, z1.b[3]\n" + "udot z19.s, z7.b, z2.b[3]\n" + "udot z23.s, z7.b, z3.b[3]\n" + "udot z27.s, z7.b, z4.b[3]\n" + "udot z31.s, z7.b, z5.b[3]\n" + "bgt 69b\n" + "70:" // Height 6: Multiply loop: Single iteration only + "ld1b { z6.b }, p5/Z, [x14]\n" + "whilelt p0.b, XZR, x11\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "ld1rqb { z0.b }, p0/Z, [x10]\n" + "udot z8.s, z6.b, z0.b[0]\n" + "ld1rqb { z1.b }, p0/Z, [x28]\n" + "add x10, x10, #0x10\n" + "udot z9.s, z7.b, z0.b[0]\n" + "ld1rqb { z2.b }, p0/Z, [x26]\n" + "add x28, x28, #0x10\n" + "udot z12.s, z6.b, z1.b[0]\n" + "ld1rqb { z3.b }, p0/Z, [x24]\n" + "add x26, x26, #0x10\n" + "udot z16.s, z6.b, z2.b[0]\n" + "ld1rqb { z4.b }, p0/Z, [x22]\n" + "add x24, x24, #0x10\n" + "udot z13.s, z7.b, z1.b[0]\n" + "ld1rqb { z5.b }, p0/Z, [x20]\n" + "add x22, x22, #0x10\n" + "udot z20.s, z6.b, z3.b[0]\n" + "add x20, x20, #0x10\n" + "udot z17.s, z7.b, z2.b[0]\n" + "udot z24.s, z6.b, z4.b[0]\n" + "udot z28.s, z6.b, z5.b[0]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z21.s, z7.b, z3.b[0]\n" + "udot z25.s, z7.b, z4.b[0]\n" + "udot z29.s, z7.b, z5.b[0]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[0]\n" + "udot z14.s, z6.b, z1.b[0]\n" + "udot z18.s, z6.b, z2.b[0]\n" + "udot z22.s, z6.b, z3.b[0]\n" + "udot z26.s, z6.b, z4.b[0]\n" + "udot z30.s, z6.b, z5.b[0]\n" + "udot z11.s, z7.b, z0.b[0]\n" + "udot z15.s, z7.b, z1.b[0]\n" + "udot z19.s, z7.b, z2.b[0]\n" + "udot z23.s, z7.b, z3.b[0]\n" + "udot z27.s, z7.b, z4.b[0]\n" + "udot z31.s, z7.b, z5.b[0]\n" + "ble 71f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "udot z12.s, z6.b, z1.b[1]\n" + "udot z16.s, z6.b, z2.b[1]\n" + "udot z20.s, z6.b, z3.b[1]\n" + "udot z24.s, z6.b, z4.b[1]\n" + "udot z28.s, z6.b, z5.b[1]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[1]\n" + "udot z13.s, z7.b, z1.b[1]\n" + "udot z17.s, z7.b, z2.b[1]\n" + "udot z21.s, z7.b, z3.b[1]\n" + "udot z25.s, z7.b, z4.b[1]\n" + "udot z29.s, z7.b, z5.b[1]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[1]\n" + "udot z14.s, z6.b, z1.b[1]\n" + "udot z18.s, z6.b, z2.b[1]\n" + "udot z22.s, z6.b, z3.b[1]\n" + "udot z26.s, z6.b, z4.b[1]\n" + "udot z30.s, z6.b, z5.b[1]\n" + "udot z11.s, z7.b, z0.b[1]\n" + "udot z15.s, z7.b, z1.b[1]\n" + "udot z19.s, z7.b, z2.b[1]\n" + "udot z23.s, z7.b, z3.b[1]\n" + "udot z27.s, z7.b, z4.b[1]\n" + "udot z31.s, z7.b, z5.b[1]\n" + "ble 71f\n" + 
"ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "subs x11, x11, #0x4\n" + "udot z12.s, z6.b, z1.b[2]\n" + "udot z16.s, z6.b, z2.b[2]\n" + "udot z20.s, z6.b, z3.b[2]\n" + "udot z24.s, z6.b, z4.b[2]\n" + "udot z28.s, z6.b, z5.b[2]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[2]\n" + "udot z13.s, z7.b, z1.b[2]\n" + "udot z17.s, z7.b, z2.b[2]\n" + "udot z21.s, z7.b, z3.b[2]\n" + "udot z25.s, z7.b, z4.b[2]\n" + "udot z29.s, z7.b, z5.b[2]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[2]\n" + "udot z14.s, z6.b, z1.b[2]\n" + "udot z18.s, z6.b, z2.b[2]\n" + "udot z22.s, z6.b, z3.b[2]\n" + "udot z26.s, z6.b, z4.b[2]\n" + "udot z30.s, z6.b, z5.b[2]\n" + "udot z11.s, z7.b, z0.b[2]\n" + "udot z15.s, z7.b, z1.b[2]\n" + "udot z19.s, z7.b, z2.b[2]\n" + "udot z23.s, z7.b, z3.b[2]\n" + "udot z27.s, z7.b, z4.b[2]\n" + "udot z31.s, z7.b, z5.b[2]\n" + "ble 71f\n" + "ld1b { z6.b }, p5/Z, [x14]\n" + "udot z8.s, z6.b, z0.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n" + "udot z12.s, z6.b, z1.b[3]\n" + "udot z16.s, z6.b, z2.b[3]\n" + "udot z20.s, z6.b, z3.b[3]\n" + "udot z24.s, z6.b, z4.b[3]\n" + "udot z28.s, z6.b, z5.b[3]\n" + "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n" + "udot z9.s, z7.b, z0.b[3]\n" + "udot z13.s, z7.b, z1.b[3]\n" + "udot z17.s, z7.b, z2.b[3]\n" + "udot z21.s, z7.b, z3.b[3]\n" + "udot z25.s, z7.b, z4.b[3]\n" + "udot z29.s, z7.b, z5.b[3]\n" + "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n" + "addvl x14, x14, #4\n" + "udot z10.s, z6.b, z0.b[3]\n" + "udot z14.s, z6.b, z1.b[3]\n" + "udot z18.s, z6.b, z2.b[3]\n" + "udot z22.s, z6.b, z3.b[3]\n" + "udot z26.s, z6.b, z4.b[3]\n" + "udot z30.s, z6.b, z5.b[3]\n" + "udot z11.s, z7.b, z0.b[3]\n" + "udot z15.s, z7.b, z1.b[3]\n" + "udot z19.s, z7.b, z2.b[3]\n" + "udot z23.s, z7.b, z3.b[3]\n" + "udot z27.s, z7.b, z4.b[3]\n" + "udot z31.s, z7.b, z5.b[3]\n" + "71:" // Height 6: Multiply loop: multiply skip + "prfm pldl1keep, [x10, #0x80]\n" + "add x12, x12, #0x1\n" + "prfm pldl1keep, [x28, #0x80]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x20, #0x80]\n" + "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n" + "cmp x12, x19\n" + "bne 66b\n" + "prfm pstl1keep, [x13, #0x0]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "prfm pstl1keep, [x27, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x21, #0x0]\n" + "st1w { z8.s }, p4, [x13]\n" + "st1w { z9.s }, p3, [x13, #1, MUL VL]\n" + "st1w { z10.s }, p2, [x13, #2, MUL VL]\n" + "st1w { z11.s }, p1, [x13, #3, MUL VL]\n" + "addvl x13, x13, #4\n" + "st1w { z12.s }, p4, [x9]\n" + "st1w { z13.s }, p3, [x9, #1, MUL VL]\n" + "st1w { z14.s }, p2, [x9, #2, MUL VL]\n" + "st1w { z15.s }, p1, [x9, #3, MUL VL]\n" + "addvl x9, x9, #4\n" + "st1w { z16.s }, p4, [x27]\n" + "st1w { z17.s }, p3, [x27, #1, MUL VL]\n" + "st1w { z18.s }, p2, [x27, #2, MUL VL]\n" + "st1w { z19.s }, p1, [x27, #3, MUL VL]\n" + "addvl x27, x27, #4\n" + "st1w { z20.s }, p4, [x25]\n" + "st1w { z21.s }, p3, [x25, #1, MUL VL]\n" + "st1w { z22.s }, p2, [x25, #2, MUL VL]\n" + "st1w { z23.s }, p1, [x25, #3, MUL VL]\n" + "addvl x25, x25, #4\n" + "st1w { z24.s }, p4, [x23]\n" + "st1w { z25.s }, p3, [x23, #1, MUL VL]\n" + "st1w { z26.s }, p2, [x23, #2, MUL VL]\n" + "st1w { z27.s }, p1, [x23, #3, MUL VL]\n" + "addvl x23, x23, #4\n" + "st1w { z28.s }, p4, [x21]\n" + "st1w { z29.s }, p3, [x21, #1, MUL VL]\n" + "st1w { 
z30.s }, p2, [x21, #2, MUL VL]\n" + "st1w { z31.s }, p1, [x21, #3, MUL VL]\n" + "addvl x21, x21, #4\n" + "72:" // Height 6: Writeback done + "mov x19, #0x0\n" + "incw x19, ALL, MUL #4\n" + "subs x15, x15, x19\n" + "bgt 63b\n" + "subs %x[M], %x[M], #0x6\n" + "beq 74f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 73f\n" + "add x20, x20, #0x6\n" + "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "73:" // Update direct input + "mov x19, #0x6\n" + "madd %x[input_ptr], x19, x20, %x[input_ptr]\n" + "b 1b\n" + "74:" // Exit + + : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr) + : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); +} + +} // namespace arm_gemm +#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp similarity index 89% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp index 43107e45fa..12bb758b68 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp @@ -31,9 +31,9 @@ namespace arm_gemm { // Actual kernel implementations -void sve_interleaved_bf16fp32_dot_3VLx8(const bfloat16 *, const bfloat16 *, float *, int, int, int); +void sve_interleaved_bf16fp32_dot_8x3VL(const bfloat16 *, const bfloat16 *, float *, int, int, int); -class interleaved_bf16fp32_dot_3VLx8 { +class cls_sve_interleaved_bf16fp32_dot_8x3VL { public: typedef bfloat16 operand_type; typedef float result_type; @@ -59,9 +59,9 @@ class interleaved_bf16fp32_dot_3VLx8 { // Use the standard fixed size transforms. 
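// The "udot" pattern in the kernel body above is the whole computation in
// miniature: each "udot z8.s, z6.b, z0.b[2]" multiplies the four unsigned
// bytes in every 32-bit lane of z6 by the four bytes of the selected 32-bit
// element of z0 and accumulates the dot product into the matching lane of
// z8. A minimal scalar model of that instruction, for reference only
// (udot_lane_model is a hypothetical name, not part of this patch):

#include <cstddef>
#include <cstdint>

void udot_lane_model(uint32_t *acc, const uint8_t *b, const uint8_t *a,
                     size_t idx, size_t lanes)
{
    for (size_t i = 0; i < lanes; i++) {     // one 32-bit lane per iteration
        uint32_t sum = 0;
        for (size_t j = 0; j < 4; j++) {     // 4-way u8 dot product
            sum += uint32_t(b[4 * i + j]) * uint32_t(a[4 * idx + j]);
        }
        acc[i] += sum;                       // "udot zAcc.s, zB.b, zA.b[idx]"
    }
}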
StdTransformsSVE transforms = {}; - kern_type kernel=sve_interleaved_bf16fp32_dot_3VLx8; + kern_type kernel=sve_interleaved_bf16fp32_dot_8x3VL; - interleaved_bf16fp32_dot_3VLx8(const CPUInfo *) + cls_sve_interleaved_bf16fp32_dot_8x3VL(const CPUInfo *) { } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp similarity index 99% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp index 7e20ed0971..adee900337 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp @@ -28,7 +28,7 @@ namespace arm_gemm { -void sve_interleaved_bf16fp32_dot_3VLx8(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { +void sve_interleaved_bf16fp32_dot_8x3VL(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { const bfloat16 *a_ptr = Apanel; float *c_ptr = Cpanel; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp similarity index 89% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp index f1353e2086..2889dd7f0f 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp @@ -31,9 +31,9 @@ namespace arm_gemm { // Actual kernel implementations -void sve_interleaved_bf16fp32_mmla_3VLx8(const bfloat16 *, const bfloat16 *, float *, int, int, int); +void sve_interleaved_bf16fp32_mmla_8x3VL(const bfloat16 *, const bfloat16 *, float *, int, int, int); -class interleaved_bf16fp32_mmla_3VLx8 { +class cls_sve_interleaved_bf16fp32_mmla_8x3VL { public: typedef bfloat16 operand_type; typedef float result_type; @@ -59,9 +59,9 @@ class interleaved_bf16fp32_mmla_3VLx8 { // Use the standard fixed size transforms. 
StdTransformsSVE transforms = {}; - kern_type kernel=sve_interleaved_bf16fp32_mmla_3VLx8; + kern_type kernel=sve_interleaved_bf16fp32_mmla_8x3VL; - interleaved_bf16fp32_mmla_3VLx8(const CPUInfo *) + cls_sve_interleaved_bf16fp32_mmla_8x3VL(const CPUInfo *) { } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp similarity index 99% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp index 16cc69b2a6..e43404e608 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp @@ -28,7 +28,7 @@ namespace arm_gemm { -void sve_interleaved_bf16fp32_mmla_3VLx8(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { +void sve_interleaved_bf16fp32_mmla_8x3VL(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { const bfloat16 *a_ptr = Apanel; float *c_ptr = Cpanel; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp similarity index 90% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp index 816c0cd095..eb946d9dfa 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp @@ -31,9 +31,9 @@ namespace arm_gemm { // Actual kernel implementations -void sve_interleaved_fp16_mla_3VLx8(const __fp16 *, const __fp16 *, __fp16 *, int, int, int); +void sve_interleaved_fp16_mla_8x3VL(const __fp16 *, const __fp16 *, __fp16 *, int, int, int); -class interleaved_fp16_mla_3VLx8 { +class cls_sve_interleaved_fp16_mla_8x3VL { public: typedef __fp16 operand_type; typedef __fp16 result_type; @@ -59,9 +59,9 @@ class interleaved_fp16_mla_3VLx8 { // Use the standard fixed size transforms. 
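// Every rename in this series follows the same scheme: the old suffix
// "3VLx8" (width x height) becomes "8x3VL" (height x width), and the class
// gains a "cls_" prefix matching the full kernel symbol. The tile shape is
// unchanged: 8 rows by three SVE vectors of columns, so the width scales
// with the hardware vector length. A sketch of that geometry (TileGeometry
// and vl_words are illustrative names, not from this patch):

#include <cstddef>

struct TileGeometry {
    static constexpr size_t out_height = 8; // the "8": rows per tile
    size_t vl_words;                        // 32-bit lanes in one SVE vector
    size_t out_width() const { return 3 * vl_words; } // the "3VL": columns
};
// e.g. on 256-bit SVE, vl_words == 8, so an fp32 tile is 8 x 24 accumulator
// lanes: the 24 vector registers z8..z31 used by the interleaved kernels.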
StdTransformsSVE transforms = {}; - kern_type kernel=sve_interleaved_fp16_mla_3VLx8; + kern_type kernel=sve_interleaved_fp16_mla_8x3VL; - interleaved_fp16_mla_3VLx8(const CPUInfo *) + cls_sve_interleaved_fp16_mla_8x3VL(const CPUInfo *) { } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp similarity index 99% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp index f2050cbd56..46b8770409 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp @@ -28,7 +28,7 @@ namespace arm_gemm { -void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) { +void sve_interleaved_fp16_mla_8x3VL(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) { const __fp16 *a_ptr = Apanel; __fp16 *c_ptr = Cpanel; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp similarity index 89% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp index cce90fb135..b84ba83b6a 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp @@ -31,9 +31,9 @@ namespace arm_gemm { // Actual kernel implementations -void sve_interleaved_fp32_mla_3VLx8(const float *, const float *, float *, int, int, int); +void sve_interleaved_fp32_mla_8x3VL(const float *, const float *, float *, int, int, int); -class interleaved_fp32_mla_3VLx8 { +class cls_sve_interleaved_fp32_mla_8x3VL { public: typedef float operand_type; typedef float result_type; @@ -59,9 +59,9 @@ class interleaved_fp32_mla_3VLx8 { // Use the standard fixed size transforms. 
StdTransformsSVE transforms = {}; - kern_type kernel=sve_interleaved_fp32_mla_3VLx8; + kern_type kernel=sve_interleaved_fp32_mla_8x3VL; - interleaved_fp32_mla_3VLx8(const CPUInfo *) + cls_sve_interleaved_fp32_mla_8x3VL(const CPUInfo *) { } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp similarity index 99% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp index cd178c478a..1e05a308b5 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp @@ -28,7 +28,7 @@ namespace arm_gemm { -void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { +void sve_interleaved_fp32_mla_8x3VL(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { const float *a_ptr = Apanel; float *c_ptr = Cpanel; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL.hpp similarity index 89% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL.hpp index 4ca43cd5c9..96216960ff 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL.hpp @@ -31,9 +31,9 @@ namespace arm_gemm { // Actual kernel implementations -void sve_interleaved_fp32_mmla_3VLx8(const float *, const float *, float *, int, int, int); +void sve_interleaved_fp32_mmla_8x3VL(const float *, const float *, float *, int, int, int); -class interleaved_fp32_mmla_3VLx8 { +class cls_sve_interleaved_fp32_mmla_8x3VL { public: typedef float operand_type; typedef float result_type; @@ -59,9 +59,9 @@ class interleaved_fp32_mmla_3VLx8 { // Use the standard fixed size transforms. 
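// The generic.cpp hunks change nothing but the symbol name: the signature
// (Apanel, Bpanel, Cpanel, ablocks, bblocks, K) is untouched and the 99%
// similarity index confirms the bodies are identical. For reference, a
// naive model of the arithmetic one interleaved tile performs; the panel
// layout (K groups of 8 row values in A, K groups of W column values in B)
// is the usual arm_gemm interleaved convention, stated here as an
// assumption rather than taken from this patch:

#include <cstddef>

void interleaved_tile_model(const float *Apanel, const float *Bpanel,
                            float *Cpanel, int K, int W /* = 3 * VL */)
{
    for (int r = 0; r < 8; r++)
        for (int c = 0; c < W; c++)
            Cpanel[r * W + c] = 0.0f;       // fresh accumulators per tile
    for (int k = 0; k < K; k++)             // one rank-1 update per K step
        for (int r = 0; r < 8; r++)
            for (int c = 0; c < W; c++)
                Cpanel[r * W + c] += Apanel[k * 8 + r] * Bpanel[k * W + c];
}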
StdTransformsSVE transforms = {}; - kern_type kernel=sve_interleaved_fp32_mmla_3VLx8; + kern_type kernel=sve_interleaved_fp32_mmla_8x3VL; - interleaved_fp32_mmla_3VLx8(const CPUInfo *) + cls_sve_interleaved_fp32_mmla_8x3VL(const CPUInfo *) { } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp similarity index 99% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp index a404ae9c82..39daf0ff20 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp @@ -28,7 +28,7 @@ namespace arm_gemm { -void sve_interleaved_fp32_mmla_3VLx8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { +void sve_interleaved_fp32_mmla_8x3VL(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { const float *a_ptr = Apanel; float *c_ptr = Cpanel; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp similarity index 86% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp index e40ba215b4..3e16915cd4 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp @@ -31,9 +31,9 @@ namespace arm_gemm { // Actual kernel implementations -void sve_interleaved_s8s32_dot_3VLx8(const int8_t *, const int8_t *, int32_t *, int, int, int); +void sve_interleaved_s8s32_dot_8x3VL(const int8_t *, const int8_t *, int32_t *, int, int, int); -class interleaved_s8s32_dot_3VLx8 { +class cls_sve_interleaved_s8s32_dot_8x3VL { public: typedef int8_t operand_type; typedef int32_t result_type; @@ -58,10 +58,11 @@ class interleaved_s8s32_dot_3VLx8 { // Use the standard fixed size transforms. 
StdTransformsSVE transforms = {}; + StdTransformsSVE transforms_quantized = {}; - kern_type kernel=sve_interleaved_s8s32_dot_3VLx8; + kern_type kernel=sve_interleaved_s8s32_dot_8x3VL; - interleaved_s8s32_dot_3VLx8(const CPUInfo *) + cls_sve_interleaved_s8s32_dot_8x3VL(const CPUInfo *) { } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp similarity index 99% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp index cdc70705c5..674c2400bf 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp @@ -28,7 +28,7 @@ namespace arm_gemm { -void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { +void sve_interleaved_s8s32_dot_8x3VL(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { const int8_t *a_ptr = Apanel; int32_t *c_ptr = Cpanel; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp similarity index 86% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp index 361598d594..02b3451c54 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp @@ -31,9 +31,9 @@ namespace arm_gemm { // Actual kernel implementations -void sve_interleaved_s8s32_mmla_3VLx8(const int8_t *, const int8_t *, int32_t *, int, int, int); +void sve_interleaved_s8s32_mmla_8x3VL(const int8_t *, const int8_t *, int32_t *, int, int, int); -class interleaved_s8s32_mmla_3VLx8 { +class cls_sve_interleaved_s8s32_mmla_8x3VL { public: typedef int8_t operand_type; typedef int32_t result_type; @@ -58,10 +58,11 @@ class interleaved_s8s32_mmla_3VLx8 { // Use the standard fixed size transforms. 
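// Besides the rename, the quantized integer kernels (s8s32 here, u8u32
// below) now declare a second transform set, "transforms_quantized", so the
// same s32-accumulating kernel can also back the requantizing GEMM path.
// The requantization itself happens outside the kernel; schematically it
// maps each s32 accumulator back to 8 bits. A generic fixed-point sketch
// only; the per-tensor multiplier/shift and this rounding are assumptions,
// not the library's exact output stage:

#include <algorithm>
#include <cstdint>

int8_t requantize_model(int32_t acc, int32_t multiplier /* Q31 */,
                        int shift /* >= 0 */, int32_t out_zero_point)
{
    int64_t v = (int64_t)acc * multiplier;            // fixed-point scale
    v = (v + (1ll << (30 + shift))) >> (31 + shift);  // round to nearest
    v += out_zero_point;
    return (int8_t)std::min<int64_t>(127, std::max<int64_t>(-128, v));
}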
StdTransformsSVE transforms = {}; + StdTransformsSVE transforms_quantized = {}; - kern_type kernel=sve_interleaved_s8s32_mmla_3VLx8; + kern_type kernel=sve_interleaved_s8s32_mmla_8x3VL; - interleaved_s8s32_mmla_3VLx8(const CPUInfo *) + cls_sve_interleaved_s8s32_mmla_8x3VL(const CPUInfo *) { } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp similarity index 99% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp index cde9ec32e9..578aa01732 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp @@ -28,7 +28,7 @@ namespace arm_gemm { -void sve_interleaved_s8s32_mmla_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { +void sve_interleaved_s8s32_mmla_8x3VL(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { const int8_t *a_ptr = Apanel; int32_t *c_ptr = Cpanel; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp similarity index 86% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp index 252f38ec63..832a224199 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp @@ -31,9 +31,9 @@ namespace arm_gemm { // Actual kernel implementations -void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); +void sve_interleaved_u8u32_dot_8x3VL(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); -class interleaved_u8u32_dot_3VLx8 { +class cls_sve_interleaved_u8u32_dot_8x3VL { public: typedef uint8_t operand_type; typedef uint32_t result_type; @@ -58,10 +58,11 @@ class interleaved_u8u32_dot_3VLx8 { // Use the standard fixed size transforms. 
StdTransformsSVE transforms = {}; + StdTransformsSVE transforms_quantized = {}; - kern_type kernel=sve_interleaved_u8u32_dot_3VLx8; + kern_type kernel=sve_interleaved_u8u32_dot_8x3VL; - interleaved_u8u32_dot_3VLx8(const CPUInfo *) + cls_sve_interleaved_u8u32_dot_8x3VL(const CPUInfo *) { } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp similarity index 99% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp index 6626f8463b..891869c767 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp @@ -28,7 +28,7 @@ namespace arm_gemm { -void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { +void sve_interleaved_u8u32_dot_8x3VL(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { const uint8_t *a_ptr = Apanel; uint32_t *c_ptr = Cpanel; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp similarity index 86% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp index ed44a9d8fc..4fdaab84bd 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp @@ -31,9 +31,9 @@ namespace arm_gemm { // Actual kernel implementations -void sve_interleaved_u8u32_mmla_3VLx8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); +void sve_interleaved_u8u32_mmla_8x3VL(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); -class interleaved_u8u32_mmla_3VLx8 { +class cls_sve_interleaved_u8u32_mmla_8x3VL { public: typedef uint8_t operand_type; typedef uint32_t result_type; @@ -58,10 +58,11 @@ class interleaved_u8u32_mmla_3VLx8 { // Use the standard fixed size transforms. 
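// The u8u32 header mirrors the s8s32 change exactly. Both dot-product
// kernels consume operands blocked in groups of four along K, because
// sdot/udot reduce four bytes per 32-bit lane at a time; the declared
// transforms pack A and B into that layout. A sketch of the A-side packing,
// assuming K is already padded to a multiple of 4 (pack_a_block4 is an
// illustrative helper, not part of the patch):

#include <cstddef>
#include <cstdint>

void pack_a_block4(uint8_t *out, const uint8_t *a, size_t lda, size_t K)
{
    for (size_t k = 0; k < K; k += 4)       // one 4-byte K group at a time
        for (size_t r = 0; r < 8; r++)      // all 8 interleaved rows
            for (size_t j = 0; j < 4; j++)  // the bytes a dot indexes as [j]
                *out++ = a[r * lda + k + j];
}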
StdTransformsSVE transforms = {}; + StdTransformsSVE transforms_quantized = {}; - kern_type kernel=sve_interleaved_u8u32_mmla_3VLx8; + kern_type kernel=sve_interleaved_u8u32_mmla_8x3VL; - interleaved_u8u32_mmla_3VLx8(const CPUInfo *) + cls_sve_interleaved_u8u32_mmla_8x3VL(const CPUInfo *) { } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp similarity index 99% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp index 81a1dbcf51..fa08a9d091 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp @@ -28,7 +28,7 @@ namespace arm_gemm { -void sve_interleaved_u8u32_mmla_3VLx8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { +void sve_interleaved_u8u32_mmla_8x3VL(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { const uint8_t *a_ptr = Apanel; uint32_t *c_ptr = Cpanel; diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp similarity index 91% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp index b555066195..2097d76a54 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp @@ -31,9 +31,9 @@ namespace arm_gemm { // Actual kernel implementations -void sve_smallK_hybrid_fp32_mla_1VLx8(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); +void sve_smallK_hybrid_fp32_mla_8x1VL(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool); -class smallK_hybrid_fp32_mla_1VLx8 +class cls_sve_smallK_hybrid_fp32_mla_8x1VL { public: typedef float operand_type; @@ -75,9 +75,9 @@ class smallK_hybrid_fp32_mla_1VLx8 StdTransformsSVE transforms = {}; // Default to the generic kernel - kern_type kernel=sve_smallK_hybrid_fp32_mla_1VLx8; + kern_type kernel=sve_smallK_hybrid_fp32_mla_8x1VL; - smallK_hybrid_fp32_mla_1VLx8(const CPUInfo *) + cls_sve_smallK_hybrid_fp32_mla_8x1VL(const CPUInfo *) { } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp similarity index 99% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp index 5501688054..e07cfa8218 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -33,7 +33,7 @@ namespace arm_gemm { -void sve_smallK_hybrid_fp32_mla_1VLx8(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) { +void sve_smallK_hybrid_fp32_mla_8x1VL(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) { const long loops_count = iceildiv(N, (int)get_vector_length()) - 1; const long ldab = lda * sizeof(float); const long ldcb = ldc * sizeof(float); diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp similarity index 91% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp index eef1e4cc65..e50c05ba39 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp @@ -31,9 +31,9 @@ namespace arm_gemm { // Actual kernel implementations -void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); +void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool); -class smallK_hybrid_s8s32_dot_1VLx8 +class cls_sve_smallK_hybrid_s8s32_dot_8x1VL { public: typedef int8_t operand_type; @@ -75,9 +75,9 @@ class smallK_hybrid_s8s32_dot_1VLx8 StdTransformsSVE transforms = {}; // Default to the generic kernel - kern_type kernel=sve_smallK_hybrid_s8s32_dot_1VLx8; + kern_type kernel=sve_smallK_hybrid_s8s32_dot_8x1VL; - smallK_hybrid_s8s32_dot_1VLx8(const CPUInfo *) + cls_sve_smallK_hybrid_s8s32_dot_8x1VL(const CPUInfo *) { } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp similarity index 82% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp index e2fbdcb61b..5770076d04 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -33,7 +33,7 @@ namespace arm_gemm { -void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) { +void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *bias, Activation act, bool) { const long loops_count = iceildiv(N, (int)get_vector_length()) - 1; const long ldab = lda * sizeof(int8_t); const long ldcb = ldc * sizeof(int32_t); @@ -112,55 +112,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" - "mov z24.s, #0\n" "ptrue p7.b\n" - "mov z25.s, #0\n" "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "ld1rqb z1.b, p6/z, [a_ptr1]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "ld1rqb z2.b, p6/z, [a_ptr2]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1rqb z3.b, p6/z, [a_ptr3]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "ld1rqb z4.b, p6/z, [a_ptr4]\n" + "addvl %[b_ptr0], %[b_ptr0], #1\n" + "ld1rqb z5.b, p6/z, [a_ptr5]\n" + "ld1rqb z6.b, p6/z, [a_ptr6]\n" + "ld1rqb z7.b, p6/z, [a_ptr7]\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.s, #0\n" + "mov z26.s, #0\n" "mov z27.s, #0\n" - "whilelt p0.s, %[temp], %[last_width]\n" "mov z28.s, #0\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n" "mov z29.s, #0\n" - "ld1rqb z1.b, p6/z, [a_ptr1]\n" "mov z30.s, #0\n" - "ld1rqb z2.b, p6/z, [a_ptr2]\n" "mov z31.s, #0\n" - "ld1rqb z3.b, p6/z, [a_ptr3]\n" "sdot z24.s, z16.b, z0.b[0]\n" - "ld1rqb z4.b, p6/z, [a_ptr4]\n" "sdot z25.s, z16.b, z1.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr5]\n" "sdot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z6.b, p6/z, [a_ptr6]\n" "sdot z27.s, z16.b, z3.b[0]\n" - "ld1rqb z7.b, p6/z, [a_ptr7]\n" "sdot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "sdot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "sdot z30.s, z16.b, z6.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "sdot z31.s, z16.b, z7.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" - "addvl %[b_ptr0], %[b_ptr0], #1\n" - "cbz %[loops], 2f\n" - "subs %[loops], %[loops], #0x1\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #1\n" "b.eq 3f\n" "4:\n" "st1w z24.s, p7, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "mov z24.s, #0\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "addvl %[b_ptr0], %[b_ptr0], #1\n" "sdot z24.s, z16.b, z0.b[0]\n" "st1w z26.s, p7, [c_ptr2]\n" "mov z26.s, #0\n" @@ -186,10 +185,9 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "mov z31.s, #0\n" "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "sdot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #1\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" 
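// Every hunk of this smallK hybrid rewrite applies the same restructuring:
// the predicate setup ("ptrue"/"whilelt"), the "ld1rqb" row loads, and the
// prefetches are hoisted ahead of the "cbz %[loops], 2f" test; the main
// loop reloads B at the top of each iteration rather than pre-loading it
// for the next one at the bottom; and a dedicated loops==0 path at label
// "2:" now falls through to a shared predicated store tail at "5:", which
// is why the trailing "addvl c_ptrN" bumps after the last stores can be
// dropped. In C-like terms the control flow becomes roughly the following
// (a schematic of the branch structure, not the generated code):

void small_k_schematic(int loops)
{
    // setup: predicates, A-row loads, B preload, prefetches (all hoisted)
    if (loops == 0) {            // "cbz %[loops], 2f"
        // 2: zero accumulators, one dot pass over the preloaded data
    } else {
        // zero accumulators, first dot pass
        while (--loops > 0) {    // "4:" ... "b.ne 4b"
            // store finished tile, reload B, next dot pass
        }
        // 3: last full-width stores and final dot pass
    }
    // 5: shared tail: predicated "st1w ... p0" stores, no pointer bumps
}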
@@ -201,6 +199,8 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "st1w z24.s, p7, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "mov z24.s, #0\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "addvl %[b_ptr0], %[b_ptr0], #1\n" "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" @@ -230,23 +230,34 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" "sdot z31.s, z16.b, z7.b[0]\n" + "b 5f\n" "2:\n" + "mov z24.s, #0\n" + "mov z25.s, #0\n" + "mov z26.s, #0\n" + "mov z27.s, #0\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "5:\n" "st1w z24.s, p0, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -325,112 +336,112 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" - "mov z24.s, #0\n" "ptrue p7.b\n" - "mov z25.s, #0\n" "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "ld1rqb z1.b, p6/z, [a_ptr1]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "ld1rqb z2.b, p6/z, [a_ptr2]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1rqb z3.b, p6/z, [a_ptr3]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "ld1rqb z4.b, p6/z, [a_ptr4]\n" + "ld1rqb z5.b, p6/z, [a_ptr5]\n" + "ld1rqb z6.b, p6/z, [a_ptr6]\n" + "ld1rqb z7.b, p6/z, [a_ptr7]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #2\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.s, #0\n" + "mov z26.s, #0\n" + "mov z27.s, #0\n" "mov z28.s, #0\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n" "mov z29.s, #0\n" - "ld1rqb z1.b, p6/z, [a_ptr1]\n" "mov z30.s, #0\n" - "ld1rqb z2.b, p6/z, [a_ptr2]\n" "mov z31.s, #0\n" - "ld1rqb z3.b, p6/z, [a_ptr3]\n" "sdot z24.s, z16.b, z0.b[0]\n" - "ld1rqb z4.b, p6/z, [a_ptr4]\n" "sdot z25.s, z16.b, z1.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr5]\n" "sdot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z6.b, p6/z, [a_ptr6]\n" "sdot z27.s, z16.b, z3.b[0]\n" - "ld1rqb z7.b, p6/z, [a_ptr7]\n" "sdot z28.s, z16.b, z4.b[0]\n" - "whilelt p0.s, %[temp], %[last_width]\n" "sdot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "sdot z30.s, z16.b, z6.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "sdot z31.s, z16.b, z7.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "sdot z24.s, z17.b, z0.b[1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "sdot z25.s, z17.b, z1.b[1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "sdot z26.s, z17.b, 
z2.b[1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "sdot z27.s, z17.b, z3.b[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #2\n" "sdot z28.s, z17.b, z4.b[1]\n" "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" - "cbz %[loops], 2f\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" "b.eq 3f\n" "4:\n" "st1w z24.s, p7, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "mov z24.s, #0\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #2\n" - "st1w z25.s, p7, [c_ptr1]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" - "mov z25.s, #0\n" + "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" + "addvl %[b_ptr0], %[b_ptr0], #2\n" "sdot z24.s, z16.b, z0.b[0]\n" "st1w z26.s, p7, [c_ptr2]\n" "mov z26.s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "addvl c_ptr2, c_ptr2, #1\n" "sdot z25.s, z16.b, z1.b[0]\n" "st1w z27.s, p7, [c_ptr3]\n" "mov z27.s, #0\n" - "addvl c_ptr2, c_ptr2, #1\n" + "addvl c_ptr3, c_ptr3, #1\n" "sdot z26.s, z16.b, z2.b[0]\n" "st1w z28.s, p7, [c_ptr4]\n" "mov z28.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" + "addvl c_ptr4, c_ptr4, #1\n" "sdot z27.s, z16.b, z3.b[0]\n" "st1w z29.s, p7, [c_ptr5]\n" "mov z29.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" + "addvl c_ptr5, c_ptr5, #1\n" "sdot z28.s, z16.b, z4.b[0]\n" "st1w z30.s, p7, [c_ptr6]\n" "mov z30.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" + "addvl c_ptr6, c_ptr6, #1\n" "sdot z29.s, z16.b, z5.b[0]\n" "st1w z31.s, p7, [c_ptr7]\n" "mov z31.s, #0\n" - "addvl c_ptr6, c_ptr6, #1\n" - "sdot z30.s, z16.b, z6.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "sdot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "sdot z24.s, z17.b, z0.b[1]\n" "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" - "sdot z25.s, z17.b, z1.b[1]\n" + "sdot z24.s, z17.b, z0.b[1]\n" "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" - "sdot z26.s, z17.b, z2.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" - "sdot z27.s, z17.b, z3.b[1]\n" + "sdot z26.s, z17.b, z2.b[1]\n" "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" - "sdot z28.s, z17.b, z4.b[1]\n" + "sdot z27.s, z17.b, z3.b[1]\n" "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" - "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z28.s, z17.b, z4.b[1]\n" "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" - "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z29.s, z17.b, z5.b[1]\n" "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" + "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" "b.ne 4b\n" "3:\n" "st1w z24.s, p7, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "mov z24.s, #0\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "addvl %[b_ptr0], %[b_ptr0], #2\n" "st1w z25.s, p7, [c_ptr1]\n" @@ -470,23 +481,42 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" + "b 5f\n" "2:\n" + "mov z24.s, #0\n" + "mov z25.s, #0\n" + "mov z26.s, #0\n" + "mov z27.s, #0\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "sdot z26.s, z17.b, z2.b[1]\n" + "sdot z27.s, z17.b, z3.b[1]\n" + 
"sdot z28.s, z17.b, z4.b[1]\n" + "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z31.s, z17.b, z7.b[1]\n" + "5:\n" "st1w z24.s, p0, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -565,48 +595,50 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" - "mov z24.s, #0\n" "ptrue p7.b\n" - "mov z25.s, #0\n" "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "ld1rqb z1.b, p6/z, [a_ptr1]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "ld1rqb z2.b, p6/z, [a_ptr2]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1rqb z3.b, p6/z, [a_ptr3]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "ld1rqb z4.b, p6/z, [a_ptr4]\n" + "ld1rqb z5.b, p6/z, [a_ptr5]\n" + "ld1rqb z6.b, p6/z, [a_ptr6]\n" + "ld1rqb z7.b, p6/z, [a_ptr7]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.s, #0\n" + "mov z26.s, #0\n" + "mov z27.s, #0\n" "mov z28.s, #0\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n" "mov z29.s, #0\n" - "ld1rqb z1.b, p6/z, [a_ptr1]\n" "mov z30.s, #0\n" - "ld1rqb z2.b, p6/z, [a_ptr2]\n" "mov z31.s, #0\n" - "ld1rqb z3.b, p6/z, [a_ptr3]\n" "sdot z24.s, z16.b, z0.b[0]\n" - "ld1rqb z4.b, p6/z, [a_ptr4]\n" "sdot z25.s, z16.b, z1.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr5]\n" "sdot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z6.b, p6/z, [a_ptr6]\n" "sdot z27.s, z16.b, z3.b[0]\n" - "ld1rqb z7.b, p6/z, [a_ptr7]\n" "sdot z28.s, z16.b, z4.b[0]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "sdot z29.s, z16.b, z5.b[0]\n" - "whilelt p0.s, %[temp], %[last_width]\n" "sdot z30.s, z16.b, z6.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "sdot z31.s, z16.b, z7.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "sdot z24.s, z17.b, z0.b[1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "sdot z25.s, z17.b, z1.b[1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "sdot z27.s, z17.b, z3.b[1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #3\n" "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" @@ -618,49 +650,46 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" "sdot z31.s, z18.b, z7.b[2]\n" - "cbz %[loops], 2f\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "b.eq 3f\n" "4:\n" "st1w z24.s, p7, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "mov z24.s, #0\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], 
%[b_ptr0], #3\n" - "st1w z25.s, p7, [c_ptr1]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" - "mov z25.s, #0\n" + "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "sdot z24.s, z16.b, z0.b[0]\n" "st1w z26.s, p7, [c_ptr2]\n" "mov z26.s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "addvl c_ptr2, c_ptr2, #1\n" "sdot z25.s, z16.b, z1.b[0]\n" "st1w z27.s, p7, [c_ptr3]\n" "mov z27.s, #0\n" - "addvl c_ptr2, c_ptr2, #1\n" + "addvl c_ptr3, c_ptr3, #1\n" "sdot z26.s, z16.b, z2.b[0]\n" "st1w z28.s, p7, [c_ptr4]\n" "mov z28.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" + "addvl c_ptr4, c_ptr4, #1\n" "sdot z27.s, z16.b, z3.b[0]\n" "st1w z29.s, p7, [c_ptr5]\n" "mov z29.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" + "addvl c_ptr5, c_ptr5, #1\n" "sdot z28.s, z16.b, z4.b[0]\n" "st1w z30.s, p7, [c_ptr6]\n" "mov z30.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" + "addvl c_ptr6, c_ptr6, #1\n" "sdot z29.s, z16.b, z5.b[0]\n" "st1w z31.s, p7, [c_ptr7]\n" "mov z31.s, #0\n" - "addvl c_ptr6, c_ptr6, #1\n" - "sdot z30.s, z16.b, z6.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" "sdot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "sdot z24.s, z17.b, z0.b[1]\n" "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "sdot z25.s, z17.b, z1.b[1]\n" @@ -676,7 +705,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z30.s, z17.b, z6.b[1]\n" "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "sdot z24.s, z18.b, z0.b[2]\n" "sdot z25.s, z18.b, z1.b[2]\n" "sdot z26.s, z18.b, z2.b[2]\n" @@ -690,11 +718,12 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "st1w z24.s, p7, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "mov z24.s, #0\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #3\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "sdot z24.s, z16.b, z0.b[0]\n" "st1w z26.s, p7, [c_ptr2]\n" "mov z26.s, #0\n" @@ -720,8 +749,9 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "mov z31.s, #0\n" "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "sdot z24.s, z17.b, z0.b[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" "sdot z31.s, z16.b, z7.b[0]\n" + "sdot z24.s, z17.b, z0.b[1]\n" "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" "sdot z27.s, z17.b, z3.b[1]\n" @@ -737,23 +767,50 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" "sdot z31.s, z18.b, z7.b[2]\n" + "b 5f\n" "2:\n" + "mov z24.s, #0\n" + "mov z25.s, #0\n" + "mov z26.s, #0\n" + "mov z27.s, #0\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "sdot z26.s, z17.b, z2.b[1]\n" + "sdot z27.s, z17.b, z3.b[1]\n" + "sdot 
z28.s, z17.b, z4.b[1]\n" + "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z31.s, z17.b, z7.b[1]\n" + "sdot z24.s, z18.b, z0.b[2]\n" + "sdot z25.s, z18.b, z1.b[2]\n" + "sdot z26.s, z18.b, z2.b[2]\n" + "sdot z27.s, z18.b, z3.b[2]\n" + "sdot z28.s, z18.b, z4.b[2]\n" + "sdot z29.s, z18.b, z5.b[2]\n" + "sdot z30.s, z18.b, z6.b[2]\n" + "sdot z31.s, z18.b, z7.b[2]\n" + "5:\n" "st1w z24.s, p0, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -832,50 +889,52 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" - "mov z24.s, #0\n" "ptrue p7.b\n" - "mov z25.s, #0\n" "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "ld1rqb z1.b, p6/z, [a_ptr1]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "ld1rqb z2.b, p6/z, [a_ptr2]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1rqb z3.b, p6/z, [a_ptr3]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "ld1rqb z4.b, p6/z, [a_ptr4]\n" + "ld1rqb z5.b, p6/z, [a_ptr5]\n" + "ld1rqb z6.b, p6/z, [a_ptr6]\n" + "ld1rqb z7.b, p6/z, [a_ptr7]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.s, #0\n" + "mov z26.s, #0\n" + "mov z27.s, #0\n" "mov z28.s, #0\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n" "mov z29.s, #0\n" - "ld1rqb z1.b, p6/z, [a_ptr1]\n" "mov z30.s, #0\n" - "ld1rqb z2.b, p6/z, [a_ptr2]\n" "mov z31.s, #0\n" - "ld1rqb z3.b, p6/z, [a_ptr3]\n" "sdot z24.s, z16.b, z0.b[0]\n" - "ld1rqb z4.b, p6/z, [a_ptr4]\n" "sdot z25.s, z16.b, z1.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr5]\n" "sdot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z6.b, p6/z, [a_ptr6]\n" "sdot z27.s, z16.b, z3.b[0]\n" - "ld1rqb z7.b, p6/z, [a_ptr7]\n" "sdot z28.s, z16.b, z4.b[0]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "sdot z29.s, z16.b, z5.b[0]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "sdot z30.s, z16.b, z6.b[0]\n" - "whilelt p0.s, %[temp], %[last_width]\n" "sdot z31.s, z16.b, z7.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "sdot z24.s, z17.b, z0.b[1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "sdot z25.s, z17.b, z1.b[1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "sdot z27.s, z17.b, z3.b[1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "sdot z29.s, z17.b, z5.b[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" "sdot z24.s, z18.b, z0.b[2]\n" @@ -894,50 +953,47 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z19.b, 
z5.b[3]\n" "sdot z30.s, z19.b, z6.b[3]\n" "sdot z31.s, z19.b, z7.b[3]\n" - "cbz %[loops], 2f\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "b.eq 3f\n" "4:\n" "st1w z24.s, p7, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "mov z24.s, #0\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "st1w z25.s, p7, [c_ptr1]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" - "mov z25.s, #0\n" + "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "sdot z24.s, z16.b, z0.b[0]\n" "st1w z26.s, p7, [c_ptr2]\n" "mov z26.s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "addvl c_ptr2, c_ptr2, #1\n" "sdot z25.s, z16.b, z1.b[0]\n" "st1w z27.s, p7, [c_ptr3]\n" "mov z27.s, #0\n" - "addvl c_ptr2, c_ptr2, #1\n" + "addvl c_ptr3, c_ptr3, #1\n" "sdot z26.s, z16.b, z2.b[0]\n" "st1w z28.s, p7, [c_ptr4]\n" "mov z28.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" + "addvl c_ptr4, c_ptr4, #1\n" "sdot z27.s, z16.b, z3.b[0]\n" "st1w z29.s, p7, [c_ptr5]\n" "mov z29.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" + "addvl c_ptr5, c_ptr5, #1\n" "sdot z28.s, z16.b, z4.b[0]\n" "st1w z30.s, p7, [c_ptr6]\n" "mov z30.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" + "addvl c_ptr6, c_ptr6, #1\n" "sdot z29.s, z16.b, z5.b[0]\n" "st1w z31.s, p7, [c_ptr7]\n" "mov z31.s, #0\n" - "addvl c_ptr6, c_ptr6, #1\n" - "sdot z30.s, z16.b, z6.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" "sdot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "sdot z24.s, z17.b, z0.b[1]\n" "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "sdot z25.s, z17.b, z1.b[1]\n" @@ -953,7 +1009,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z30.s, z17.b, z6.b[1]\n" "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "sdot z24.s, z18.b, z0.b[2]\n" "sdot z25.s, z18.b, z1.b[2]\n" "sdot z26.s, z18.b, z2.b[2]\n" @@ -962,7 +1017,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" "sdot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "sdot z24.s, z19.b, z0.b[3]\n" "sdot z25.s, z19.b, z1.b[3]\n" "sdot z26.s, z19.b, z2.b[3]\n" @@ -976,14 +1030,16 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "st1w z24.s, p7, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "mov z24.s, #0\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "sdot z24.s, z16.b, z0.b[0]\n" "st1w z26.s, p7, [c_ptr2]\n" "mov z26.s, #0\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "addvl c_ptr2, c_ptr2, #1\n" "sdot z25.s, z16.b, z1.b[0]\n" "st1w z27.s, p7, [c_ptr3]\n" @@ -1006,8 +1062,9 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "mov z31.s, #0\n" "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "sdot z24.s, z17.b, z0.b[1]\n" + "addvl 
%[b_ptr0], %[b_ptr0], #4\n" "sdot z31.s, z16.b, z7.b[0]\n" + "sdot z24.s, z17.b, z0.b[1]\n" "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" "sdot z27.s, z17.b, z3.b[1]\n" @@ -1031,23 +1088,58 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z19.b, z5.b[3]\n" "sdot z30.s, z19.b, z6.b[3]\n" "sdot z31.s, z19.b, z7.b[3]\n" + "b 5f\n" "2:\n" + "mov z24.s, #0\n" + "mov z25.s, #0\n" + "mov z26.s, #0\n" + "mov z27.s, #0\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "sdot z26.s, z17.b, z2.b[1]\n" + "sdot z27.s, z17.b, z3.b[1]\n" + "sdot z28.s, z17.b, z4.b[1]\n" + "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z31.s, z17.b, z7.b[1]\n" + "sdot z24.s, z18.b, z0.b[2]\n" + "sdot z25.s, z18.b, z1.b[2]\n" + "sdot z26.s, z18.b, z2.b[2]\n" + "sdot z27.s, z18.b, z3.b[2]\n" + "sdot z28.s, z18.b, z4.b[2]\n" + "sdot z29.s, z18.b, z5.b[2]\n" + "sdot z30.s, z18.b, z6.b[2]\n" + "sdot z31.s, z18.b, z7.b[2]\n" + "sdot z24.s, z19.b, z0.b[3]\n" + "sdot z25.s, z19.b, z1.b[3]\n" + "sdot z26.s, z19.b, z2.b[3]\n" + "sdot z27.s, z19.b, z3.b[3]\n" + "sdot z28.s, z19.b, z4.b[3]\n" + "sdot z29.s, z19.b, z5.b[3]\n" + "sdot z30.s, z19.b, z6.b[3]\n" + "sdot z31.s, z19.b, z7.b[3]\n" + "5:\n" "st1w z24.s, p0, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -1126,46 +1218,48 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" - "mov z24.s, #0\n" "ptrue p7.b\n" - "mov z25.s, #0\n" "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z28.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z29.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z30.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z31.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "addvl %[b_ptr0], %[b_ptr0], #5\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z25.s, #0\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "whilelt p0.s, %[temp], %[last_width]\n" + "mov z26.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" - "sdot z24.s, z16.b, z0.b[0]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "sdot z25.s, z16.b, z1.b[0]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "sdot z26.s, z16.b, z2.b[0]\n" + 
"mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" - "sdot z27.s, z16.b, z3.b[0]\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" "sdot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "sdot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "sdot z30.s, z16.b, z6.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "sdot z31.s, z16.b, z7.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "sdot z24.s, z17.b, z0.b[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #5\n" "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" "sdot z27.s, z17.b, z3.b[1]\n" @@ -1205,84 +1299,79 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z20.b, z5.b[0]\n" "sdot z30.s, z20.b, z6.b[0]\n" "sdot z31.s, z20.b, z7.b[0]\n" - "cbz %[loops], 2f\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "b.eq 3f\n" "4:\n" "st1w z24.s, p7, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "mov z24.s, #0\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "addvl %[b_ptr0], %[b_ptr0], #5\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "mov z25.s, #0\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" "sdot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" - "mov z26.s, #0\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "addvl c_ptr1, c_ptr1, #1\n" + "addvl c_ptr4, c_ptr4, #1\n" "sdot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "sdot z26.s, z16.b, z2.b[0]\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" "sdot z27.s, z16.b, z3.b[0]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "sdot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr2, c_ptr2, #1\n" - "sdot z28.s, z16.b, z4.b[0]\n" "st1w z30.s, p7, [c_ptr6]\n" "mov z30.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" "sdot z29.s, z16.b, z5.b[0]\n" "st1w z31.s, p7, [c_ptr7]\n" "mov z31.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" - "sdot z30.s, z16.b, z6.b[0]\n" "addvl c_ptr5, c_ptr5, #1\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" "sdot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "prfm 
PSTL1KEEP, [c_ptr2, #0x40]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr6, c_ptr6, #1\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z27.s, z17.b, z3.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "sdot z24.s, z18.b, z0.b[2]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z25.s, z18.b, z1.b[2]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z26.s, z18.b, z2.b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z27.s, z18.b, z3.b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z28.s, z18.b, z4.b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" "sdot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "sdot z24.s, z19.b, z0.b[3]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" "sdot z25.s, z19.b, z1.b[3]\n" @@ -1300,7 +1389,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n" "sdot z24.s, z20.b, z0.b[0]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "sdot z25.s, z20.b, z1.b[0]\n" "sdot z26.s, z20.b, z2.b[0]\n" "sdot z27.s, z20.b, z3.b[0]\n" @@ -1313,39 +1401,41 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "st1w z24.s, p7, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "mov z24.s, #0\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #5\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "sdot z24.s, z16.b, z0.b[0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "addvl %[b_ptr0], %[b_ptr0], #5\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "addvl c_ptr2, c_ptr2, #1\n" + "addvl c_ptr4, c_ptr4, #1\n" "sdot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "sdot z26.s, z16.b, z2.b[0]\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" "sdot z27.s, z16.b, z3.b[0]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "sdot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" - "sdot z28.s, z16.b, z4.b[0]\n" "st1w z30.s, p7, [c_ptr6]\n" "mov z30.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" "sdot z29.s, z16.b, z5.b[0]\n" "st1w z31.s, p7, [c_ptr7]\n" "mov z31.s, #0\n" @@ 
-1354,6 +1444,8 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "addvl c_ptr6, c_ptr6, #1\n" "sdot z31.s, z16.b, z7.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" "sdot z27.s, z17.b, z3.b[1]\n" "sdot z28.s, z17.b, z4.b[1]\n" @@ -1392,23 +1484,82 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z20.b, z5.b[0]\n" "sdot z30.s, z20.b, z6.b[0]\n" "sdot z31.s, z20.b, z7.b[0]\n" + "b 5f\n" "2:\n" + "mov z24.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z25.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "mov z29.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "mov z30.s, #0\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "mov z31.s, #0\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "sdot z26.s, z17.b, z2.b[1]\n" + "sdot z27.s, z17.b, z3.b[1]\n" + "sdot z28.s, z17.b, z4.b[1]\n" + "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z31.s, z17.b, z7.b[1]\n" + "sdot z24.s, z18.b, z0.b[2]\n" + "sdot z25.s, z18.b, z1.b[2]\n" + "sdot z26.s, z18.b, z2.b[2]\n" + "sdot z27.s, z18.b, z3.b[2]\n" + "sdot z28.s, z18.b, z4.b[2]\n" + "sdot z29.s, z18.b, z5.b[2]\n" + "sdot z30.s, z18.b, z6.b[2]\n" + "sdot z31.s, z18.b, z7.b[2]\n" + "sdot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" + "sdot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" + "sdot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n" + "sdot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n" + "sdot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n" + "sdot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n" + "sdot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n" + "sdot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n" + "sdot z24.s, z20.b, z0.b[0]\n" + "sdot z25.s, z20.b, z1.b[0]\n" + "sdot z26.s, z20.b, z2.b[0]\n" + "sdot z27.s, z20.b, z3.b[0]\n" + "sdot z28.s, z20.b, z4.b[0]\n" + "sdot z29.s, z20.b, z5.b[0]\n" + "sdot z30.s, z20.b, z6.b[0]\n" + "sdot z31.s, z20.b, z7.b[0]\n" + "5:\n" "st1w z24.s, p0, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -1487,48 +1638,50 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" - "mov z24.s, #0\n" "ptrue p7.b\n" - "mov z25.s, #0\n" "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, 
[a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z28.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z29.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z30.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z31.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #6\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "whilelt p0.s, %[temp], %[last_width]\n" + "mov z25.s, #0\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" - "sdot z24.s, z16.b, z0.b[0]\n" + "mov z26.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" - "sdot z25.s, z16.b, z1.b[0]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "sdot z26.s, z16.b, z2.b[0]\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z27.s, z16.b, z3.b[0]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, z0.b[1]\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" "sdot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "sdot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "sdot z30.s, z16.b, z6.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "sdot z31.s, z16.b, z7.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "sdot z24.s, z17.b, z0.b[1]\n" "sdot z25.s, z17.b, z1.b[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" "sdot z26.s, z17.b, z2.b[1]\n" "sdot z27.s, z17.b, z3.b[1]\n" "sdot z28.s, z17.b, z4.b[1]\n" @@ -1575,85 +1728,80 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z21.b, z5.b[1]\n" "sdot z30.s, z21.b, z6.b[1]\n" "sdot z31.s, z21.b, z7.b[1]\n" - "cbz %[loops], 2f\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "b.eq 3f\n" "4:\n" "st1w z24.s, p7, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "mov z24.s, #0\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "st1w z25.s, p7, [c_ptr1]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "sdot z24.s, z16.b, z0.b[0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #6\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "st1w z28.s, p7, 
[c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "addvl c_ptr1, c_ptr1, #1\n" "sdot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "sdot z26.s, z16.b, z2.b[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" "sdot z27.s, z16.b, z3.b[0]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "sdot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr2, c_ptr2, #1\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" "sdot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" - "sdot z29.s, z16.b, z5.b[0]\n" "st1w z31.s, p7, [c_ptr7]\n" "mov z31.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" - "sdot z30.s, z16.b, z6.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr5, c_ptr5, #1\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" "sdot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr6, c_ptr6, #1\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z27.s, z17.b, z3.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "sdot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "sdot z24.s, z18.b, z0.b[2]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z25.s, z18.b, z1.b[2]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z26.s, z18.b, z2.b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z27.s, z18.b, z3.b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z28.s, z18.b, z4.b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" "sdot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "sdot z24.s, z19.b, z0.b[3]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" "sdot z25.s, z19.b, z1.b[3]\n" @@ -1671,7 +1819,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n" "sdot z24.s, z20.b, z0.b[0]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "sdot z25.s, z20.b, z1.b[0]\n" "sdot z26.s, z20.b, z2.b[0]\n" "sdot z27.s, z20.b, z3.b[0]\n" @@ -1679,7 +1826,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z20.b, z5.b[0]\n" "sdot z30.s, z20.b, z6.b[0]\n" "sdot z31.s, z20.b, z7.b[0]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "sdot z24.s, z21.b, z0.b[1]\n" "sdot z25.s, z21.b, z1.b[1]\n" "sdot z26.s, z21.b, z2.b[1]\n" @@ -1693,47 +1839,52 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "st1w z24.s, p7, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "mov z24.s, #0\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1rqb z0.b, 
p7/z, [%[a_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "sdot z24.s, z16.b, z0.b[0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "st1w z26.s, p7, [c_ptr2]\n" - "mov z26.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "ld1rqb z3.b, p7/z, [a_ptr3]\n" "addvl c_ptr2, c_ptr2, #1\n" - "sdot z25.s, z16.b, z1.b[0]\n" + "mov z26.s, #0\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #6\n" "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" - "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "sdot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z24.s, z17.b, z0.b[1]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" "mov z28.s, #0\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z27.s, z16.b, z3.b[0]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "sdot z25.s, z17.b, z1.b[1]\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "sdot z24.s, z16.b, z0.b[0]\n" "st1w z29.s, p7, [c_ptr5]\n" "mov z29.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" - "sdot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "sdot z26.s, z16.b, z2.b[0]\n" "st1w z30.s, p7, [c_ptr6]\n" "mov z30.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" - "sdot z29.s, z16.b, z5.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "sdot z28.s, z16.b, z4.b[0]\n" "st1w z31.s, p7, [c_ptr7]\n" "mov z31.s, #0\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr5, c_ptr5, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" "addvl c_ptr6, c_ptr6, #1\n" "sdot z31.s, z16.b, z7.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" "sdot z27.s, z17.b, z3.b[1]\n" "sdot z28.s, z17.b, z4.b[1]\n" @@ -1780,23 +1931,90 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z21.b, z5.b[1]\n" "sdot z30.s, z21.b, z6.b[1]\n" "sdot z31.s, z21.b, z7.b[1]\n" + "b 5f\n" "2:\n" + "mov z24.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z25.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "mov z29.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "mov z30.s, #0\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "mov z31.s, #0\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "sdot z26.s, z17.b, z2.b[1]\n" + "sdot z27.s, z17.b, z3.b[1]\n" + "sdot z28.s, z17.b, z4.b[1]\n" + "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z31.s, z17.b, z7.b[1]\n" + "sdot z24.s, z18.b, z0.b[2]\n" + "sdot z25.s, z18.b, z1.b[2]\n" + "sdot z26.s, z18.b, z2.b[2]\n" + "sdot z27.s, z18.b, z3.b[2]\n" + "sdot z28.s, z18.b, z4.b[2]\n" + "sdot z29.s, z18.b, 
z5.b[2]\n" + "sdot z30.s, z18.b, z6.b[2]\n" + "sdot z31.s, z18.b, z7.b[2]\n" + "sdot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" + "sdot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" + "sdot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n" + "sdot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n" + "sdot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n" + "sdot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n" + "sdot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n" + "sdot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n" + "sdot z24.s, z20.b, z0.b[0]\n" + "sdot z25.s, z20.b, z1.b[0]\n" + "sdot z26.s, z20.b, z2.b[0]\n" + "sdot z27.s, z20.b, z3.b[0]\n" + "sdot z28.s, z20.b, z4.b[0]\n" + "sdot z29.s, z20.b, z5.b[0]\n" + "sdot z30.s, z20.b, z6.b[0]\n" + "sdot z31.s, z20.b, z7.b[0]\n" + "sdot z24.s, z21.b, z0.b[1]\n" + "sdot z25.s, z21.b, z1.b[1]\n" + "sdot z26.s, z21.b, z2.b[1]\n" + "sdot z27.s, z21.b, z3.b[1]\n" + "sdot z28.s, z21.b, z4.b[1]\n" + "sdot z29.s, z21.b, z5.b[1]\n" + "sdot z30.s, z21.b, z6.b[1]\n" + "sdot z31.s, z21.b, z7.b[1]\n" + "5:\n" "st1w z24.s, p0, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -1875,48 +2093,50 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" - "mov z24.s, #0\n" "ptrue p7.b\n" - "mov z25.s, #0\n" "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z28.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z29.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z30.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z31.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "whilelt p0.s, %[temp], %[last_width]\n" + "addvl %[b_ptr0], %[b_ptr0], #7\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "mov z25.s, #0\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" - "sdot z24.s, z16.b, z0.b[0]\n" + "mov z26.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" - "sdot z25.s, z16.b, z1.b[0]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "sdot z26.s, z16.b, z2.b[0]\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z27.s, z16.b, z3.b[0]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, z0.b[1]\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z16.b, 
z0.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" "sdot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "sdot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "sdot z30.s, z16.b, z6.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "sdot z31.s, z16.b, z7.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #7\n" + "sdot z24.s, z17.b, z0.b[1]\n" "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" "sdot z27.s, z17.b, z3.b[1]\n" @@ -1972,86 +2192,81 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z22.b, z5.b[2]\n" "sdot z30.s, z22.b, z6.b[2]\n" "sdot z31.s, z22.b, z7.b[2]\n" - "cbz %[loops], 2f\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "b.eq 3f\n" + "4:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" + "mov z24.s, #0\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "b.eq 3f\n" - "4:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z24.s, #0\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "addvl %[b_ptr0], %[b_ptr0], #7\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "mov z25.s, #0\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" "sdot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" - "mov z26.s, #0\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "addvl c_ptr1, c_ptr1, #1\n" + "addvl c_ptr5, c_ptr5, #1\n" "sdot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "sdot z26.s, z16.b, z2.b[0]\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" "sdot z27.s, z16.b, z3.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "sdot z28.s, z16.b, z4.b[0]\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "sdot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr2, c_ptr2, #1\n" - "sdot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" "sdot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" + "addvl c_ptr6, c_ptr6, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" + "addvl c_ptr7, c_ptr7, #1\n" "sdot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr6, c_ptr6, #1\n" + "prfm 
PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z27.s, z17.b, z3.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "sdot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "sdot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "sdot z24.s, z18.b, z0.b[2]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z25.s, z18.b, z1.b[2]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z26.s, z18.b, z2.b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z27.s, z18.b, z3.b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z28.s, z18.b, z4.b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" "sdot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "sdot z24.s, z19.b, z0.b[3]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" "sdot z25.s, z19.b, z1.b[3]\n" @@ -2069,7 +2284,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n" "sdot z24.s, z20.b, z0.b[0]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "sdot z25.s, z20.b, z1.b[0]\n" "sdot z26.s, z20.b, z2.b[0]\n" "sdot z27.s, z20.b, z3.b[0]\n" @@ -2077,7 +2291,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z20.b, z5.b[0]\n" "sdot z30.s, z20.b, z6.b[0]\n" "sdot z31.s, z20.b, z7.b[0]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "sdot z24.s, z21.b, z0.b[1]\n" "sdot z25.s, z21.b, z1.b[1]\n" "sdot z26.s, z21.b, z2.b[1]\n" @@ -2086,7 +2299,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z21.b, z5.b[1]\n" "sdot z30.s, z21.b, z6.b[1]\n" "sdot z31.s, z21.b, z7.b[1]\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "sdot z24.s, z22.b, z0.b[2]\n" "sdot z25.s, z22.b, z1.b[2]\n" "sdot z26.s, z22.b, z2.b[2]\n" @@ -2100,47 +2312,135 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "st1w z24.s, p7, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "mov z24.s, #0\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #7\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "sdot z24.s, z16.b, z0.b[0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "addvl %[b_ptr0], %[b_ptr0], #7\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "addvl c_ptr2, c_ptr2, #1\n" + "addvl c_ptr5, c_ptr5, #1\n" "sdot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z4.b, 
p7/z, [a_ptr4]\n" "sdot z26.s, z16.b, z2.b[0]\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" "sdot z27.s, z16.b, z3.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "sdot z28.s, z16.b, z4.b[0]\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "sdot z24.s, z17.b, z0.b[1]\n" "sdot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" + "sdot z26.s, z17.b, z2.b[1]\n" + "sdot z27.s, z17.b, z3.b[1]\n" + "sdot z28.s, z17.b, z4.b[1]\n" + "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z31.s, z17.b, z7.b[1]\n" + "sdot z24.s, z18.b, z0.b[2]\n" + "sdot z25.s, z18.b, z1.b[2]\n" + "sdot z26.s, z18.b, z2.b[2]\n" + "sdot z27.s, z18.b, z3.b[2]\n" + "sdot z28.s, z18.b, z4.b[2]\n" + "sdot z29.s, z18.b, z5.b[2]\n" + "sdot z30.s, z18.b, z6.b[2]\n" + "sdot z31.s, z18.b, z7.b[2]\n" + "sdot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" + "sdot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" + "sdot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n" + "sdot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n" + "sdot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n" + "sdot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n" + "sdot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n" + "sdot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n" + "sdot z24.s, z20.b, z0.b[0]\n" + "sdot z25.s, z20.b, z1.b[0]\n" + "sdot z26.s, z20.b, z2.b[0]\n" + "sdot z27.s, z20.b, z3.b[0]\n" + "sdot z28.s, z20.b, z4.b[0]\n" + "sdot z29.s, z20.b, z5.b[0]\n" + "sdot z30.s, z20.b, z6.b[0]\n" + "sdot z31.s, z20.b, z7.b[0]\n" + "sdot z24.s, z21.b, z0.b[1]\n" + "sdot z25.s, z21.b, z1.b[1]\n" + "sdot z26.s, z21.b, z2.b[1]\n" + "sdot z27.s, z21.b, z3.b[1]\n" + "sdot z28.s, z21.b, z4.b[1]\n" + "sdot z29.s, z21.b, z5.b[1]\n" + "sdot z30.s, z21.b, z6.b[1]\n" + "sdot z31.s, z21.b, z7.b[1]\n" + "sdot z24.s, z22.b, z0.b[2]\n" + "sdot z25.s, z22.b, z1.b[2]\n" + "sdot z26.s, z22.b, z2.b[2]\n" + "sdot z27.s, z22.b, z3.b[2]\n" + "sdot z28.s, z22.b, z4.b[2]\n" + "sdot z29.s, z22.b, z5.b[2]\n" + "sdot z30.s, z22.b, z6.b[2]\n" + "sdot z31.s, z22.b, z7.b[2]\n" + "b 5f\n" + "2:\n" + "mov z24.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z25.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" "mov z29.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" - "sdot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" "mov z30.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" - "sdot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" "mov z31.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" "sdot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" "sdot z31.s, z16.b, z7.b[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" "sdot 
z26.s, z17.b, z2.b[1]\n" "sdot z27.s, z17.b, z3.b[1]\n" "sdot z28.s, z17.b, z4.b[1]\n" @@ -2195,23 +2495,16 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z22.b, z5.b[2]\n" "sdot z30.s, z22.b, z6.b[2]\n" "sdot z31.s, z22.b, z7.b[2]\n" - "2:\n" + "5:\n" "st1w z24.s, p0, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -2290,49 +2583,51 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" - "mov z24.s, #0\n" "ptrue p7.b\n" - "mov z25.s, #0\n" "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z28.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z29.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z30.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z31.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "whilelt p0.s, %[temp], %[last_width]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "mov z25.s, #0\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" - "sdot z24.s, z16.b, z0.b[0]\n" + "mov z26.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" - "sdot z25.s, z16.b, z1.b[0]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "sdot z26.s, z16.b, z2.b[0]\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z27.s, z16.b, z3.b[0]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, z0.b[1]\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" "sdot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "sdot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "sdot z30.s, z16.b, z6.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" "sdot z31.s, z16.b, z7.b[0]\n" + "sdot z24.s, z17.b, z0.b[1]\n" "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" "sdot z27.s, z17.b, z3.b[1]\n" @@ -2396,87 +2691,82 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z23.b, z5.b[3]\n" "sdot z30.s, z23.b, z6.b[3]\n" "sdot z31.s, z23.b, z7.b[3]\n" - "cbz %[loops], 2f\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1b z17.b, p7/z, 
[%[b_ptr0], #1, MUL VL]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "b.eq 3f\n" "4:\n" "st1w z24.s, p7, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "mov z24.s, #0\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "st1w z25.s, p7, [c_ptr1]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "sdot z24.s, z16.b, z0.b[0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "addvl c_ptr1, c_ptr1, #1\n" - "sdot z25.s, z16.b, z1.b[0]\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" - "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "sdot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z24.s, z17.b, z0.b[1]\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" "mov z28.s, #0\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z27.s, z16.b, z3.b[0]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "sdot z25.s, z17.b, z1.b[1]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" - "addvl c_ptr2, c_ptr2, #1\n" - "sdot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "sdot z24.s, z16.b, z0.b[0]\n" "st1w z30.s, p7, [c_ptr6]\n" "mov z30.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" - "sdot z29.s, z16.b, z5.b[0]\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "sdot z26.s, z16.b, z2.b[0]\n" "st1w z31.s, p7, [c_ptr7]\n" "mov z31.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr6, c_ptr6, #1\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z27.s, z17.b, z3.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "sdot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "sdot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "sdot z24.s, z18.b, 
z0.b[2]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z25.s, z18.b, z1.b[2]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z26.s, z18.b, z2.b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z27.s, z18.b, z3.b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z28.s, z18.b, z4.b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" "sdot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "sdot z24.s, z19.b, z0.b[3]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" "sdot z25.s, z19.b, z1.b[3]\n" @@ -2494,7 +2784,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n" "sdot z24.s, z20.b, z0.b[0]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "sdot z25.s, z20.b, z1.b[0]\n" "sdot z26.s, z20.b, z2.b[0]\n" "sdot z27.s, z20.b, z3.b[0]\n" @@ -2502,7 +2791,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z20.b, z5.b[0]\n" "sdot z30.s, z20.b, z6.b[0]\n" "sdot z31.s, z20.b, z7.b[0]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "sdot z24.s, z21.b, z0.b[1]\n" "sdot z25.s, z21.b, z1.b[1]\n" "sdot z26.s, z21.b, z2.b[1]\n" @@ -2511,7 +2799,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z21.b, z5.b[1]\n" "sdot z30.s, z21.b, z6.b[1]\n" "sdot z31.s, z21.b, z7.b[1]\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "sdot z24.s, z22.b, z0.b[2]\n" "sdot z25.s, z22.b, z1.b[2]\n" "sdot z26.s, z22.b, z2.b[2]\n" @@ -2520,7 +2807,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z22.b, z5.b[2]\n" "sdot z30.s, z22.b, z6.b[2]\n" "sdot z31.s, z22.b, z7.b[2]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "sdot z24.s, z23.b, z0.b[3]\n" "sdot z25.s, z23.b, z1.b[3]\n" "sdot z26.s, z23.b, z2.b[3]\n" @@ -2534,47 +2820,144 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "st1w z24.s, p7, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "mov z24.s, #0\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "sdot z24.s, z16.b, z0.b[0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "addvl c_ptr2, c_ptr2, #1\n" "sdot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "sdot z26.s, z16.b, z2.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, 
#0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" "sdot z27.s, z16.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "sdot z28.s, z16.b, z4.b[0]\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z31.s, z16.b, z7.b[0]\n" "sdot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" + "sdot z26.s, z17.b, z2.b[1]\n" + "sdot z27.s, z17.b, z3.b[1]\n" + "sdot z28.s, z17.b, z4.b[1]\n" + "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z31.s, z17.b, z7.b[1]\n" + "sdot z24.s, z18.b, z0.b[2]\n" + "sdot z25.s, z18.b, z1.b[2]\n" + "sdot z26.s, z18.b, z2.b[2]\n" + "sdot z27.s, z18.b, z3.b[2]\n" + "sdot z28.s, z18.b, z4.b[2]\n" + "sdot z29.s, z18.b, z5.b[2]\n" + "sdot z30.s, z18.b, z6.b[2]\n" + "sdot z31.s, z18.b, z7.b[2]\n" + "sdot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" + "sdot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" + "sdot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n" + "sdot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n" + "sdot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n" + "sdot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n" + "sdot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n" + "sdot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n" + "sdot z24.s, z20.b, z0.b[0]\n" + "sdot z25.s, z20.b, z1.b[0]\n" + "sdot z26.s, z20.b, z2.b[0]\n" + "sdot z27.s, z20.b, z3.b[0]\n" + "sdot z28.s, z20.b, z4.b[0]\n" + "sdot z29.s, z20.b, z5.b[0]\n" + "sdot z30.s, z20.b, z6.b[0]\n" + "sdot z31.s, z20.b, z7.b[0]\n" + "sdot z24.s, z21.b, z0.b[1]\n" + "sdot z25.s, z21.b, z1.b[1]\n" + "sdot z26.s, z21.b, z2.b[1]\n" + "sdot z27.s, z21.b, z3.b[1]\n" + "sdot z28.s, z21.b, z4.b[1]\n" + "sdot z29.s, z21.b, z5.b[1]\n" + "sdot z30.s, z21.b, z6.b[1]\n" + "sdot z31.s, z21.b, z7.b[1]\n" + "sdot z24.s, z22.b, z0.b[2]\n" + "sdot z25.s, z22.b, z1.b[2]\n" + "sdot z26.s, z22.b, z2.b[2]\n" + "sdot z27.s, z22.b, z3.b[2]\n" + "sdot z28.s, z22.b, z4.b[2]\n" + "sdot z29.s, z22.b, z5.b[2]\n" + "sdot z30.s, z22.b, z6.b[2]\n" + "sdot z31.s, z22.b, z7.b[2]\n" + "sdot z24.s, z23.b, z0.b[3]\n" + "sdot z25.s, z23.b, z1.b[3]\n" + "sdot z26.s, z23.b, z2.b[3]\n" + "sdot z27.s, z23.b, z3.b[3]\n" + "sdot z28.s, z23.b, z4.b[3]\n" + "sdot z29.s, z23.b, z5.b[3]\n" + "sdot z30.s, z23.b, z6.b[3]\n" + "sdot z31.s, z23.b, z7.b[3]\n" + "b 5f\n" + "2:\n" + "mov z24.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z25.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" "mov z29.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" - "sdot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" "mov z30.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" - "sdot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" "mov z31.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" "sdot z30.s, z16.b, z6.b[0]\n" 
- "addvl c_ptr6, c_ptr6, #1\n" "sdot z31.s, z16.b, z7.b[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" "sdot z27.s, z17.b, z3.b[1]\n" "sdot z28.s, z17.b, z4.b[1]\n" @@ -2637,23 +3020,16 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z23.b, z5.b[3]\n" "sdot z30.s, z23.b, z6.b[3]\n" "sdot z31.s, z23.b, z7.b[3]\n" - "2:\n" + "5:\n" "st1w z24.s, p0, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -2732,54 +3108,56 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" - "mov z24.s, #0\n" "ptrue p7.b\n" - "mov z25.s, #0\n" "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z28.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z29.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z30.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z31.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "whilelt p0.s, %[temp], %[last_width]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "mov z25.s, #0\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" - "sdot z24.s, z16.b, z0.b[0]\n" + "mov z26.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" - "sdot z25.s, z16.b, z1.b[0]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "sdot z26.s, z16.b, z2.b[0]\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z27.s, z16.b, z3.b[0]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, z0.b[1]\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" "sdot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "sdot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "sdot z30.s, z16.b, z6.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" "sdot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #1\n" "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "sdot z27.s, z17.b, z3.b[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #1\n" "sdot 
z28.s, z17.b, z4.b[1]\n" "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" @@ -2856,88 +3234,84 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z16.b, z5.b[0]\n" "sdot z30.s, z16.b, z6.b[0]\n" "sdot z31.s, z16.b, z7.b[0]\n" - "cbz %[loops], 2f\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" "b.eq 3f\n" "4:\n" "st1w z24.s, p7, [%[c_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "mov z24.s, #0\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" "subs %[loops], %[loops], #0x1\n" - "st1w z25.s, p7, [c_ptr1]\n" + "mov z24.s, #0\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "sdot z24.s, z16.b, z0.b[0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" - "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "addvl c_ptr1, c_ptr1, #1\n" - "sdot z25.s, z16.b, z1.b[0]\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" - "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, z0.b[1]\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" "mov z28.s, #0\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "sdot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "sdot z25.s, z17.b, z1.b[1]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" - "sdot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "sdot z24.s, z16.b, z0.b[0]\n" "st1w z30.s, p7, [c_ptr6]\n" "mov z30.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" - "sdot z29.s, z16.b, z5.b[0]\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "sdot z26.s, z16.b, z2.b[0]\n" "st1w z31.s, p7, [c_ptr7]\n" "mov z31.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" - "sdot z30.s, z16.b, z6.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z17.b, z0.b[1]\n" "addvl c_ptr6, c_ptr6, #1\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #1\n" "sdot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" 
"sdot z27.s, z17.b, z3.b[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #1\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "sdot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "sdot z24.s, z18.b, z0.b[2]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z25.s, z18.b, z1.b[2]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z26.s, z18.b, z2.b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z27.s, z18.b, z3.b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z28.s, z18.b, z4.b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" "sdot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "sdot z24.s, z19.b, z0.b[3]\n" "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" "sdot z25.s, z19.b, z1.b[3]\n" @@ -2955,7 +3329,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" "sdot z24.s, z20.b, z0.b[0]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "sdot z25.s, z20.b, z1.b[0]\n" "sdot z26.s, z20.b, z2.b[0]\n" "sdot z27.s, z20.b, z3.b[0]\n" @@ -2963,7 +3336,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z20.b, z5.b[0]\n" "sdot z30.s, z20.b, z6.b[0]\n" "sdot z31.s, z20.b, z7.b[0]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "sdot z24.s, z21.b, z0.b[1]\n" "sdot z25.s, z21.b, z1.b[1]\n" "sdot z26.s, z21.b, z2.b[1]\n" @@ -2972,7 +3344,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z21.b, z5.b[1]\n" "sdot z30.s, z21.b, z6.b[1]\n" "sdot z31.s, z21.b, z7.b[1]\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "sdot z24.s, z22.b, z0.b[2]\n" "sdot z25.s, z22.b, z1.b[2]\n" "sdot z26.s, z22.b, z2.b[2]\n" @@ -2981,7 +3352,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z22.b, z5.b[2]\n" "sdot z30.s, z22.b, z6.b[2]\n" "sdot z31.s, z22.b, z7.b[2]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "sdot z24.s, z23.b, z0.b[3]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" "sdot z25.s, z23.b, z1.b[3]\n" @@ -2999,7 +3369,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z23.b, z7.b[3]\n" "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" "sdot z24.s, z16.b, z0.b[0]\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" "sdot z25.s, z16.b, z1.b[0]\n" "sdot z26.s, z16.b, z2.b[0]\n" "sdot z27.s, z16.b, z3.b[0]\n" @@ -3007,55 +3376,62 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z16.b, z5.b[0]\n" "sdot z30.s, z16.b, z6.b[0]\n" "sdot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "b.ne 4b\n" "3:\n" "st1w z24.s, p7, [%[c_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "mov z24.s, #0\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" + "mov z24.s, #0\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "sdot z24.s, z16.b, z0.b[0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, 
MUL VL]\n" "st1w z26.s, p7, [c_ptr2]\n" - "mov z26.s, #0\n" - "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "ld1rqb z4.b, p7/z, [a_ptr4]\n" "addvl c_ptr2, c_ptr2, #1\n" - "sdot z25.s, z16.b, z1.b[0]\n" + "mov z26.s, #0\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" - "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, z0.b[1]\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" "mov z28.s, #0\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "sdot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr3, c_ptr3, #1\n" - "sdot z25.s, z17.b, z1.b[1]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" - "sdot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "sdot z24.s, z16.b, z0.b[0]\n" "st1w z30.s, p7, [c_ptr6]\n" "mov z30.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" - "sdot z29.s, z16.b, z5.b[0]\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "sdot z26.s, z16.b, z2.b[0]\n" "st1w z31.s, p7, [c_ptr7]\n" "mov z31.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z17.b, z0.b[1]\n" "addvl c_ptr6, c_ptr6, #1\n" - "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" + "sdot z30.s, z16.b, z6.b[0]\n" "sdot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "sdot z26.s, z17.b, z2.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" "addvl %[b_ptr0], %[b_ptr0], #1\n" + "sdot z26.s, z17.b, z2.b[1]\n" "sdot z27.s, z17.b, z3.b[1]\n" "sdot z28.s, z17.b, z4.b[1]\n" "sdot z29.s, z17.b, z5.b[1]\n" @@ -3133,23 +3509,124 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z16.b, z5.b[0]\n" "sdot z30.s, z16.b, z6.b[0]\n" "sdot z31.s, z16.b, z7.b[0]\n" + "b 5f\n" "2:\n" + "mov z24.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z25.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "mov z29.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "mov z30.s, #0\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "mov z31.s, #0\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #1\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "sdot z26.s, z17.b, z2.b[1]\n" + "sdot z27.s, z17.b, z3.b[1]\n" + "sdot z28.s, z17.b, z4.b[1]\n" + "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z31.s, z17.b, z7.b[1]\n" + "sdot z24.s, z18.b, z0.b[2]\n" + "sdot z25.s, z18.b, z1.b[2]\n" + "sdot z26.s, z18.b, z2.b[2]\n" + "sdot z27.s, z18.b, z3.b[2]\n" + "sdot z28.s, z18.b, z4.b[2]\n" + "sdot z29.s, z18.b, z5.b[2]\n" + "sdot z30.s, z18.b, 
z6.b[2]\n" + "sdot z31.s, z18.b, z7.b[2]\n" + "sdot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" + "sdot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n" + "sdot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n" + "sdot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n" + "sdot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n" + "sdot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n" + "sdot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n" + "sdot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" + "sdot z24.s, z20.b, z0.b[0]\n" + "sdot z25.s, z20.b, z1.b[0]\n" + "sdot z26.s, z20.b, z2.b[0]\n" + "sdot z27.s, z20.b, z3.b[0]\n" + "sdot z28.s, z20.b, z4.b[0]\n" + "sdot z29.s, z20.b, z5.b[0]\n" + "sdot z30.s, z20.b, z6.b[0]\n" + "sdot z31.s, z20.b, z7.b[0]\n" + "sdot z24.s, z21.b, z0.b[1]\n" + "sdot z25.s, z21.b, z1.b[1]\n" + "sdot z26.s, z21.b, z2.b[1]\n" + "sdot z27.s, z21.b, z3.b[1]\n" + "sdot z28.s, z21.b, z4.b[1]\n" + "sdot z29.s, z21.b, z5.b[1]\n" + "sdot z30.s, z21.b, z6.b[1]\n" + "sdot z31.s, z21.b, z7.b[1]\n" + "sdot z24.s, z22.b, z0.b[2]\n" + "sdot z25.s, z22.b, z1.b[2]\n" + "sdot z26.s, z22.b, z2.b[2]\n" + "sdot z27.s, z22.b, z3.b[2]\n" + "sdot z28.s, z22.b, z4.b[2]\n" + "sdot z29.s, z22.b, z5.b[2]\n" + "sdot z30.s, z22.b, z6.b[2]\n" + "sdot z31.s, z22.b, z7.b[2]\n" + "sdot z24.s, z23.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" + "sdot z25.s, z23.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n" + "sdot z26.s, z23.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n" + "sdot z27.s, z23.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n" + "sdot z28.s, z23.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n" + "sdot z29.s, z23.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n" + "sdot z30.s, z23.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n" + "sdot z31.s, z23.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "5:\n" "st1w z24.s, p0, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -3228,52 +3705,998 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" - "mov z24.s, #0\n" "ptrue p7.b\n" - "mov z25.s, #0\n" "whilelt p6.b, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1b z20.b, p7/z, 
[%[b_ptr0], #4, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z25.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "mov z29.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "mov z30.s, #0\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "mov z31.s, #0\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "sdot z26.s, z17.b, z2.b[1]\n" + "sdot z27.s, z17.b, z3.b[1]\n" + "sdot z28.s, z17.b, z4.b[1]\n" + "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z31.s, z17.b, z7.b[1]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "sdot z24.s, z18.b, z0.b[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #2\n" + "sdot z25.s, z18.b, z1.b[2]\n" + "sdot z26.s, z18.b, z2.b[2]\n" + "sdot z27.s, z18.b, z3.b[2]\n" + "sdot z28.s, z18.b, z4.b[2]\n" + "sdot z29.s, z18.b, z5.b[2]\n" + "sdot z30.s, z18.b, z6.b[2]\n" + "sdot z31.s, z18.b, z7.b[2]\n" + "sdot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" + "sdot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n" + "sdot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n" + "sdot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n" + "sdot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n" + "sdot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n" + "sdot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n" + "sdot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" + "sdot z24.s, z20.b, z0.b[0]\n" + "sdot z25.s, z20.b, z1.b[0]\n" + "sdot z26.s, z20.b, z2.b[0]\n" + "sdot z27.s, z20.b, z3.b[0]\n" + "sdot z28.s, z20.b, z4.b[0]\n" + "sdot z29.s, z20.b, z5.b[0]\n" + "sdot z30.s, z20.b, z6.b[0]\n" + "sdot z31.s, z20.b, z7.b[0]\n" + "sdot z24.s, z21.b, z0.b[1]\n" + "sdot z25.s, z21.b, z1.b[1]\n" + "sdot z26.s, z21.b, z2.b[1]\n" + "sdot z27.s, z21.b, z3.b[1]\n" + "sdot z28.s, z21.b, z4.b[1]\n" + "sdot z29.s, z21.b, z5.b[1]\n" + "sdot z30.s, z21.b, z6.b[1]\n" + "sdot z31.s, z21.b, z7.b[1]\n" + "sdot z24.s, z22.b, z0.b[2]\n" + "sdot z25.s, z22.b, z1.b[2]\n" + "sdot z26.s, z22.b, z2.b[2]\n" + "sdot z27.s, z22.b, z3.b[2]\n" + "sdot z28.s, z22.b, z4.b[2]\n" + "sdot z29.s, z22.b, z5.b[2]\n" + "sdot z30.s, z22.b, z6.b[2]\n" + "sdot z31.s, z22.b, z7.b[2]\n" + "sdot z24.s, z23.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" + "sdot z25.s, z23.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n" + "sdot z26.s, z23.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n" + "sdot z27.s, z23.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n" + "sdot z28.s, z23.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n" + "sdot z29.s, z23.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n" + "sdot z30.s, z23.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, 
#0x20]\n" + "sdot z31.s, z23.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "sdot z26.s, z17.b, z2.b[1]\n" + "sdot z27.s, z17.b, z3.b[1]\n" + "sdot z28.s, z17.b, z4.b[1]\n" + "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z31.s, z17.b, z7.b[1]\n" + "b.eq 3f\n" + "4:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z24.s, #0\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + "sdot z26.s, z17.b, z2.b[1]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + "sdot z27.s, z17.b, z3.b[1]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" + "sdot z28.s, z17.b, z4.b[1]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" + "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z31.s, z17.b, z7.b[1]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "sdot z24.s, z18.b, z0.b[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #2\n" + "sdot z25.s, z18.b, z1.b[2]\n" + "sdot z26.s, z18.b, z2.b[2]\n" + "sdot z27.s, z18.b, z3.b[2]\n" + "sdot z28.s, z18.b, z4.b[2]\n" + "sdot z29.s, z18.b, z5.b[2]\n" + "sdot z30.s, z18.b, z6.b[2]\n" + "sdot z31.s, z18.b, z7.b[2]\n" + "sdot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" + "sdot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n" + "sdot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n" + "sdot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n" + "sdot z28.s, 
z19.b, z4.b[3]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n" + "sdot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n" + "sdot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n" + "sdot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" + "sdot z24.s, z20.b, z0.b[0]\n" + "sdot z25.s, z20.b, z1.b[0]\n" + "sdot z26.s, z20.b, z2.b[0]\n" + "sdot z27.s, z20.b, z3.b[0]\n" + "sdot z28.s, z20.b, z4.b[0]\n" + "sdot z29.s, z20.b, z5.b[0]\n" + "sdot z30.s, z20.b, z6.b[0]\n" + "sdot z31.s, z20.b, z7.b[0]\n" + "sdot z24.s, z21.b, z0.b[1]\n" + "sdot z25.s, z21.b, z1.b[1]\n" + "sdot z26.s, z21.b, z2.b[1]\n" + "sdot z27.s, z21.b, z3.b[1]\n" + "sdot z28.s, z21.b, z4.b[1]\n" + "sdot z29.s, z21.b, z5.b[1]\n" + "sdot z30.s, z21.b, z6.b[1]\n" + "sdot z31.s, z21.b, z7.b[1]\n" + "sdot z24.s, z22.b, z0.b[2]\n" + "sdot z25.s, z22.b, z1.b[2]\n" + "sdot z26.s, z22.b, z2.b[2]\n" + "sdot z27.s, z22.b, z3.b[2]\n" + "sdot z28.s, z22.b, z4.b[2]\n" + "sdot z29.s, z22.b, z5.b[2]\n" + "sdot z30.s, z22.b, z6.b[2]\n" + "sdot z31.s, z22.b, z7.b[2]\n" + "sdot z24.s, z23.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" + "sdot z25.s, z23.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n" + "sdot z26.s, z23.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n" + "sdot z27.s, z23.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n" + "sdot z28.s, z23.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n" + "sdot z29.s, z23.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n" + "sdot z30.s, z23.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n" + "sdot z31.s, z23.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "sdot z26.s, z17.b, z2.b[1]\n" + "sdot z27.s, z17.b, z3.b[1]\n" + "sdot z28.s, z17.b, z4.b[1]\n" + "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z31.s, z17.b, z7.b[1]\n" + "b.ne 4b\n" + "3:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "mov z24.s, #0\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "sdot z27.s, 
z16.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "sdot z26.s, z17.b, z2.b[1]\n" + "sdot z27.s, z17.b, z3.b[1]\n" + "sdot z28.s, z17.b, z4.b[1]\n" + "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z31.s, z17.b, z7.b[1]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "sdot z24.s, z18.b, z0.b[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #2\n" + "sdot z25.s, z18.b, z1.b[2]\n" + "sdot z26.s, z18.b, z2.b[2]\n" + "sdot z27.s, z18.b, z3.b[2]\n" + "sdot z28.s, z18.b, z4.b[2]\n" + "sdot z29.s, z18.b, z5.b[2]\n" + "sdot z30.s, z18.b, z6.b[2]\n" + "sdot z31.s, z18.b, z7.b[2]\n" + "sdot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" + "sdot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n" + "sdot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n" + "sdot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n" + "sdot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n" + "sdot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n" + "sdot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n" + "sdot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" + "sdot z24.s, z20.b, z0.b[0]\n" + "sdot z25.s, z20.b, z1.b[0]\n" + "sdot z26.s, z20.b, z2.b[0]\n" + "sdot z27.s, z20.b, z3.b[0]\n" + "sdot z28.s, z20.b, z4.b[0]\n" + "sdot z29.s, z20.b, z5.b[0]\n" + "sdot z30.s, z20.b, z6.b[0]\n" + "sdot z31.s, z20.b, z7.b[0]\n" + "sdot z24.s, z21.b, z0.b[1]\n" + "sdot z25.s, z21.b, z1.b[1]\n" + "sdot z26.s, z21.b, z2.b[1]\n" + "sdot z27.s, z21.b, z3.b[1]\n" + "sdot z28.s, z21.b, z4.b[1]\n" + "sdot z29.s, z21.b, z5.b[1]\n" + "sdot z30.s, z21.b, z6.b[1]\n" + "sdot z31.s, z21.b, z7.b[1]\n" + "sdot z24.s, z22.b, z0.b[2]\n" + "sdot z25.s, z22.b, z1.b[2]\n" + "sdot z26.s, z22.b, z2.b[2]\n" + "sdot z27.s, z22.b, z3.b[2]\n" + "sdot z28.s, z22.b, z4.b[2]\n" + "sdot z29.s, z22.b, z5.b[2]\n" + "sdot z30.s, z22.b, z6.b[2]\n" + "sdot z31.s, z22.b, z7.b[2]\n" + "sdot z24.s, z23.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" + "sdot z25.s, z23.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n" + "sdot z26.s, z23.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n" + "sdot z27.s, z23.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n" + "sdot z28.s, z23.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n" + "sdot z29.s, z23.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n" + "sdot z30.s, z23.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n" + "sdot z31.s, z23.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "sdot z26.s, z17.b, z2.b[1]\n" + "sdot z27.s, z17.b, z3.b[1]\n" + "sdot z28.s, z17.b, z4.b[1]\n" + "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z31.s, z17.b, z7.b[1]\n" + "b 5f\n" + "2:\n" + "mov z24.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z25.s, #0\n" + "ld1rqb z1.b, 
p7/z, [a_ptr1]\n" + "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "mov z29.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "mov z30.s, #0\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "mov z31.s, #0\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "sdot z26.s, z17.b, z2.b[1]\n" + "sdot z27.s, z17.b, z3.b[1]\n" + "sdot z28.s, z17.b, z4.b[1]\n" + "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z31.s, z17.b, z7.b[1]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "sdot z24.s, z18.b, z0.b[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #2\n" + "sdot z25.s, z18.b, z1.b[2]\n" + "sdot z26.s, z18.b, z2.b[2]\n" + "sdot z27.s, z18.b, z3.b[2]\n" + "sdot z28.s, z18.b, z4.b[2]\n" + "sdot z29.s, z18.b, z5.b[2]\n" + "sdot z30.s, z18.b, z6.b[2]\n" + "sdot z31.s, z18.b, z7.b[2]\n" + "sdot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" + "sdot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n" + "sdot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n" + "sdot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n" + "sdot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n" + "sdot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n" + "sdot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n" + "sdot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" + "sdot z24.s, z20.b, z0.b[0]\n" + "sdot z25.s, z20.b, z1.b[0]\n" + "sdot z26.s, z20.b, z2.b[0]\n" + "sdot z27.s, z20.b, z3.b[0]\n" + "sdot z28.s, z20.b, z4.b[0]\n" + "sdot z29.s, z20.b, z5.b[0]\n" + "sdot z30.s, z20.b, z6.b[0]\n" + "sdot z31.s, z20.b, z7.b[0]\n" + "sdot z24.s, z21.b, z0.b[1]\n" + "sdot z25.s, z21.b, z1.b[1]\n" + "sdot z26.s, z21.b, z2.b[1]\n" + "sdot z27.s, z21.b, z3.b[1]\n" + "sdot z28.s, z21.b, z4.b[1]\n" + "sdot z29.s, z21.b, z5.b[1]\n" + "sdot z30.s, z21.b, z6.b[1]\n" + "sdot z31.s, z21.b, z7.b[1]\n" + "sdot z24.s, z22.b, z0.b[2]\n" + "sdot z25.s, z22.b, z1.b[2]\n" + "sdot z26.s, z22.b, z2.b[2]\n" + "sdot z27.s, z22.b, z3.b[2]\n" + "sdot z28.s, z22.b, z4.b[2]\n" + "sdot z29.s, z22.b, z5.b[2]\n" + "sdot z30.s, z22.b, z6.b[2]\n" + "sdot z31.s, z22.b, z7.b[2]\n" + "sdot z24.s, z23.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" + "sdot z25.s, z23.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n" + "sdot z26.s, z23.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n" + "sdot z27.s, z23.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n" + "sdot z28.s, z23.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n" + "sdot z29.s, z23.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n" + "sdot z30.s, z23.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n" + "sdot z31.s, z23.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, 
z17.b, z1.b[1]\n" + "sdot z26.s, z17.b, z2.b[1]\n" + "sdot z27.s, z17.b, z3.b[1]\n" + "sdot z28.s, z17.b, z4.b[1]\n" + "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z31.s, z17.b, z7.b[1]\n" + "5:\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p0, [c_ptr1]\n" + "st1w z26.s, p0, [c_ptr2]\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 11: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.b\n" + "whilelt p6.b, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + 
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z25.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "mov z29.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "mov z30.s, #0\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "mov z31.s, #0\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "sdot z26.s, z17.b, z2.b[1]\n" + "sdot z27.s, z17.b, z3.b[1]\n" + "sdot z28.s, z17.b, z4.b[1]\n" + "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z31.s, z17.b, z7.b[1]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "sdot z24.s, z18.b, z0.b[2]\n" + "sdot z25.s, z18.b, z1.b[2]\n" + "sdot z26.s, z18.b, z2.b[2]\n" + "sdot z27.s, z18.b, z3.b[2]\n" + "sdot z28.s, z18.b, z4.b[2]\n" + "sdot z29.s, z18.b, z5.b[2]\n" + "sdot z30.s, z18.b, z6.b[2]\n" + "sdot z31.s, z18.b, z7.b[2]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "sdot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" + "sdot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n" + "sdot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n" + "sdot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n" + "sdot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n" + "sdot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n" + "sdot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n" + "sdot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" + "sdot z24.s, z20.b, z0.b[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" + "sdot z25.s, z20.b, z1.b[0]\n" + "sdot z26.s, z20.b, z2.b[0]\n" + "sdot z27.s, z20.b, z3.b[0]\n" + "sdot z28.s, z20.b, z4.b[0]\n" + "sdot z29.s, z20.b, z5.b[0]\n" + "sdot z30.s, z20.b, z6.b[0]\n" + "sdot z31.s, z20.b, z7.b[0]\n" + "sdot z24.s, z21.b, z0.b[1]\n" + "sdot z25.s, z21.b, z1.b[1]\n" + "sdot z26.s, z21.b, z2.b[1]\n" + "sdot z27.s, z21.b, z3.b[1]\n" + "sdot z28.s, z21.b, z4.b[1]\n" + "sdot z29.s, z21.b, z5.b[1]\n" + "sdot z30.s, z21.b, z6.b[1]\n" + "sdot z31.s, z21.b, z7.b[1]\n" + "sdot z24.s, z22.b, z0.b[2]\n" + "sdot z25.s, z22.b, z1.b[2]\n" + "sdot z26.s, z22.b, z2.b[2]\n" + "sdot z27.s, z22.b, z3.b[2]\n" + "sdot z28.s, z22.b, z4.b[2]\n" + "sdot z29.s, z22.b, z5.b[2]\n" + "sdot z30.s, z22.b, z6.b[2]\n" + "sdot z31.s, z22.b, z7.b[2]\n" + "sdot z24.s, z23.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" + "sdot z25.s, z23.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n" + "sdot z26.s, z23.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n" + "sdot z27.s, z23.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n" + "sdot z28.s, z23.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n" + "sdot z29.s, z23.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n" 
+ "sdot z30.s, z23.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n" + "sdot z31.s, z23.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "sdot z26.s, z17.b, z2.b[1]\n" + "sdot z27.s, z17.b, z3.b[1]\n" + "sdot z28.s, z17.b, z4.b[1]\n" + "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z31.s, z17.b, z7.b[1]\n" + "sdot z24.s, z18.b, z0.b[2]\n" + "sdot z25.s, z18.b, z1.b[2]\n" + "sdot z26.s, z18.b, z2.b[2]\n" + "sdot z27.s, z18.b, z3.b[2]\n" + "sdot z28.s, z18.b, z4.b[2]\n" + "sdot z29.s, z18.b, z5.b[2]\n" + "sdot z30.s, z18.b, z6.b[2]\n" + "sdot z31.s, z18.b, z7.b[2]\n" + "b.eq 3f\n" + "4:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z24.s, #0\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + "sdot z26.s, z17.b, z2.b[1]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + "sdot z27.s, z17.b, z3.b[1]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" + "sdot z28.s, z17.b, z4.b[1]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" + "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z31.s, z17.b, z7.b[1]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "sdot z24.s, z18.b, z0.b[2]\n" + "sdot z25.s, z18.b, z1.b[2]\n" + "sdot z26.s, z18.b, z2.b[2]\n" + "sdot z27.s, z18.b, z3.b[2]\n" + "sdot z28.s, z18.b, z4.b[2]\n" + "sdot z29.s, z18.b, z5.b[2]\n" + "sdot z30.s, z18.b, z6.b[2]\n" + "sdot z31.s, z18.b, z7.b[2]\n" + "ld1b z18.b, p7/z, 
[%[b_ptr0], #2, MUL VL]\n" + "sdot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" + "sdot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n" + "sdot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n" + "sdot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n" + "sdot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n" + "sdot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n" + "sdot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n" + "sdot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" + "sdot z24.s, z20.b, z0.b[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" + "sdot z25.s, z20.b, z1.b[0]\n" + "sdot z26.s, z20.b, z2.b[0]\n" + "sdot z27.s, z20.b, z3.b[0]\n" + "sdot z28.s, z20.b, z4.b[0]\n" + "sdot z29.s, z20.b, z5.b[0]\n" + "sdot z30.s, z20.b, z6.b[0]\n" + "sdot z31.s, z20.b, z7.b[0]\n" + "sdot z24.s, z21.b, z0.b[1]\n" + "sdot z25.s, z21.b, z1.b[1]\n" + "sdot z26.s, z21.b, z2.b[1]\n" + "sdot z27.s, z21.b, z3.b[1]\n" + "sdot z28.s, z21.b, z4.b[1]\n" + "sdot z29.s, z21.b, z5.b[1]\n" + "sdot z30.s, z21.b, z6.b[1]\n" + "sdot z31.s, z21.b, z7.b[1]\n" + "sdot z24.s, z22.b, z0.b[2]\n" + "sdot z25.s, z22.b, z1.b[2]\n" + "sdot z26.s, z22.b, z2.b[2]\n" + "sdot z27.s, z22.b, z3.b[2]\n" + "sdot z28.s, z22.b, z4.b[2]\n" + "sdot z29.s, z22.b, z5.b[2]\n" + "sdot z30.s, z22.b, z6.b[2]\n" + "sdot z31.s, z22.b, z7.b[2]\n" + "sdot z24.s, z23.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" + "sdot z25.s, z23.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n" + "sdot z26.s, z23.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n" + "sdot z27.s, z23.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n" + "sdot z28.s, z23.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n" + "sdot z29.s, z23.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n" + "sdot z30.s, z23.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n" + "sdot z31.s, z23.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "sdot z26.s, z17.b, z2.b[1]\n" + "sdot z27.s, z17.b, z3.b[1]\n" + "sdot z28.s, z17.b, z4.b[1]\n" + "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z31.s, z17.b, z7.b[1]\n" + "sdot z24.s, z18.b, z0.b[2]\n" + "sdot z25.s, z18.b, z1.b[2]\n" + "sdot z26.s, z18.b, z2.b[2]\n" + "sdot z27.s, z18.b, z3.b[2]\n" + "sdot z28.s, z18.b, z4.b[2]\n" + "sdot z29.s, z18.b, z5.b[2]\n" + "sdot z30.s, z18.b, z6.b[2]\n" + "sdot z31.s, z18.b, z7.b[2]\n" + "b.ne 4b\n" + "3:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "mov z24.s, #0\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z28.s, #0\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z29.s, #0\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z30.s, #0\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z31.s, #0\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov 
z27.s, #0\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "whilelt p0.s, %[temp], %[last_width]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" - "sdot z24.s, z16.b, z0.b[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "sdot z25.s, z16.b, z1.b[0]\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "sdot z26.s, z16.b, z2.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" "sdot z27.s, z16.b, z3.b[0]\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" "sdot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" "sdot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" "sdot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "sdot z27.s, z17.b, z3.b[1]\n" "sdot z28.s, z17.b, z4.b[1]\n" "sdot z29.s, z17.b, z5.b[1]\n" @@ -3281,7 +4704,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z17.b, z7.b[1]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "sdot z24.s, z18.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #2\n" "sdot z25.s, z18.b, z1.b[2]\n" "sdot z26.s, z18.b, z2.b[2]\n" "sdot z27.s, z18.b, z3.b[2]\n" @@ -3289,6 +4711,7 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" "sdot z31.s, z18.b, z7.b[2]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "sdot z24.s, z19.b, z0.b[3]\n" "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" "sdot z25.s, z19.b, z1.b[3]\n" @@ -3306,6 +4729,7 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" "sdot z24.s, z20.b, z0.b[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" "sdot z25.s, z20.b, z1.b[0]\n" "sdot z26.s, z20.b, z2.b[0]\n" "sdot z27.s, z20.b, z3.b[0]\n" @@ -3361,84 +4785,55 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" - "cbz %[loops], 2f\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "b.eq 3f\n" - "4:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" + "sdot z24.s, z18.b, z0.b[2]\n" + "sdot z25.s, z18.b, z1.b[2]\n" + "sdot z26.s, z18.b, z2.b[2]\n" + "sdot z27.s, z18.b, z3.b[2]\n" + "sdot z28.s, z18.b, 
z4.b[2]\n" + "sdot z29.s, z18.b, z5.b[2]\n" + "sdot z30.s, z18.b, z6.b[2]\n" + "sdot z31.s, z18.b, z7.b[2]\n" + "b 5f\n" + "2:\n" "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "subs %[loops], %[loops], #0x1\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" "mov z25.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "sdot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "addvl c_ptr1, c_ptr1, #1\n" - "sdot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z26.s, z16.b, z2.b[0]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" "sdot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "sdot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" "sdot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" "sdot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" "sdot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" "sdot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "sdot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z31.s, z17.b, z7.b[1]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "sdot z24.s, z18.b, z0.b[2]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z25.s, z18.b, z1.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #2\n" "sdot z26.s, z18.b, z2.b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z27.s, z18.b, z3.b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z28.s, z18.b, z4.b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" "sdot z31.s, z18.b, z7.b[2]\n" @@ -3460,7 +4855,7 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" "sdot z24.s, z20.b, z0.b[0]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" "sdot z25.s, z20.b, z1.b[0]\n" "sdot z26.s, z20.b, z2.b[0]\n" "sdot z27.s, z20.b, z3.b[0]\n" @@ -3468,7 +4863,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z20.b, z5.b[0]\n" "sdot z30.s, z20.b, z6.b[0]\n" "sdot z31.s, z20.b, z7.b[0]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "sdot z24.s, z21.b, z0.b[1]\n" "sdot z25.s, z21.b, z1.b[1]\n" "sdot z26.s, z21.b, z2.b[1]\n" @@ -3477,7 +4871,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z21.b, z5.b[1]\n" "sdot z30.s, z21.b, z6.b[1]\n" "sdot z31.s, z21.b, z7.b[1]\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "sdot z24.s, z22.b, 
z0.b[2]\n" "sdot z25.s, z22.b, z1.b[2]\n" "sdot z26.s, z22.b, z2.b[2]\n" @@ -3486,7 +4879,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z22.b, z5.b[2]\n" "sdot z30.s, z22.b, z6.b[2]\n" "sdot z31.s, z22.b, z7.b[2]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "sdot z24.s, z23.b, z0.b[3]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" "sdot z25.s, z23.b, z1.b[3]\n" @@ -3504,7 +4896,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z23.b, z7.b[3]\n" "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" "sdot z24.s, z16.b, z0.b[0]\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" "sdot z25.s, z16.b, z1.b[0]\n" "sdot z26.s, z16.b, z2.b[0]\n" "sdot z27.s, z16.b, z3.b[0]\n" @@ -3512,7 +4903,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z16.b, z5.b[0]\n" "sdot z30.s, z16.b, z6.b[0]\n" "sdot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "sdot z24.s, z17.b, z0.b[1]\n" "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" @@ -3521,53 +4911,149 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" + "sdot z24.s, z18.b, z0.b[2]\n" + "sdot z25.s, z18.b, z1.b[2]\n" + "sdot z26.s, z18.b, z2.b[2]\n" + "sdot z27.s, z18.b, z3.b[2]\n" + "sdot z28.s, z18.b, z4.b[2]\n" + "sdot z29.s, z18.b, z5.b[2]\n" + "sdot z30.s, z18.b, z6.b[2]\n" + "sdot z31.s, z18.b, z7.b[2]\n" + "5:\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p0, [c_ptr1]\n" + "st1w z26.s, p0, [c_ptr2]\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 12: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" 
+ "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.b\n" + "whilelt p6.b, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "b.ne 4b\n" - "3:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "sdot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "sdot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z26.s, z16.b, z2.b[0]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" "sdot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr3, c_ptr3, #1\n" - "sdot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" "sdot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" "sdot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr6, c_ptr6, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" "sdot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" "sdot z27.s, z17.b, z3.b[1]\n" "sdot z28.s, z17.b, z4.b[1]\n" @@ -3576,7 +5062,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z17.b, z7.b[1]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, 
MUL VL]\n" "sdot z24.s, z18.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #2\n" "sdot z25.s, z18.b, z1.b[2]\n" "sdot z26.s, z18.b, z2.b[2]\n" "sdot z27.s, z18.b, z3.b[2]\n" @@ -3584,6 +5069,7 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" "sdot z31.s, z18.b, z7.b[2]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "sdot z24.s, z19.b, z0.b[3]\n" "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" "sdot z25.s, z19.b, z1.b[3]\n" @@ -3601,7 +5087,9 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" "sdot z24.s, z20.b, z0.b[0]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "sdot z25.s, z20.b, z1.b[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" "sdot z26.s, z20.b, z2.b[0]\n" "sdot z27.s, z20.b, z3.b[0]\n" "sdot z28.s, z20.b, z4.b[0]\n" @@ -3656,149 +5144,88 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" - "2:\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" - "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" - "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" - "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 11: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - 
"add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" + "sdot z24.s, z18.b, z0.b[2]\n" + "sdot z25.s, z18.b, z1.b[2]\n" + "sdot z26.s, z18.b, z2.b[2]\n" + "sdot z27.s, z18.b, z3.b[2]\n" + "sdot z28.s, z18.b, z4.b[2]\n" + "sdot z29.s, z18.b, z5.b[2]\n" + "sdot z30.s, z18.b, z6.b[2]\n" + "sdot z31.s, z18.b, z7.b[2]\n" + "sdot z24.s, z19.b, z0.b[3]\n" + "sdot z25.s, z19.b, z1.b[3]\n" + "sdot z26.s, z19.b, z2.b[3]\n" + "sdot z27.s, z19.b, z3.b[3]\n" + "sdot z28.s, z19.b, z4.b[3]\n" + "sdot z29.s, z19.b, z5.b[3]\n" + "sdot z30.s, z19.b, z6.b[3]\n" + "sdot z31.s, z19.b, z7.b[3]\n" + "b.eq 3f\n" + "4:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" "mov z24.s, #0\n" - "ptrue p7.b\n" - "mov z25.s, #0\n" - "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z28.s, #0\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z29.s, #0\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z30.s, #0\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z31.s, #0\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "whilelt p0.s, %[temp], %[last_width]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" - "sdot z24.s, z16.b, z0.b[0]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "sdot z25.s, z16.b, z1.b[0]\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "sdot z26.s, z16.b, z2.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" "sdot z27.s, z16.b, z3.b[0]\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" "sdot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" "sdot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" + "prfm 
PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "sdot z25.s, z17.b, z1.b[1]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z27.s, z17.b, z3.b[1]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" @@ -3829,8 +5256,9 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" "sdot z24.s, z20.b, z0.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #3\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "sdot z25.s, z20.b, z1.b[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" "sdot z26.s, z20.b, z2.b[0]\n" "sdot z27.s, z20.b, z3.b[0]\n" "sdot z28.s, z20.b, z4.b[0]\n" @@ -3893,82 +5321,79 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" "sdot z31.s, z18.b, z7.b[2]\n" - "cbz %[loops], 2f\n" + "sdot z24.s, z19.b, z0.b[3]\n" + "sdot z25.s, z19.b, z1.b[3]\n" + "sdot z26.s, z19.b, z2.b[3]\n" + "sdot z27.s, z19.b, z3.b[3]\n" + "sdot z28.s, z19.b, z4.b[3]\n" + "sdot z29.s, z19.b, z5.b[3]\n" + "sdot z30.s, z19.b, z6.b[3]\n" + "sdot z31.s, z19.b, z7.b[3]\n" + "b.ne 4b\n" + "3:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "mov z24.s, #0\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "b.eq 3f\n" - "4:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" "addvl %[b_ptr0], %[b_ptr0], #8\n" - "mov z24.s, #0\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "subs %[loops], %[loops], #0x1\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "mov z25.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "sdot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" - "mov z26.s, #0\n" - "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "addvl c_ptr1, c_ptr1, #1\n" - "sdot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" - "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, z0.b[1]\n" "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "sdot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "sdot z25.s, z17.b, z1.b[1]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" - "sdot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "sdot z24.s, z16.b, z0.b[0]\n" "st1w z30.s, p7, [c_ptr6]\n" "mov z30.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" - "sdot z29.s, z16.b, z5.b[0]\n" + 
"ld1rqb z3.b, p7/z, [a_ptr3]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "sdot z26.s, z16.b, z2.b[0]\n" "st1w z31.s, p7, [c_ptr7]\n" "mov z31.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" - "sdot z30.s, z16.b, z6.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z17.b, z0.b[1]\n" "addvl c_ptr6, c_ptr6, #1\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "sdot z30.s, z16.b, z6.b[0]\n" "sdot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" "sdot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "sdot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z31.s, z17.b, z7.b[1]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "sdot z24.s, z18.b, z0.b[2]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z25.s, z18.b, z1.b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z26.s, z18.b, z2.b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z27.s, z18.b, z3.b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z28.s, z18.b, z4.b[2]\n" "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" @@ -3991,16 +5416,15 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" "sdot z24.s, z20.b, z0.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #3\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "sdot z25.s, z20.b, z1.b[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" "sdot z26.s, z20.b, z2.b[0]\n" "sdot z27.s, z20.b, z3.b[0]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "sdot z28.s, z20.b, z4.b[0]\n" "sdot z29.s, z20.b, z5.b[0]\n" "sdot z30.s, z20.b, z6.b[0]\n" "sdot z31.s, z20.b, z7.b[0]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "sdot z24.s, z21.b, z0.b[1]\n" "sdot z25.s, z21.b, z1.b[1]\n" "sdot z26.s, z21.b, z2.b[1]\n" @@ -4009,7 +5433,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z21.b, z5.b[1]\n" "sdot z30.s, z21.b, z6.b[1]\n" "sdot z31.s, z21.b, z7.b[1]\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "sdot z24.s, z22.b, z0.b[2]\n" "sdot z25.s, z22.b, z1.b[2]\n" "sdot z26.s, z22.b, z2.b[2]\n" @@ -4018,7 +5441,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z22.b, z5.b[2]\n" "sdot z30.s, z22.b, z6.b[2]\n" "sdot z31.s, z22.b, z7.b[2]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "sdot z24.s, z23.b, z0.b[3]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" "sdot z25.s, z23.b, z1.b[3]\n" @@ -4036,7 +5458,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z23.b, z7.b[3]\n" "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" "sdot z24.s, z16.b, z0.b[0]\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" "sdot z25.s, z16.b, z1.b[0]\n" "sdot z26.s, z16.b, z2.b[0]\n" "sdot z27.s, z16.b, z3.b[0]\n" @@ -4044,7 +5465,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z16.b, z5.b[0]\n" "sdot z30.s, z16.b, z6.b[0]\n" "sdot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "sdot z24.s, z17.b, z0.b[1]\n" "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" @@ -4053,7 
+5473,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "sdot z24.s, z18.b, z0.b[2]\n" "sdot z25.s, z18.b, z1.b[2]\n" "sdot z26.s, z18.b, z2.b[2]\n" @@ -4062,53 +5481,43 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" "sdot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "b.ne 4b\n" - "3:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" + "sdot z24.s, z19.b, z0.b[3]\n" + "sdot z25.s, z19.b, z1.b[3]\n" + "sdot z26.s, z19.b, z2.b[3]\n" + "sdot z27.s, z19.b, z3.b[3]\n" + "sdot z28.s, z19.b, z4.b[3]\n" + "sdot z29.s, z19.b, z5.b[3]\n" + "sdot z30.s, z19.b, z6.b[3]\n" + "sdot z31.s, z19.b, z7.b[3]\n" + "b 5f\n" + "2:\n" "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "sdot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "sdot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z26.s, z16.b, z2.b[0]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" "sdot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr3, c_ptr3, #1\n" - "sdot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" "sdot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" "sdot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr6, c_ptr6, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" "sdot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" "sdot z27.s, z17.b, z3.b[1]\n" "sdot z28.s, z17.b, z4.b[1]\n" @@ -4142,8 +5551,9 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" "sdot z24.s, z20.b, z0.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #3\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "sdot z25.s, z20.b, z1.b[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" "sdot z26.s, z20.b, z2.b[0]\n" "sdot z27.s, z20.b, z3.b[0]\n" "sdot z28.s, z20.b, z4.b[0]\n" @@ -4206,23 +5616,24 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" "sdot z31.s, z18.b, z7.b[2]\n" - "2:\n" + "sdot z24.s, z19.b, z0.b[3]\n" + "sdot z25.s, z19.b, z1.b[3]\n" + "sdot z26.s, z19.b, z2.b[3]\n" + "sdot z27.s, z19.b, z3.b[3]\n" + "sdot z28.s, z19.b, z4.b[3]\n" + "sdot z29.s, z19.b, z5.b[3]\n" + "sdot z30.s, z19.b, z6.b[3]\n" + "sdot z31.s, z19.b, z7.b[3]\n" + "5:\n" "st1w 
z24.s, p0, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -4242,7 +5653,7 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" ); break; - case 12: + case 13: __asm __volatile ( "a_ptr1 .req X0\n" "a_ptr2 .req X1\n" @@ -4301,54 +5712,242 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "ptrue p7.b\n" + "whilelt p6.b, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z25.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "mov z29.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "mov z30.s, #0\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "mov z31.s, #0\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "sdot z26.s, z17.b, z2.b[1]\n" + "sdot z27.s, z17.b, z3.b[1]\n" + "sdot z28.s, z17.b, z4.b[1]\n" + "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z31.s, z17.b, z7.b[1]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "sdot z24.s, z18.b, z0.b[2]\n" + "sdot z25.s, z18.b, z1.b[2]\n" + "sdot z26.s, z18.b, z2.b[2]\n" + "sdot z27.s, z18.b, z3.b[2]\n" + "sdot z28.s, z18.b, z4.b[2]\n" + "sdot z29.s, z18.b, z5.b[2]\n" + "sdot z30.s, z18.b, z6.b[2]\n" + "sdot z31.s, z18.b, z7.b[2]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "sdot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" + "sdot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n" + "sdot z26.s, 
z19.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n" + "sdot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n" + "sdot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n" + "sdot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n" + "sdot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n" + "sdot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" + "sdot z24.s, z20.b, z0.b[0]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "sdot z25.s, z20.b, z1.b[0]\n" + "sdot z26.s, z20.b, z2.b[0]\n" + "sdot z27.s, z20.b, z3.b[0]\n" + "sdot z28.s, z20.b, z4.b[0]\n" + "sdot z29.s, z20.b, z5.b[0]\n" + "sdot z30.s, z20.b, z6.b[0]\n" + "sdot z31.s, z20.b, z7.b[0]\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "sdot z24.s, z21.b, z0.b[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #5\n" + "sdot z25.s, z21.b, z1.b[1]\n" + "sdot z26.s, z21.b, z2.b[1]\n" + "sdot z27.s, z21.b, z3.b[1]\n" + "sdot z28.s, z21.b, z4.b[1]\n" + "sdot z29.s, z21.b, z5.b[1]\n" + "sdot z30.s, z21.b, z6.b[1]\n" + "sdot z31.s, z21.b, z7.b[1]\n" + "sdot z24.s, z22.b, z0.b[2]\n" + "sdot z25.s, z22.b, z1.b[2]\n" + "sdot z26.s, z22.b, z2.b[2]\n" + "sdot z27.s, z22.b, z3.b[2]\n" + "sdot z28.s, z22.b, z4.b[2]\n" + "sdot z29.s, z22.b, z5.b[2]\n" + "sdot z30.s, z22.b, z6.b[2]\n" + "sdot z31.s, z22.b, z7.b[2]\n" + "sdot z24.s, z23.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n" + "sdot z25.s, z23.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n" + "sdot z26.s, z23.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n" + "sdot z27.s, z23.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n" + "sdot z28.s, z23.b, z4.b[3]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n" + "sdot z29.s, z23.b, z5.b[3]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n" + "sdot z30.s, z23.b, z6.b[3]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n" + "sdot z31.s, z23.b, z7.b[3]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "sdot z26.s, z17.b, z2.b[1]\n" + "sdot z27.s, z17.b, z3.b[1]\n" + "sdot z28.s, z17.b, z4.b[1]\n" + "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z31.s, z17.b, z7.b[1]\n" + "sdot z24.s, z18.b, z0.b[2]\n" + "sdot z25.s, z18.b, z1.b[2]\n" + "sdot z26.s, z18.b, z2.b[2]\n" + "sdot z27.s, z18.b, z3.b[2]\n" + "sdot z28.s, z18.b, z4.b[2]\n" + "sdot z29.s, z18.b, z5.b[2]\n" + "sdot z30.s, z18.b, z6.b[2]\n" + "sdot z31.s, z18.b, z7.b[2]\n" + "sdot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n" + "sdot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n" + "sdot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n" + "sdot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n" + "sdot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n" + "sdot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n" + "sdot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n" + "sdot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n" + "sdot z24.s, z20.b, z0.b[0]\n" + "sdot z25.s, z20.b, z1.b[0]\n" + "sdot z26.s, z20.b, z2.b[0]\n" + "sdot z27.s, z20.b, z3.b[0]\n" + "sdot z28.s, z20.b, z4.b[0]\n" + "sdot z29.s, z20.b, z5.b[0]\n" + "sdot z30.s, 
z20.b, z6.b[0]\n" + "sdot z31.s, z20.b, z7.b[0]\n" + "b.eq 3f\n" + "4:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" "mov z24.s, #0\n" - "ptrue p7.b\n" - "mov z25.s, #0\n" - "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z28.s, #0\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z29.s, #0\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z30.s, #0\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z31.s, #0\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "whilelt p0.s, %[temp], %[last_width]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" - "sdot z24.s, z16.b, z0.b[0]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "sdot z25.s, z16.b, z1.b[0]\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "sdot z26.s, z16.b, z2.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" "sdot z27.s, z16.b, z3.b[0]\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" "sdot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" "sdot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "sdot z25.s, z17.b, z1.b[1]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z27.s, z17.b, z3.b[1]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" @@ -4381,14 +5980,15 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z24.s, z20.b, z0.b[0]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "sdot z25.s, z20.b, z1.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" "sdot z26.s, z20.b, z2.b[0]\n" "sdot z27.s, z20.b, z3.b[0]\n" "sdot z28.s, z20.b, z4.b[0]\n" "sdot z29.s, z20.b, z5.b[0]\n" "sdot z30.s, z20.b, z6.b[0]\n" "sdot z31.s, z20.b, z7.b[0]\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "sdot z24.s, z21.b, z0.b[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #5\n" "sdot z25.s, z21.b, z1.b[1]\n" "sdot z26.s, z21.b, z2.b[1]\n" "sdot z27.s, z21.b, 
z3.b[1]\n" @@ -4405,21 +6005,21 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z30.s, z22.b, z6.b[2]\n" "sdot z31.s, z22.b, z7.b[2]\n" "sdot z24.s, z23.b, z0.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n" "sdot z25.s, z23.b, z1.b[3]\n" - "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n" "sdot z26.s, z23.b, z2.b[3]\n" - "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n" "sdot z27.s, z23.b, z3.b[3]\n" - "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n" "sdot z28.s, z23.b, z4.b[3]\n" - "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n" "sdot z29.s, z23.b, z5.b[3]\n" - "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n" "sdot z30.s, z23.b, z6.b[3]\n" - "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n" "sdot z31.s, z23.b, z7.b[3]\n" - "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n" "sdot z24.s, z16.b, z0.b[0]\n" "sdot z25.s, z16.b, z1.b[0]\n" "sdot z26.s, z16.b, z2.b[0]\n" @@ -4445,89 +6045,94 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z30.s, z18.b, z6.b[2]\n" "sdot z31.s, z18.b, z7.b[2]\n" "sdot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n" "sdot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n" "sdot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n" "sdot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n" "sdot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n" "sdot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n" "sdot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n" "sdot z31.s, z19.b, z7.b[3]\n" - "cbz %[loops], 2f\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n" + "sdot z24.s, z20.b, z0.b[0]\n" + "sdot z25.s, z20.b, z1.b[0]\n" + "sdot z26.s, z20.b, z2.b[0]\n" + "sdot z27.s, z20.b, z3.b[0]\n" + "sdot z28.s, z20.b, z4.b[0]\n" + "sdot z29.s, z20.b, z5.b[0]\n" + "sdot z30.s, z20.b, z6.b[0]\n" + "sdot z31.s, z20.b, z7.b[0]\n" + "b.ne 4b\n" + "3:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "mov z24.s, #0\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "b.eq 3f\n" - "4:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" "addvl %[b_ptr0], %[b_ptr0], #8\n" - "mov z24.s, #0\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "subs %[loops], %[loops], #0x1\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "mov z25.s, #0\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" "sdot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" - "mov z26.s, #0\n" + "st1w 
z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "addvl c_ptr1, c_ptr1, #1\n" "sdot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" - "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" "sdot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" "sdot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "sdot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" "sdot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" "sdot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" + "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" "sdot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" "sdot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "sdot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z31.s, z17.b, z7.b[1]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "sdot z24.s, z18.b, z0.b[2]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z25.s, z18.b, z1.b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z26.s, z18.b, z2.b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z27.s, z18.b, z3.b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z28.s, z18.b, z4.b[2]\n" "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" @@ -4552,7 +6157,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z24.s, z20.b, z0.b[0]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "sdot z25.s, z20.b, z1.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" "sdot z26.s, z20.b, z2.b[0]\n" "sdot z27.s, z20.b, z3.b[0]\n" "sdot z28.s, z20.b, z4.b[0]\n" @@ -4561,6 +6165,7 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z20.b, z7.b[0]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "sdot z24.s, z21.b, z0.b[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #5\n" "sdot z25.s, z21.b, z1.b[1]\n" "sdot z26.s, z21.b, z2.b[1]\n" "sdot z27.s, z21.b, z3.b[1]\n" @@ -4568,7 +6173,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z21.b, z5.b[1]\n" "sdot z30.s, z21.b, z6.b[1]\n" "sdot z31.s, z21.b, z7.b[1]\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "sdot z24.s, z22.b, z0.b[2]\n" "sdot z25.s, z22.b, z1.b[2]\n" "sdot z26.s, z22.b, z2.b[2]\n" @@ -4577,25 +6181,23 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z22.b, z5.b[2]\n" "sdot z30.s, z22.b, z6.b[2]\n" "sdot z31.s, z22.b, z7.b[2]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "sdot z24.s, z23.b, z0.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n" "sdot z25.s, z23.b, z1.b[3]\n" - "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n" + "ld1rqb z1.b, p7/z, 
[a_ptr1, #0x20]\n" "sdot z26.s, z23.b, z2.b[3]\n" - "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n" "sdot z27.s, z23.b, z3.b[3]\n" - "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n" "sdot z28.s, z23.b, z4.b[3]\n" - "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n" "sdot z29.s, z23.b, z5.b[3]\n" - "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n" "sdot z30.s, z23.b, z6.b[3]\n" - "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n" "sdot z31.s, z23.b, z7.b[3]\n" - "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n" "sdot z24.s, z16.b, z0.b[0]\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" "sdot z25.s, z16.b, z1.b[0]\n" "sdot z26.s, z16.b, z2.b[0]\n" "sdot z27.s, z16.b, z3.b[0]\n" @@ -4603,7 +6205,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z16.b, z5.b[0]\n" "sdot z30.s, z16.b, z6.b[0]\n" "sdot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "sdot z24.s, z17.b, z0.b[1]\n" "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" @@ -4612,7 +6213,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "sdot z24.s, z18.b, z0.b[2]\n" "sdot z25.s, z18.b, z1.b[2]\n" "sdot z26.s, z18.b, z2.b[2]\n" @@ -4621,62 +6221,59 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" "sdot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "sdot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n" "sdot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n" "sdot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n" "sdot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n" "sdot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n" "sdot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n" "sdot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n" "sdot z31.s, z19.b, z7.b[3]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "b.ne 4b\n" - "3:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n" + "sdot z24.s, z20.b, z0.b[0]\n" + "sdot z25.s, z20.b, z1.b[0]\n" + "sdot z26.s, z20.b, z2.b[0]\n" + "sdot z27.s, z20.b, z3.b[0]\n" + "sdot z28.s, z20.b, z4.b[0]\n" + "sdot z29.s, z20.b, z5.b[0]\n" + "sdot z30.s, z20.b, z6.b[0]\n" + "sdot z31.s, z20.b, z7.b[0]\n" + "b 5f\n" + "2:\n" "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "sdot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "sdot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z26.s, z16.b, z2.b[0]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, 
z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" "sdot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr3, c_ptr3, #1\n" - "sdot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" "sdot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" "sdot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr6, c_ptr6, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" "sdot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" "sdot z27.s, z17.b, z3.b[1]\n" "sdot z28.s, z17.b, z4.b[1]\n" @@ -4712,14 +6309,15 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z24.s, z20.b, z0.b[0]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "sdot z25.s, z20.b, z1.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" "sdot z26.s, z20.b, z2.b[0]\n" "sdot z27.s, z20.b, z3.b[0]\n" "sdot z28.s, z20.b, z4.b[0]\n" "sdot z29.s, z20.b, z5.b[0]\n" "sdot z30.s, z20.b, z6.b[0]\n" "sdot z31.s, z20.b, z7.b[0]\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "sdot z24.s, z21.b, z0.b[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #5\n" "sdot z25.s, z21.b, z1.b[1]\n" "sdot z26.s, z21.b, z2.b[1]\n" "sdot z27.s, z21.b, z3.b[1]\n" @@ -4736,21 +6334,21 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z30.s, z22.b, z6.b[2]\n" "sdot z31.s, z22.b, z7.b[2]\n" "sdot z24.s, z23.b, z0.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n" "sdot z25.s, z23.b, z1.b[3]\n" - "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n" "sdot z26.s, z23.b, z2.b[3]\n" - "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n" "sdot z27.s, z23.b, z3.b[3]\n" - "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n" "sdot z28.s, z23.b, z4.b[3]\n" - "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n" "sdot z29.s, z23.b, z5.b[3]\n" - "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n" "sdot z30.s, z23.b, z6.b[3]\n" - "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n" "sdot z31.s, z23.b, z7.b[3]\n" - "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n" "sdot z24.s, z16.b, z0.b[0]\n" "sdot z25.s, z16.b, z1.b[0]\n" "sdot z26.s, z16.b, z2.b[0]\n" @@ -4776,30 +6374,39 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z30.s, z18.b, z6.b[2]\n" "sdot z31.s, z18.b, z7.b[2]\n" "sdot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n" "sdot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n" "sdot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n" "sdot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n" "sdot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n" "sdot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n" "sdot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n" "sdot z31.s, z19.b, z7.b[3]\n" - "2:\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n" + "sdot z24.s, z20.b, z0.b[0]\n" + "sdot z25.s, z20.b, z1.b[0]\n" + "sdot z26.s, 
z20.b, z2.b[0]\n" + "sdot z27.s, z20.b, z3.b[0]\n" + "sdot z28.s, z20.b, z4.b[0]\n" + "sdot z29.s, z20.b, z5.b[0]\n" + "sdot z30.s, z20.b, z6.b[0]\n" + "sdot z31.s, z20.b, z7.b[0]\n" + "5:\n" "st1w z24.s, p0, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -4819,7 +6426,7 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" ); break; - case 13: + case 14: __asm __volatile ( "a_ptr1 .req X0\n" "a_ptr2 .req X1\n" @@ -4878,250 +6485,64 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" - "mov z24.s, #0\n" "ptrue p7.b\n" - "mov z25.s, #0\n" "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z28.s, #0\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z29.s, #0\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z30.s, #0\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z31.s, #0\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" - "sdot z24.s, z16.b, z0.b[0]\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" - "sdot z25.s, z16.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "sdot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z27.s, z16.b, z3.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "sdot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" - "sdot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" - "sdot z30.s, z16.b, z6.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "sdot z31.s, z16.b, z7.b[0]\n" - "sdot z25.s, z17.b, z1.b[1]\n" - "sdot z26.s, z17.b, z2.b[1]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "sdot z27.s, z17.b, z3.b[1]\n" - "sdot z28.s, z17.b, z4.b[1]\n" - "sdot z29.s, z17.b, z5.b[1]\n" - "sdot z30.s, z17.b, z6.b[1]\n" - "sdot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "sdot z24.s, z18.b, z0.b[2]\n" - "sdot z25.s, z18.b, z1.b[2]\n" - "sdot z26.s, z18.b, z2.b[2]\n" - "sdot z27.s, z18.b, z3.b[2]\n" - "sdot z28.s, z18.b, z4.b[2]\n" - "sdot z29.s, z18.b, z5.b[2]\n" - "sdot z30.s, z18.b, z6.b[2]\n" - "sdot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "sdot z24.s, z19.b, z0.b[3]\n" - 
"ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" - "sdot z25.s, z19.b, z1.b[3]\n" - "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n" - "sdot z26.s, z19.b, z2.b[3]\n" - "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n" - "sdot z27.s, z19.b, z3.b[3]\n" - "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n" - "sdot z28.s, z19.b, z4.b[3]\n" - "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n" - "sdot z29.s, z19.b, z5.b[3]\n" - "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n" - "sdot z30.s, z19.b, z6.b[3]\n" - "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n" - "sdot z31.s, z19.b, z7.b[3]\n" - "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" - "sdot z24.s, z20.b, z0.b[0]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z25.s, z20.b, z1.b[0]\n" - "sdot z26.s, z20.b, z2.b[0]\n" - "sdot z27.s, z20.b, z3.b[0]\n" - "sdot z28.s, z20.b, z4.b[0]\n" - "sdot z29.s, z20.b, z5.b[0]\n" - "sdot z30.s, z20.b, z6.b[0]\n" - "sdot z31.s, z20.b, z7.b[0]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z24.s, z21.b, z0.b[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #5\n" - "sdot z25.s, z21.b, z1.b[1]\n" - "sdot z26.s, z21.b, z2.b[1]\n" - "sdot z27.s, z21.b, z3.b[1]\n" - "sdot z28.s, z21.b, z4.b[1]\n" - "sdot z29.s, z21.b, z5.b[1]\n" - "sdot z30.s, z21.b, z6.b[1]\n" - "sdot z31.s, z21.b, z7.b[1]\n" - "sdot z24.s, z22.b, z0.b[2]\n" - "sdot z25.s, z22.b, z1.b[2]\n" - "sdot z26.s, z22.b, z2.b[2]\n" - "sdot z27.s, z22.b, z3.b[2]\n" - "sdot z28.s, z22.b, z4.b[2]\n" - "sdot z29.s, z22.b, z5.b[2]\n" - "sdot z30.s, z22.b, z6.b[2]\n" - "sdot z31.s, z22.b, z7.b[2]\n" - "sdot z24.s, z23.b, z0.b[3]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n" - "sdot z25.s, z23.b, z1.b[3]\n" - "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n" - "sdot z26.s, z23.b, z2.b[3]\n" - "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n" - "sdot z27.s, z23.b, z3.b[3]\n" - "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n" - "sdot z28.s, z23.b, z4.b[3]\n" - "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n" - "sdot z29.s, z23.b, z5.b[3]\n" - "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n" - "sdot z30.s, z23.b, z6.b[3]\n" - "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n" - "sdot z31.s, z23.b, z7.b[3]\n" - "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n" - "sdot z24.s, z16.b, z0.b[0]\n" - "sdot z25.s, z16.b, z1.b[0]\n" - "sdot z26.s, z16.b, z2.b[0]\n" - "sdot z27.s, z16.b, z3.b[0]\n" - "sdot z28.s, z16.b, z4.b[0]\n" - "sdot z29.s, z16.b, z5.b[0]\n" - "sdot z30.s, z16.b, z6.b[0]\n" - "sdot z31.s, z16.b, z7.b[0]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "sdot z25.s, z17.b, z1.b[1]\n" - "sdot z26.s, z17.b, z2.b[1]\n" - "sdot z27.s, z17.b, z3.b[1]\n" - "sdot z28.s, z17.b, z4.b[1]\n" - "sdot z29.s, z17.b, z5.b[1]\n" - "sdot z30.s, z17.b, z6.b[1]\n" - "sdot z31.s, z17.b, z7.b[1]\n" - "sdot z24.s, z18.b, z0.b[2]\n" - "sdot z25.s, z18.b, z1.b[2]\n" - "sdot z26.s, z18.b, z2.b[2]\n" - "sdot z27.s, z18.b, z3.b[2]\n" - "sdot z28.s, z18.b, z4.b[2]\n" - "sdot z29.s, z18.b, z5.b[2]\n" - "sdot z30.s, z18.b, z6.b[2]\n" - "sdot z31.s, z18.b, z7.b[2]\n" - "sdot z24.s, z19.b, z0.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n" - "sdot z25.s, z19.b, z1.b[3]\n" - "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n" - "sdot z26.s, z19.b, z2.b[3]\n" - "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n" - "sdot z27.s, z19.b, z3.b[3]\n" - "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n" - "sdot z28.s, z19.b, z4.b[3]\n" - "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n" - "sdot z29.s, z19.b, z5.b[3]\n" - "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n" - "sdot z30.s, z19.b, z6.b[3]\n" - "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n" - "sdot z31.s, z19.b, z7.b[3]\n" - "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n" - "sdot z24.s, z20.b, z0.b[0]\n" - "sdot z25.s, z20.b, z1.b[0]\n" - "sdot z26.s, z20.b, 
z2.b[0]\n" - "sdot z27.s, z20.b, z3.b[0]\n" - "sdot z28.s, z20.b, z4.b[0]\n" - "sdot z29.s, z20.b, z5.b[0]\n" - "sdot z30.s, z20.b, z6.b[0]\n" - "sdot z31.s, z20.b, z7.b[0]\n" - "cbz %[loops], 2f\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "b.eq 3f\n" - "4:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "subs %[loops], %[loops], #0x1\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" "mov z25.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "sdot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "addvl c_ptr1, c_ptr1, #1\n" - "sdot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z26.s, z16.b, z2.b[0]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" "sdot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "sdot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" "sdot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" "sdot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" "sdot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" "sdot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "sdot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z31.s, z17.b, z7.b[1]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "sdot z24.s, z18.b, z0.b[2]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z25.s, z18.b, z1.b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z26.s, z18.b, z2.b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z27.s, z18.b, z3.b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z28.s, z18.b, z4.b[2]\n" "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" @@ -5154,7 +6575,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z20.b, z7.b[0]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "sdot z24.s, z21.b, z0.b[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #5\n" 
"sdot z25.s, z21.b, z1.b[1]\n" "sdot z26.s, z21.b, z2.b[1]\n" "sdot z27.s, z21.b, z3.b[1]\n" @@ -5164,6 +6584,7 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z21.b, z7.b[1]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "sdot z24.s, z22.b, z0.b[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #6\n" "sdot z25.s, z22.b, z1.b[2]\n" "sdot z26.s, z22.b, z2.b[2]\n" "sdot z27.s, z22.b, z3.b[2]\n" @@ -5171,7 +6592,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z22.b, z5.b[2]\n" "sdot z30.s, z22.b, z6.b[2]\n" "sdot z31.s, z22.b, z7.b[2]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "sdot z24.s, z23.b, z0.b[3]\n" "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n" "sdot z25.s, z23.b, z1.b[3]\n" @@ -5189,7 +6609,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z23.b, z7.b[3]\n" "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n" "sdot z24.s, z16.b, z0.b[0]\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" "sdot z25.s, z16.b, z1.b[0]\n" "sdot z26.s, z16.b, z2.b[0]\n" "sdot z27.s, z16.b, z3.b[0]\n" @@ -5197,7 +6616,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z16.b, z5.b[0]\n" "sdot z30.s, z16.b, z6.b[0]\n" "sdot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "sdot z24.s, z17.b, z0.b[1]\n" "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" @@ -5206,7 +6624,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "sdot z24.s, z18.b, z0.b[2]\n" "sdot z25.s, z18.b, z1.b[2]\n" "sdot z26.s, z18.b, z2.b[2]\n" @@ -5215,7 +6632,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" "sdot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "sdot z24.s, z19.b, z0.b[3]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n" "sdot z25.s, z19.b, z1.b[3]\n" @@ -5233,7 +6649,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n" "sdot z24.s, z20.b, z0.b[0]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "sdot z25.s, z20.b, z1.b[0]\n" "sdot z26.s, z20.b, z2.b[0]\n" "sdot z27.s, z20.b, z3.b[0]\n" @@ -5241,56 +6656,80 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z20.b, z5.b[0]\n" "sdot z30.s, z20.b, z6.b[0]\n" "sdot z31.s, z20.b, z7.b[0]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "b.ne 4b\n" - "3:\n" + "sdot z24.s, z21.b, z0.b[1]\n" + "sdot z25.s, z21.b, z1.b[1]\n" + "sdot z26.s, z21.b, z2.b[1]\n" + "sdot z27.s, z21.b, z3.b[1]\n" + "sdot z28.s, z21.b, z4.b[1]\n" + "sdot z29.s, z21.b, z5.b[1]\n" + "sdot z30.s, z21.b, z6.b[1]\n" + "sdot z31.s, z21.b, z7.b[1]\n" + "b.eq 3f\n" + "4:\n" "st1w z24.s, p7, [%[c_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" + "subs %[loops], %[loops], #0x1\n" "mov z24.s, #0\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "sdot z24.s, z16.b, z0.b[0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b 
z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" - "mov z26.s, #0\n" - "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "ld1rqb z4.b, p7/z, [a_ptr4]\n" "addvl c_ptr2, c_ptr2, #1\n" - "sdot z25.s, z16.b, z1.b[0]\n" + "mov z26.s, #0\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" - "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, z0.b[1]\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" "mov z28.s, #0\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "sdot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr3, c_ptr3, #1\n" - "sdot z25.s, z17.b, z1.b[1]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" - "sdot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "sdot z24.s, z16.b, z0.b[0]\n" "st1w z30.s, p7, [c_ptr6]\n" "mov z30.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" - "sdot z29.s, z16.b, z5.b[0]\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "sdot z26.s, z16.b, z2.b[0]\n" "st1w z31.s, p7, [c_ptr7]\n" "mov z31.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z17.b, z0.b[1]\n" "addvl c_ptr6, c_ptr6, #1\n" - "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z26.s, z17.b, z2.b[1]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z27.s, z17.b, z3.b[1]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" @@ -5331,7 +6770,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z20.b, z7.b[0]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "sdot z24.s, z21.b, z0.b[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #5\n" "sdot z25.s, z21.b, z1.b[1]\n" "sdot z26.s, z21.b, z2.b[1]\n" "sdot z27.s, z21.b, z3.b[1]\n" @@ -5339,7 +6777,9 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z21.b, z5.b[1]\n" "sdot z30.s, z21.b, z6.b[1]\n" "sdot z31.s, z21.b, z7.b[1]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "sdot z24.s, z22.b, z0.b[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #6\n" "sdot z25.s, z22.b, z1.b[2]\n" "sdot z26.s, z22.b, z2.b[2]\n" "sdot z27.s, z22.b, z3.b[2]\n" @@ -5380,178 +6820,100 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" "sdot z24.s, z18.b, z0.b[2]\n" - "sdot z25.s, z18.b, z1.b[2]\n" - "sdot z26.s, z18.b, z2.b[2]\n" - "sdot z27.s, z18.b, z3.b[2]\n" - "sdot z28.s, z18.b, z4.b[2]\n" - "sdot z29.s, z18.b, z5.b[2]\n" - "sdot z30.s, z18.b, z6.b[2]\n" 
- "sdot z31.s, z18.b, z7.b[2]\n" - "sdot z24.s, z19.b, z0.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n" - "sdot z25.s, z19.b, z1.b[3]\n" - "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n" - "sdot z26.s, z19.b, z2.b[3]\n" - "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n" - "sdot z27.s, z19.b, z3.b[3]\n" - "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n" - "sdot z28.s, z19.b, z4.b[3]\n" - "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n" - "sdot z29.s, z19.b, z5.b[3]\n" - "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n" - "sdot z30.s, z19.b, z6.b[3]\n" - "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n" - "sdot z31.s, z19.b, z7.b[3]\n" - "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n" - "sdot z24.s, z20.b, z0.b[0]\n" - "sdot z25.s, z20.b, z1.b[0]\n" - "sdot z26.s, z20.b, z2.b[0]\n" - "sdot z27.s, z20.b, z3.b[0]\n" - "sdot z28.s, z20.b, z4.b[0]\n" - "sdot z29.s, z20.b, z5.b[0]\n" - "sdot z30.s, z20.b, z6.b[0]\n" - "sdot z31.s, z20.b, z7.b[0]\n" - "2:\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" - "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" - "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" - "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 14: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], 
#0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" + "sdot z25.s, z18.b, z1.b[2]\n" + "sdot z26.s, z18.b, z2.b[2]\n" + "sdot z27.s, z18.b, z3.b[2]\n" + "sdot z28.s, z18.b, z4.b[2]\n" + "sdot z29.s, z18.b, z5.b[2]\n" + "sdot z30.s, z18.b, z6.b[2]\n" + "sdot z31.s, z18.b, z7.b[2]\n" + "sdot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n" + "sdot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n" + "sdot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n" + "sdot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n" + "sdot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n" + "sdot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n" + "sdot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n" + "sdot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n" + "sdot z24.s, z20.b, z0.b[0]\n" + "sdot z25.s, z20.b, z1.b[0]\n" + "sdot z26.s, z20.b, z2.b[0]\n" + "sdot z27.s, z20.b, z3.b[0]\n" + "sdot z28.s, z20.b, z4.b[0]\n" + "sdot z29.s, z20.b, z5.b[0]\n" + "sdot z30.s, z20.b, z6.b[0]\n" + "sdot z31.s, z20.b, z7.b[0]\n" + "sdot z24.s, z21.b, z0.b[1]\n" + "sdot z25.s, z21.b, z1.b[1]\n" + "sdot z26.s, z21.b, z2.b[1]\n" + "sdot z27.s, z21.b, z3.b[1]\n" + "sdot z28.s, z21.b, z4.b[1]\n" + "sdot z29.s, z21.b, z5.b[1]\n" + "sdot z30.s, z21.b, z6.b[1]\n" + "sdot z31.s, z21.b, z7.b[1]\n" + "b.ne 4b\n" + "3:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" "mov z24.s, #0\n" - "ptrue p7.b\n" - "mov z25.s, #0\n" - "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z28.s, #0\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z29.s, #0\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z30.s, #0\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z31.s, #0\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "whilelt p0.s, %[temp], %[last_width]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" - "sdot z24.s, z16.b, z0.b[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "sdot z25.s, z16.b, z1.b[0]\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "sdot z26.s, z16.b, z2.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" "sdot z27.s, z16.b, z3.b[0]\n" "ld1rqb 
z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" "sdot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" "sdot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" "sdot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "sdot z27.s, z17.b, z3.b[1]\n" "sdot z28.s, z17.b, z4.b[1]\n" "sdot z29.s, z17.b, z5.b[1]\n" @@ -5683,82 +7045,46 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z21.b, z5.b[1]\n" "sdot z30.s, z21.b, z6.b[1]\n" "sdot z31.s, z21.b, z7.b[1]\n" - "cbz %[loops], 2f\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "b.eq 3f\n" - "4:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" + "b 5f\n" + "2:\n" "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "subs %[loops], %[loops], #0x1\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" "mov z25.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "sdot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "addvl c_ptr1, c_ptr1, #1\n" - "sdot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z26.s, z16.b, z2.b[0]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" "sdot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "sdot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" "sdot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" "sdot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" "sdot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" "sdot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "sdot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z31.s, z17.b, z7.b[1]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "sdot z24.s, z18.b, z0.b[2]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z25.s, z18.b, z1.b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, 
#0x40]\n" "sdot z26.s, z18.b, z2.b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z27.s, z18.b, z3.b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z28.s, z18.b, z4.b[2]\n" "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" @@ -5808,7 +7134,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z22.b, z5.b[2]\n" "sdot z30.s, z22.b, z6.b[2]\n" "sdot z31.s, z22.b, z7.b[2]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "sdot z24.s, z23.b, z0.b[3]\n" "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n" "sdot z25.s, z23.b, z1.b[3]\n" @@ -5826,7 +7151,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z23.b, z7.b[3]\n" "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n" "sdot z24.s, z16.b, z0.b[0]\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" "sdot z25.s, z16.b, z1.b[0]\n" "sdot z26.s, z16.b, z2.b[0]\n" "sdot z27.s, z16.b, z3.b[0]\n" @@ -5834,7 +7158,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z16.b, z5.b[0]\n" "sdot z30.s, z16.b, z6.b[0]\n" "sdot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "sdot z24.s, z17.b, z0.b[1]\n" "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" @@ -5843,7 +7166,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "sdot z24.s, z18.b, z0.b[2]\n" "sdot z25.s, z18.b, z1.b[2]\n" "sdot z26.s, z18.b, z2.b[2]\n" @@ -5852,7 +7174,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" "sdot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "sdot z24.s, z19.b, z0.b[3]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n" "sdot z25.s, z19.b, z1.b[3]\n" @@ -5870,70 +7191,156 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n" "sdot z24.s, z20.b, z0.b[0]\n" + "sdot z25.s, z20.b, z1.b[0]\n" + "sdot z26.s, z20.b, z2.b[0]\n" + "sdot z27.s, z20.b, z3.b[0]\n" + "sdot z28.s, z20.b, z4.b[0]\n" + "sdot z29.s, z20.b, z5.b[0]\n" + "sdot z30.s, z20.b, z6.b[0]\n" + "sdot z31.s, z20.b, z7.b[0]\n" + "sdot z24.s, z21.b, z0.b[1]\n" + "sdot z25.s, z21.b, z1.b[1]\n" + "sdot z26.s, z21.b, z2.b[1]\n" + "sdot z27.s, z21.b, z3.b[1]\n" + "sdot z28.s, z21.b, z4.b[1]\n" + "sdot z29.s, z21.b, z5.b[1]\n" + "sdot z30.s, z21.b, z6.b[1]\n" + "sdot z31.s, z21.b, z7.b[1]\n" + "5:\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p0, [c_ptr1]\n" + "st1w z26.s, p0, [c_ptr2]\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", 
"x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 15: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.b\n" + "whilelt p6.b, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "sdot z25.s, z20.b, z1.b[0]\n" - "sdot z26.s, z20.b, z2.b[0]\n" - "sdot z27.s, z20.b, z3.b[0]\n" - "sdot z28.s, z20.b, z4.b[0]\n" - "sdot z29.s, z20.b, z5.b[0]\n" - "sdot z30.s, z20.b, z6.b[0]\n" - "sdot z31.s, z20.b, z7.b[0]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "sdot z24.s, z21.b, z0.b[1]\n" - "sdot z25.s, z21.b, z1.b[1]\n" - "sdot z26.s, z21.b, z2.b[1]\n" - "sdot z27.s, z21.b, z3.b[1]\n" - "sdot z28.s, z21.b, z4.b[1]\n" - "sdot z29.s, z21.b, z5.b[1]\n" - "sdot z30.s, z21.b, z6.b[1]\n" - "sdot z31.s, z21.b, z7.b[1]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "b.ne 4b\n" - "3:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "sdot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, 
p7, [c_ptr2]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "sdot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z26.s, z16.b, z2.b[0]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" "sdot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr3, c_ptr3, #1\n" - "sdot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" "sdot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" "sdot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr6, c_ptr6, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" "sdot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" "sdot z27.s, z17.b, z3.b[1]\n" "sdot z28.s, z17.b, z4.b[1]\n" @@ -5986,7 +7393,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z21.b, z7.b[1]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "sdot z24.s, z22.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" "sdot z25.s, z22.b, z1.b[2]\n" "sdot z26.s, z22.b, z2.b[2]\n" "sdot z27.s, z22.b, z3.b[2]\n" @@ -5994,6 +7400,7 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z22.b, z5.b[2]\n" "sdot z30.s, z22.b, z6.b[2]\n" "sdot z31.s, z22.b, z7.b[2]\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "sdot z24.s, z23.b, z0.b[3]\n" "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n" "sdot z25.s, z23.b, z1.b[3]\n" @@ -6011,6 +7418,7 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z23.b, z7.b[3]\n" "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n" "sdot z24.s, z16.b, z0.b[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #7\n" "sdot z25.s, z16.b, z1.b[0]\n" "sdot z26.s, z16.b, z2.b[0]\n" "sdot z27.s, z16.b, z3.b[0]\n" @@ -6066,149 +7474,80 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z21.b, z5.b[1]\n" "sdot z30.s, z21.b, z6.b[1]\n" "sdot z31.s, z21.b, z7.b[1]\n" - "2:\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" - "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" - "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" - "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" 
(oob_rows), [temp] "+r" (temp), [odds] "+r" (odds) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 15: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" + "sdot z24.s, z22.b, z0.b[2]\n" + "sdot z25.s, z22.b, z1.b[2]\n" + "sdot z26.s, z22.b, z2.b[2]\n" + "sdot z27.s, z22.b, z3.b[2]\n" + "sdot z28.s, z22.b, z4.b[2]\n" + "sdot z29.s, z22.b, z5.b[2]\n" + "sdot z30.s, z22.b, z6.b[2]\n" + "sdot z31.s, z22.b, z7.b[2]\n" + "b.eq 3f\n" + "4:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" "mov z24.s, #0\n" - "ptrue p7.b\n" - "mov z25.s, #0\n" - "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z28.s, #0\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z29.s, #0\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z30.s, #0\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z31.s, #0\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "whilelt p0.s, %[temp], %[last_width]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "addvl 
%[b_ptr0], %[b_ptr0], #8\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" - "sdot z24.s, z16.b, z0.b[0]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "sdot z25.s, z16.b, z1.b[0]\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "sdot z26.s, z16.b, z2.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" "sdot z27.s, z16.b, z3.b[0]\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" "sdot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" "sdot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "sdot z25.s, z17.b, z1.b[1]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z27.s, z17.b, z3.b[1]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" @@ -6347,82 +7686,71 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z22.b, z5.b[2]\n" "sdot z30.s, z22.b, z6.b[2]\n" "sdot z31.s, z22.b, z7.b[2]\n" - "cbz %[loops], 2f\n" + "b.ne 4b\n" + "3:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "mov z24.s, #0\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "b.eq 3f\n" - "4:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" "addvl %[b_ptr0], %[b_ptr0], #8\n" - "mov z24.s, #0\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "subs %[loops], %[loops], #0x1\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "mov z25.s, #0\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" "sdot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" - "mov z26.s, #0\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "addvl c_ptr1, c_ptr1, #1\n" "sdot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" - "ld1rqb z5.b, p7/z, 
[a_ptr5]\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" "sdot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" "sdot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "sdot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" "sdot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" "sdot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" + "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" "sdot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" "sdot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "sdot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z31.s, z17.b, z7.b[1]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "sdot z24.s, z18.b, z0.b[2]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z25.s, z18.b, z1.b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z26.s, z18.b, z2.b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z27.s, z18.b, z3.b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z28.s, z18.b, z4.b[2]\n" "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" @@ -6493,12 +7821,10 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z25.s, z16.b, z1.b[0]\n" "sdot z26.s, z16.b, z2.b[0]\n" "sdot z27.s, z16.b, z3.b[0]\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" "sdot z28.s, z16.b, z4.b[0]\n" "sdot z29.s, z16.b, z5.b[0]\n" "sdot z30.s, z16.b, z6.b[0]\n" "sdot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "sdot z24.s, z17.b, z0.b[1]\n" "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" @@ -6507,7 +7833,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "sdot z24.s, z18.b, z0.b[2]\n" "sdot z25.s, z18.b, z1.b[2]\n" "sdot z26.s, z18.b, z2.b[2]\n" @@ -6516,7 +7841,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" "sdot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "sdot z24.s, z19.b, z0.b[3]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n" "sdot z25.s, z19.b, z1.b[3]\n" @@ -6534,7 +7858,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n" "sdot z24.s, z20.b, z0.b[0]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "sdot z25.s, z20.b, z1.b[0]\n" "sdot z26.s, z20.b, z2.b[0]\n" "sdot z27.s, z20.b, z3.b[0]\n" @@ -6542,7 +7865,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z20.b, z5.b[0]\n" "sdot z30.s, z20.b, z6.b[0]\n" "sdot z31.s, z20.b, z7.b[0]\n" - 
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "sdot z24.s, z21.b, z0.b[1]\n" "sdot z25.s, z21.b, z1.b[1]\n" "sdot z26.s, z21.b, z2.b[1]\n" @@ -6551,7 +7873,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z21.b, z5.b[1]\n" "sdot z30.s, z21.b, z6.b[1]\n" "sdot z31.s, z21.b, z7.b[1]\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "sdot z24.s, z22.b, z0.b[2]\n" "sdot z25.s, z22.b, z1.b[2]\n" "sdot z26.s, z22.b, z2.b[2]\n" @@ -6560,53 +7881,35 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z22.b, z5.b[2]\n" "sdot z30.s, z22.b, z6.b[2]\n" "sdot z31.s, z22.b, z7.b[2]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "b.ne 4b\n" - "3:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" + "b 5f\n" + "2:\n" "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "sdot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "sdot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z26.s, z16.b, z2.b[0]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" "sdot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr3, c_ptr3, #1\n" - "sdot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" "sdot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" "sdot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr6, c_ptr6, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" "sdot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" "sdot z27.s, z17.b, z3.b[1]\n" "sdot z28.s, z17.b, z4.b[1]\n" @@ -6748,23 +8051,16 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z22.b, z5.b[2]\n" "sdot z30.s, z22.b, z6.b[2]\n" "sdot z31.s, z22.b, z7.b[2]\n" - "2:\n" + "5:\n" "st1w z24.s, p0, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -6844,54 +8140,269 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "ptrue p7.b\n" + "whilelt p6.b, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm 
PLDL1KEEP, [a_ptr7, #0x40]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z25.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "mov z29.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "mov z30.s, #0\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "mov z31.s, #0\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "sdot z26.s, z17.b, z2.b[1]\n" + "sdot z27.s, z17.b, z3.b[1]\n" + "sdot z28.s, z17.b, z4.b[1]\n" + "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z31.s, z17.b, z7.b[1]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "sdot z24.s, z18.b, z0.b[2]\n" + "sdot z25.s, z18.b, z1.b[2]\n" + "sdot z26.s, z18.b, z2.b[2]\n" + "sdot z27.s, z18.b, z3.b[2]\n" + "sdot z28.s, z18.b, z4.b[2]\n" + "sdot z29.s, z18.b, z5.b[2]\n" + "sdot z30.s, z18.b, z6.b[2]\n" + "sdot z31.s, z18.b, z7.b[2]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "sdot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" + "sdot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n" + "sdot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n" + "sdot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n" + "sdot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n" + "sdot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n" + "sdot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n" + "sdot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" + "sdot z24.s, z20.b, z0.b[0]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "sdot z25.s, z20.b, z1.b[0]\n" + "sdot z26.s, z20.b, z2.b[0]\n" + "sdot z27.s, z20.b, z3.b[0]\n" + "sdot z28.s, z20.b, z4.b[0]\n" + "sdot z29.s, z20.b, z5.b[0]\n" + "sdot z30.s, z20.b, z6.b[0]\n" + "sdot z31.s, z20.b, z7.b[0]\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "sdot z24.s, z21.b, z0.b[1]\n" + "sdot z25.s, z21.b, z1.b[1]\n" + "sdot z26.s, z21.b, z2.b[1]\n" + "sdot z27.s, z21.b, z3.b[1]\n" + "sdot z28.s, z21.b, z4.b[1]\n" + "sdot z29.s, z21.b, z5.b[1]\n" + "sdot z30.s, z21.b, z6.b[1]\n" + "sdot z31.s, z21.b, z7.b[1]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "sdot z24.s, z22.b, z0.b[2]\n" + "sdot z25.s, z22.b, z1.b[2]\n" + "sdot z26.s, z22.b, z2.b[2]\n" + "sdot z27.s, z22.b, z3.b[2]\n" + "sdot z28.s, z22.b, z4.b[2]\n" + "sdot z29.s, z22.b, z5.b[2]\n" + "sdot z30.s, z22.b, z6.b[2]\n" + 
"sdot z31.s, z22.b, z7.b[2]\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "sdot z24.s, z23.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n" + "sdot z25.s, z23.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n" + "sdot z26.s, z23.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n" + "sdot z27.s, z23.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n" + "sdot z28.s, z23.b, z4.b[3]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n" + "sdot z29.s, z23.b, z5.b[3]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n" + "sdot z30.s, z23.b, z6.b[3]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n" + "sdot z31.s, z23.b, z7.b[3]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" + "sdot z28.s, z16.b, z4.b[0]\n" + "sdot z29.s, z16.b, z5.b[0]\n" + "sdot z30.s, z16.b, z6.b[0]\n" + "sdot z31.s, z16.b, z7.b[0]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" + "sdot z26.s, z17.b, z2.b[1]\n" + "sdot z27.s, z17.b, z3.b[1]\n" + "sdot z28.s, z17.b, z4.b[1]\n" + "sdot z29.s, z17.b, z5.b[1]\n" + "sdot z30.s, z17.b, z6.b[1]\n" + "sdot z31.s, z17.b, z7.b[1]\n" + "sdot z24.s, z18.b, z0.b[2]\n" + "sdot z25.s, z18.b, z1.b[2]\n" + "sdot z26.s, z18.b, z2.b[2]\n" + "sdot z27.s, z18.b, z3.b[2]\n" + "sdot z28.s, z18.b, z4.b[2]\n" + "sdot z29.s, z18.b, z5.b[2]\n" + "sdot z30.s, z18.b, z6.b[2]\n" + "sdot z31.s, z18.b, z7.b[2]\n" + "sdot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n" + "sdot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n" + "sdot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n" + "sdot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n" + "sdot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n" + "sdot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n" + "sdot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n" + "sdot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n" + "sdot z24.s, z20.b, z0.b[0]\n" + "sdot z25.s, z20.b, z1.b[0]\n" + "sdot z26.s, z20.b, z2.b[0]\n" + "sdot z27.s, z20.b, z3.b[0]\n" + "sdot z28.s, z20.b, z4.b[0]\n" + "sdot z29.s, z20.b, z5.b[0]\n" + "sdot z30.s, z20.b, z6.b[0]\n" + "sdot z31.s, z20.b, z7.b[0]\n" + "sdot z24.s, z21.b, z0.b[1]\n" + "sdot z25.s, z21.b, z1.b[1]\n" + "sdot z26.s, z21.b, z2.b[1]\n" + "sdot z27.s, z21.b, z3.b[1]\n" + "sdot z28.s, z21.b, z4.b[1]\n" + "sdot z29.s, z21.b, z5.b[1]\n" + "sdot z30.s, z21.b, z6.b[1]\n" + "sdot z31.s, z21.b, z7.b[1]\n" + "sdot z24.s, z22.b, z0.b[2]\n" + "sdot z25.s, z22.b, z1.b[2]\n" + "sdot z26.s, z22.b, z2.b[2]\n" + "sdot z27.s, z22.b, z3.b[2]\n" + "sdot z28.s, z22.b, z4.b[2]\n" + "sdot z29.s, z22.b, z5.b[2]\n" + "sdot z30.s, z22.b, z6.b[2]\n" + "sdot z31.s, z22.b, z7.b[2]\n" + "sdot z24.s, z23.b, z0.b[3]\n" + "sdot z25.s, z23.b, z1.b[3]\n" + "sdot z26.s, z23.b, z2.b[3]\n" + "sdot z27.s, z23.b, z3.b[3]\n" + "sdot z28.s, z23.b, z4.b[3]\n" + "sdot z29.s, z23.b, z5.b[3]\n" + "sdot z30.s, z23.b, z6.b[3]\n" + "sdot z31.s, z23.b, z7.b[3]\n" + "b.eq 3f\n" + "4:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" "mov z24.s, #0\n" - "ptrue p7.b\n" - "mov z25.s, #0\n" - "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z28.s, #0\n" + "addvl %[c_ptr0], 
%[c_ptr0], #1\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z29.s, #0\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z30.s, #0\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z31.s, #0\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "whilelt p0.s, %[temp], %[last_width]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" - "sdot z24.s, z16.b, z0.b[0]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "sdot z25.s, z16.b, z1.b[0]\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "sdot z26.s, z16.b, z2.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" "sdot z27.s, z16.b, z3.b[0]\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" "sdot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" "sdot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "sdot z25.s, z17.b, z1.b[1]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z27.s, z17.b, z3.b[1]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z28.s, z17.b, z4.b[1]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" @@ -7039,83 +8550,72 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z23.b, z5.b[3]\n" "sdot z30.s, z23.b, z6.b[3]\n" "sdot z31.s, z23.b, z7.b[3]\n" - "cbz %[loops], 2f\n" + "b.ne 4b\n" + "3:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "mov z24.s, #0\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "b.eq 3f\n" - "4:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z24.s, #0\n" 
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "addvl %[b_ptr0], %[b_ptr0], #8\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "mov z25.s, #0\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "sdot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" - "mov z26.s, #0\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "addvl c_ptr1, c_ptr1, #1\n" "sdot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "sdot z26.s, z16.b, z2.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" "sdot z27.s, z16.b, z3.b[0]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "sdot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr2, c_ptr2, #1\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" "sdot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" "sdot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" + "addvl c_ptr7, c_ptr7, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" "sdot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr6, c_ptr6, #1\n" "sdot z27.s, z17.b, z3.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" "sdot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "sdot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "sdot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "sdot z31.s, z17.b, z7.b[1]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "sdot z24.s, z18.b, z0.b[2]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "sdot z25.s, z18.b, z1.b[2]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "sdot z26.s, z18.b, z2.b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "sdot z27.s, z18.b, z3.b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "sdot z28.s, z18.b, z4.b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" "sdot z31.s, z18.b, z7.b[2]\n" @@ -7190,7 +8690,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z16.b, z5.b[0]\n" "sdot z30.s, z16.b, z6.b[0]\n" "sdot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "sdot z24.s, z17.b, z0.b[1]\n" "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" @@ -7199,7 +8698,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z17.b, z5.b[1]\n" "sdot z30.s, z17.b, z6.b[1]\n" "sdot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "sdot z24.s, z18.b, z0.b[2]\n" "sdot z25.s, z18.b, z1.b[2]\n" "sdot z26.s, z18.b, z2.b[2]\n" @@ -7208,7 +8706,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z18.b, z5.b[2]\n" "sdot z30.s, z18.b, z6.b[2]\n" "sdot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "sdot z24.s, z19.b, 
z0.b[3]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n" "sdot z25.s, z19.b, z1.b[3]\n" @@ -7226,7 +8723,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n" "sdot z24.s, z20.b, z0.b[0]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "sdot z25.s, z20.b, z1.b[0]\n" "sdot z26.s, z20.b, z2.b[0]\n" "sdot z27.s, z20.b, z3.b[0]\n" @@ -7234,7 +8730,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z20.b, z5.b[0]\n" "sdot z30.s, z20.b, z6.b[0]\n" "sdot z31.s, z20.b, z7.b[0]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "sdot z24.s, z21.b, z0.b[1]\n" "sdot z25.s, z21.b, z1.b[1]\n" "sdot z26.s, z21.b, z2.b[1]\n" @@ -7243,7 +8738,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z21.b, z5.b[1]\n" "sdot z30.s, z21.b, z6.b[1]\n" "sdot z31.s, z21.b, z7.b[1]\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "sdot z24.s, z22.b, z0.b[2]\n" "sdot z25.s, z22.b, z1.b[2]\n" "sdot z26.s, z22.b, z2.b[2]\n" @@ -7252,7 +8746,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z22.b, z5.b[2]\n" "sdot z30.s, z22.b, z6.b[2]\n" "sdot z31.s, z22.b, z7.b[2]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "sdot z24.s, z23.b, z0.b[3]\n" "sdot z25.s, z23.b, z1.b[3]\n" "sdot z26.s, z23.b, z2.b[3]\n" @@ -7261,54 +8754,36 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B "sdot z29.s, z23.b, z5.b[3]\n" "sdot z30.s, z23.b, z6.b[3]\n" "sdot z31.s, z23.b, z7.b[3]\n" - "b.ne 4b\n" - "3:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" + "b 5f\n" + "2:\n" "mov z24.s, #0\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "sdot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" "mov z26.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "sdot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" "mov z27.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "sdot z26.s, z16.b, z2.b[0]\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "sdot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "sdot z27.s, z16.b, z3.b[0]\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "sdot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" + "sdot z24.s, z16.b, z0.b[0]\n" + "sdot z25.s, z16.b, z1.b[0]\n" + "sdot z26.s, z16.b, z2.b[0]\n" + "sdot z27.s, z16.b, z3.b[0]\n" "sdot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" "sdot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" "sdot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" "sdot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "sdot z24.s, z17.b, z0.b[1]\n" + "sdot z25.s, z17.b, z1.b[1]\n" "sdot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" "sdot z27.s, z17.b, z3.b[1]\n" "sdot z28.s, z17.b, z4.b[1]\n" "sdot z29.s, z17.b, z5.b[1]\n" @@ -7458,23 +8933,16 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t 
*A, int lda, const int8_t *B "sdot z29.s, z23.b, z5.b[3]\n" "sdot z30.s, z23.b, z6.b[3]\n" "sdot z31.s, z23.b, z7.b[3]\n" - "2:\n" + "5:\n" "st1w z24.s, p0, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp similarity index 91% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp index 70a0b12130..60184be043 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp @@ -31,9 +31,9 @@ namespace arm_gemm { // Actual kernel implementations -void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); +void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool); -class smallK_hybrid_u8u32_dot_1VLx8 +class cls_sve_smallK_hybrid_u8u32_dot_8x1VL { public: typedef uint8_t operand_type; @@ -75,9 +75,9 @@ class smallK_hybrid_u8u32_dot_1VLx8 StdTransformsSVE transforms = {}; // Default to the generic kernel - kern_type kernel=sve_smallK_hybrid_u8u32_dot_1VLx8; + kern_type kernel=sve_smallK_hybrid_u8u32_dot_8x1VL; - smallK_hybrid_u8u32_dot_1VLx8(const CPUInfo *) + cls_sve_smallK_hybrid_u8u32_dot_8x1VL(const CPUInfo *) { } diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp similarity index 82% rename from src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp rename to src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp index 1d0b84e788..b980d9b5c2 100644 --- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp +++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -33,7 +33,7 @@ namespace arm_gemm { -void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) { +void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *bias, Activation act, bool) { const long loops_count = iceildiv(N, (int)get_vector_length()) - 1; const long ldab = lda * sizeof(uint8_t); const long ldcb = ldc * sizeof(uint32_t); @@ -112,55 +112,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" - "mov z24.s, #0\n" "ptrue p7.b\n" - "mov z25.s, #0\n" "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "ld1rqb z1.b, p6/z, [a_ptr1]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "ld1rqb z2.b, p6/z, [a_ptr2]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1rqb z3.b, p6/z, [a_ptr3]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "ld1rqb z4.b, p6/z, [a_ptr4]\n" + "addvl %[b_ptr0], %[b_ptr0], #1\n" + "ld1rqb z5.b, p6/z, [a_ptr5]\n" + "ld1rqb z6.b, p6/z, [a_ptr6]\n" + "ld1rqb z7.b, p6/z, [a_ptr7]\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.s, #0\n" + "mov z26.s, #0\n" "mov z27.s, #0\n" - "whilelt p0.s, %[temp], %[last_width]\n" "mov z28.s, #0\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n" "mov z29.s, #0\n" - "ld1rqb z1.b, p6/z, [a_ptr1]\n" "mov z30.s, #0\n" - "ld1rqb z2.b, p6/z, [a_ptr2]\n" "mov z31.s, #0\n" - "ld1rqb z3.b, p6/z, [a_ptr3]\n" "udot z24.s, z16.b, z0.b[0]\n" - "ld1rqb z4.b, p6/z, [a_ptr4]\n" "udot z25.s, z16.b, z1.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr5]\n" "udot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z6.b, p6/z, [a_ptr6]\n" "udot z27.s, z16.b, z3.b[0]\n" - "ld1rqb z7.b, p6/z, [a_ptr7]\n" "udot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "udot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "udot z30.s, z16.b, z6.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "udot z31.s, z16.b, z7.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" - "addvl %[b_ptr0], %[b_ptr0], #1\n" - "cbz %[loops], 2f\n" - "subs %[loops], %[loops], #0x1\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #1\n" "b.eq 3f\n" "4:\n" "st1w z24.s, p7, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "mov z24.s, #0\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "addvl %[b_ptr0], %[b_ptr0], #1\n" "udot z24.s, z16.b, z0.b[0]\n" "st1w z26.s, p7, [c_ptr2]\n" "mov z26.s, #0\n" @@ -186,10 +185,9 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "mov z31.s, #0\n" "addvl c_ptr7, c_ptr7, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "udot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #1\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "prfm PSTL1KEEP, [c_ptr4, 
#0x40]\n" @@ -201,6 +199,8 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "st1w z24.s, p7, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "mov z24.s, #0\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "addvl %[b_ptr0], %[b_ptr0], #1\n" "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" @@ -230,23 +230,34 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "addvl c_ptr7, c_ptr7, #1\n" "udot z30.s, z16.b, z6.b[0]\n" "udot z31.s, z16.b, z7.b[0]\n" + "b 5f\n" "2:\n" + "mov z24.s, #0\n" + "mov z25.s, #0\n" + "mov z26.s, #0\n" + "mov z27.s, #0\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "udot z24.s, z16.b, z0.b[0]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" + "udot z30.s, z16.b, z6.b[0]\n" + "udot z31.s, z16.b, z7.b[0]\n" + "5:\n" "st1w z24.s, p0, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -325,112 +336,112 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" - "mov z24.s, #0\n" "ptrue p7.b\n" - "mov z25.s, #0\n" "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "ld1rqb z1.b, p6/z, [a_ptr1]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "ld1rqb z2.b, p6/z, [a_ptr2]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1rqb z3.b, p6/z, [a_ptr3]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "ld1rqb z4.b, p6/z, [a_ptr4]\n" + "ld1rqb z5.b, p6/z, [a_ptr5]\n" + "ld1rqb z6.b, p6/z, [a_ptr6]\n" + "ld1rqb z7.b, p6/z, [a_ptr7]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #2\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.s, #0\n" + "mov z26.s, #0\n" + "mov z27.s, #0\n" "mov z28.s, #0\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n" "mov z29.s, #0\n" - "ld1rqb z1.b, p6/z, [a_ptr1]\n" "mov z30.s, #0\n" - "ld1rqb z2.b, p6/z, [a_ptr2]\n" "mov z31.s, #0\n" - "ld1rqb z3.b, p6/z, [a_ptr3]\n" "udot z24.s, z16.b, z0.b[0]\n" - "ld1rqb z4.b, p6/z, [a_ptr4]\n" "udot z25.s, z16.b, z1.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr5]\n" "udot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z6.b, p6/z, [a_ptr6]\n" "udot z27.s, z16.b, z3.b[0]\n" - "ld1rqb z7.b, p6/z, [a_ptr7]\n" "udot z28.s, z16.b, z4.b[0]\n" - "whilelt p0.s, %[temp], %[last_width]\n" "udot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "udot z30.s, z16.b, z6.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "udot z31.s, z16.b, z7.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "udot z24.s, z17.b, z0.b[1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "udot z25.s, z17.b, z1.b[1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "udot z26.s, 
z17.b, z2.b[1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "udot z27.s, z17.b, z3.b[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #2\n" "udot z28.s, z17.b, z4.b[1]\n" "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" - "cbz %[loops], 2f\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" "b.eq 3f\n" "4:\n" "st1w z24.s, p7, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "mov z24.s, #0\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #2\n" - "st1w z25.s, p7, [c_ptr1]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" - "mov z25.s, #0\n" + "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" + "addvl %[b_ptr0], %[b_ptr0], #2\n" "udot z24.s, z16.b, z0.b[0]\n" "st1w z26.s, p7, [c_ptr2]\n" "mov z26.s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "addvl c_ptr2, c_ptr2, #1\n" "udot z25.s, z16.b, z1.b[0]\n" "st1w z27.s, p7, [c_ptr3]\n" "mov z27.s, #0\n" - "addvl c_ptr2, c_ptr2, #1\n" + "addvl c_ptr3, c_ptr3, #1\n" "udot z26.s, z16.b, z2.b[0]\n" "st1w z28.s, p7, [c_ptr4]\n" "mov z28.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" + "addvl c_ptr4, c_ptr4, #1\n" "udot z27.s, z16.b, z3.b[0]\n" "st1w z29.s, p7, [c_ptr5]\n" "mov z29.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" + "addvl c_ptr5, c_ptr5, #1\n" "udot z28.s, z16.b, z4.b[0]\n" "st1w z30.s, p7, [c_ptr6]\n" "mov z30.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" + "addvl c_ptr6, c_ptr6, #1\n" "udot z29.s, z16.b, z5.b[0]\n" "st1w z31.s, p7, [c_ptr7]\n" "mov z31.s, #0\n" - "addvl c_ptr6, c_ptr6, #1\n" - "udot z30.s, z16.b, z6.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" + "udot z30.s, z16.b, z6.b[0]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "udot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "udot z24.s, z17.b, z0.b[1]\n" "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" - "udot z25.s, z17.b, z1.b[1]\n" + "udot z24.s, z17.b, z0.b[1]\n" "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" - "udot z26.s, z17.b, z2.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" - "udot z27.s, z17.b, z3.b[1]\n" + "udot z26.s, z17.b, z2.b[1]\n" "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" - "udot z28.s, z17.b, z4.b[1]\n" + "udot z27.s, z17.b, z3.b[1]\n" "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" - "udot z29.s, z17.b, z5.b[1]\n" + "udot z28.s, z17.b, z4.b[1]\n" "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" - "udot z30.s, z17.b, z6.b[1]\n" + "udot z29.s, z17.b, z5.b[1]\n" "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" + "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" "b.ne 4b\n" "3:\n" "st1w z24.s, p7, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "mov z24.s, #0\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "addvl %[b_ptr0], %[b_ptr0], #2\n" "st1w z25.s, p7, [c_ptr1]\n" @@ -470,23 +481,42 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" + "b 5f\n" "2:\n" + "mov z24.s, #0\n" + "mov z25.s, #0\n" + "mov z26.s, #0\n" + "mov z27.s, #0\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "udot z24.s, z16.b, z0.b[0]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" + "udot z30.s, z16.b, z6.b[0]\n" + "udot z31.s, z16.b, z7.b[0]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "udot z26.s, z17.b, z2.b[1]\n" + "udot z27.s, z17.b, z3.b[1]\n" 
+ "udot z28.s, z17.b, z4.b[1]\n" + "udot z29.s, z17.b, z5.b[1]\n" + "udot z30.s, z17.b, z6.b[1]\n" + "udot z31.s, z17.b, z7.b[1]\n" + "5:\n" "st1w z24.s, p0, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -565,48 +595,50 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" - "mov z24.s, #0\n" "ptrue p7.b\n" - "mov z25.s, #0\n" "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "ld1rqb z1.b, p6/z, [a_ptr1]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "ld1rqb z2.b, p6/z, [a_ptr2]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1rqb z3.b, p6/z, [a_ptr3]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "ld1rqb z4.b, p6/z, [a_ptr4]\n" + "ld1rqb z5.b, p6/z, [a_ptr5]\n" + "ld1rqb z6.b, p6/z, [a_ptr6]\n" + "ld1rqb z7.b, p6/z, [a_ptr7]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.s, #0\n" + "mov z26.s, #0\n" + "mov z27.s, #0\n" "mov z28.s, #0\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n" "mov z29.s, #0\n" - "ld1rqb z1.b, p6/z, [a_ptr1]\n" "mov z30.s, #0\n" - "ld1rqb z2.b, p6/z, [a_ptr2]\n" "mov z31.s, #0\n" - "ld1rqb z3.b, p6/z, [a_ptr3]\n" "udot z24.s, z16.b, z0.b[0]\n" - "ld1rqb z4.b, p6/z, [a_ptr4]\n" "udot z25.s, z16.b, z1.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr5]\n" "udot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z6.b, p6/z, [a_ptr6]\n" "udot z27.s, z16.b, z3.b[0]\n" - "ld1rqb z7.b, p6/z, [a_ptr7]\n" "udot z28.s, z16.b, z4.b[0]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "udot z29.s, z16.b, z5.b[0]\n" - "whilelt p0.s, %[temp], %[last_width]\n" "udot z30.s, z16.b, z6.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "udot z31.s, z16.b, z7.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "udot z24.s, z17.b, z0.b[1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "udot z25.s, z17.b, z1.b[1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "udot z26.s, z17.b, z2.b[1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "udot z27.s, z17.b, z3.b[1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "udot z28.s, z17.b, z4.b[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #3\n" "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" @@ -618,49 +650,46 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" "udot z31.s, z18.b, z7.b[2]\n" - "cbz %[loops], 2f\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "b.eq 3f\n" "4:\n" "st1w z24.s, p7, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "mov z24.s, #0\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], 
%[b_ptr0], #3\n" - "st1w z25.s, p7, [c_ptr1]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" - "mov z25.s, #0\n" + "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "udot z24.s, z16.b, z0.b[0]\n" "st1w z26.s, p7, [c_ptr2]\n" "mov z26.s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "addvl c_ptr2, c_ptr2, #1\n" "udot z25.s, z16.b, z1.b[0]\n" "st1w z27.s, p7, [c_ptr3]\n" "mov z27.s, #0\n" - "addvl c_ptr2, c_ptr2, #1\n" + "addvl c_ptr3, c_ptr3, #1\n" "udot z26.s, z16.b, z2.b[0]\n" "st1w z28.s, p7, [c_ptr4]\n" "mov z28.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" + "addvl c_ptr4, c_ptr4, #1\n" "udot z27.s, z16.b, z3.b[0]\n" "st1w z29.s, p7, [c_ptr5]\n" "mov z29.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" + "addvl c_ptr5, c_ptr5, #1\n" "udot z28.s, z16.b, z4.b[0]\n" "st1w z30.s, p7, [c_ptr6]\n" "mov z30.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" + "addvl c_ptr6, c_ptr6, #1\n" "udot z29.s, z16.b, z5.b[0]\n" "st1w z31.s, p7, [c_ptr7]\n" "mov z31.s, #0\n" - "addvl c_ptr6, c_ptr6, #1\n" - "udot z30.s, z16.b, z6.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" + "udot z30.s, z16.b, z6.b[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" "udot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "udot z24.s, z17.b, z0.b[1]\n" "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "udot z25.s, z17.b, z1.b[1]\n" @@ -676,7 +705,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z30.s, z17.b, z6.b[1]\n" "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "udot z24.s, z18.b, z0.b[2]\n" "udot z25.s, z18.b, z1.b[2]\n" "udot z26.s, z18.b, z2.b[2]\n" @@ -690,11 +718,12 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "st1w z24.s, p7, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "mov z24.s, #0\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #3\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "udot z24.s, z16.b, z0.b[0]\n" "st1w z26.s, p7, [c_ptr2]\n" "mov z26.s, #0\n" @@ -720,8 +749,9 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "mov z31.s, #0\n" "addvl c_ptr7, c_ptr7, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "udot z24.s, z17.b, z0.b[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" "udot z31.s, z16.b, z7.b[0]\n" + "udot z24.s, z17.b, z0.b[1]\n" "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" "udot z27.s, z17.b, z3.b[1]\n" @@ -737,23 +767,50 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" "udot z31.s, z18.b, z7.b[2]\n" + "b 5f\n" "2:\n" + "mov z24.s, #0\n" + "mov z25.s, #0\n" + "mov z26.s, #0\n" + "mov z27.s, #0\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "udot z24.s, z16.b, z0.b[0]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" + "udot z30.s, z16.b, z6.b[0]\n" + "udot z31.s, z16.b, z7.b[0]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "udot z26.s, z17.b, z2.b[1]\n" + "udot z27.s, z17.b, z3.b[1]\n" + "udot 
z28.s, z17.b, z4.b[1]\n" + "udot z29.s, z17.b, z5.b[1]\n" + "udot z30.s, z17.b, z6.b[1]\n" + "udot z31.s, z17.b, z7.b[1]\n" + "udot z24.s, z18.b, z0.b[2]\n" + "udot z25.s, z18.b, z1.b[2]\n" + "udot z26.s, z18.b, z2.b[2]\n" + "udot z27.s, z18.b, z3.b[2]\n" + "udot z28.s, z18.b, z4.b[2]\n" + "udot z29.s, z18.b, z5.b[2]\n" + "udot z30.s, z18.b, z6.b[2]\n" + "udot z31.s, z18.b, z7.b[2]\n" + "5:\n" "st1w z24.s, p0, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -832,50 +889,52 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" - "mov z24.s, #0\n" "ptrue p7.b\n" - "mov z25.s, #0\n" "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "ld1rqb z1.b, p6/z, [a_ptr1]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "ld1rqb z2.b, p6/z, [a_ptr2]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1rqb z3.b, p6/z, [a_ptr3]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "ld1rqb z4.b, p6/z, [a_ptr4]\n" + "ld1rqb z5.b, p6/z, [a_ptr5]\n" + "ld1rqb z6.b, p6/z, [a_ptr6]\n" + "ld1rqb z7.b, p6/z, [a_ptr7]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" + "subs %[loops], %[loops], #0x1\n" + "mov z25.s, #0\n" + "mov z26.s, #0\n" + "mov z27.s, #0\n" "mov z28.s, #0\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n" "mov z29.s, #0\n" - "ld1rqb z1.b, p6/z, [a_ptr1]\n" "mov z30.s, #0\n" - "ld1rqb z2.b, p6/z, [a_ptr2]\n" "mov z31.s, #0\n" - "ld1rqb z3.b, p6/z, [a_ptr3]\n" "udot z24.s, z16.b, z0.b[0]\n" - "ld1rqb z4.b, p6/z, [a_ptr4]\n" "udot z25.s, z16.b, z1.b[0]\n" - "ld1rqb z5.b, p6/z, [a_ptr5]\n" "udot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z6.b, p6/z, [a_ptr6]\n" "udot z27.s, z16.b, z3.b[0]\n" - "ld1rqb z7.b, p6/z, [a_ptr7]\n" "udot z28.s, z16.b, z4.b[0]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "udot z29.s, z16.b, z5.b[0]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "udot z30.s, z16.b, z6.b[0]\n" - "whilelt p0.s, %[temp], %[last_width]\n" "udot z31.s, z16.b, z7.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "udot z24.s, z17.b, z0.b[1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "udot z25.s, z17.b, z1.b[1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "udot z26.s, z17.b, z2.b[1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "udot z27.s, z17.b, z3.b[1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "udot z29.s, z17.b, z5.b[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" "udot z24.s, z18.b, z0.b[2]\n" @@ -894,50 +953,47 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z19.b, 
z5.b[3]\n" "udot z30.s, z19.b, z6.b[3]\n" "udot z31.s, z19.b, z7.b[3]\n" - "cbz %[loops], 2f\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "b.eq 3f\n" "4:\n" "st1w z24.s, p7, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "mov z24.s, #0\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" - "st1w z25.s, p7, [c_ptr1]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" - "mov z25.s, #0\n" + "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "udot z24.s, z16.b, z0.b[0]\n" "st1w z26.s, p7, [c_ptr2]\n" "mov z26.s, #0\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "addvl c_ptr2, c_ptr2, #1\n" "udot z25.s, z16.b, z1.b[0]\n" "st1w z27.s, p7, [c_ptr3]\n" "mov z27.s, #0\n" - "addvl c_ptr2, c_ptr2, #1\n" + "addvl c_ptr3, c_ptr3, #1\n" "udot z26.s, z16.b, z2.b[0]\n" "st1w z28.s, p7, [c_ptr4]\n" "mov z28.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" + "addvl c_ptr4, c_ptr4, #1\n" "udot z27.s, z16.b, z3.b[0]\n" "st1w z29.s, p7, [c_ptr5]\n" "mov z29.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" + "addvl c_ptr5, c_ptr5, #1\n" "udot z28.s, z16.b, z4.b[0]\n" "st1w z30.s, p7, [c_ptr6]\n" "mov z30.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" + "addvl c_ptr6, c_ptr6, #1\n" "udot z29.s, z16.b, z5.b[0]\n" "st1w z31.s, p7, [c_ptr7]\n" "mov z31.s, #0\n" - "addvl c_ptr6, c_ptr6, #1\n" - "udot z30.s, z16.b, z6.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" + "udot z30.s, z16.b, z6.b[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" "udot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "udot z24.s, z17.b, z0.b[1]\n" "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "udot z25.s, z17.b, z1.b[1]\n" @@ -953,7 +1009,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z30.s, z17.b, z6.b[1]\n" "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "udot z24.s, z18.b, z0.b[2]\n" "udot z25.s, z18.b, z1.b[2]\n" "udot z26.s, z18.b, z2.b[2]\n" @@ -962,7 +1017,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" "udot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "udot z24.s, z19.b, z0.b[3]\n" "udot z25.s, z19.b, z1.b[3]\n" "udot z26.s, z19.b, z2.b[3]\n" @@ -976,14 +1030,16 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "st1w z24.s, p7, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "mov z24.s, #0\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "udot z24.s, z16.b, z0.b[0]\n" "st1w z26.s, p7, [c_ptr2]\n" "mov z26.s, #0\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "addvl c_ptr2, c_ptr2, #1\n" "udot z25.s, z16.b, z1.b[0]\n" "st1w z27.s, p7, [c_ptr3]\n" @@ -1006,8 +1062,9 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "mov z31.s, #0\n" "addvl c_ptr7, c_ptr7, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "udot z24.s, z17.b, z0.b[1]\n" + "addvl 
%[b_ptr0], %[b_ptr0], #4\n" "udot z31.s, z16.b, z7.b[0]\n" + "udot z24.s, z17.b, z0.b[1]\n" "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" "udot z27.s, z17.b, z3.b[1]\n" @@ -1031,23 +1088,58 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z19.b, z5.b[3]\n" "udot z30.s, z19.b, z6.b[3]\n" "udot z31.s, z19.b, z7.b[3]\n" + "b 5f\n" "2:\n" + "mov z24.s, #0\n" + "mov z25.s, #0\n" + "mov z26.s, #0\n" + "mov z27.s, #0\n" + "mov z28.s, #0\n" + "mov z29.s, #0\n" + "mov z30.s, #0\n" + "mov z31.s, #0\n" + "udot z24.s, z16.b, z0.b[0]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" + "udot z30.s, z16.b, z6.b[0]\n" + "udot z31.s, z16.b, z7.b[0]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "udot z26.s, z17.b, z2.b[1]\n" + "udot z27.s, z17.b, z3.b[1]\n" + "udot z28.s, z17.b, z4.b[1]\n" + "udot z29.s, z17.b, z5.b[1]\n" + "udot z30.s, z17.b, z6.b[1]\n" + "udot z31.s, z17.b, z7.b[1]\n" + "udot z24.s, z18.b, z0.b[2]\n" + "udot z25.s, z18.b, z1.b[2]\n" + "udot z26.s, z18.b, z2.b[2]\n" + "udot z27.s, z18.b, z3.b[2]\n" + "udot z28.s, z18.b, z4.b[2]\n" + "udot z29.s, z18.b, z5.b[2]\n" + "udot z30.s, z18.b, z6.b[2]\n" + "udot z31.s, z18.b, z7.b[2]\n" + "udot z24.s, z19.b, z0.b[3]\n" + "udot z25.s, z19.b, z1.b[3]\n" + "udot z26.s, z19.b, z2.b[3]\n" + "udot z27.s, z19.b, z3.b[3]\n" + "udot z28.s, z19.b, z4.b[3]\n" + "udot z29.s, z19.b, z5.b[3]\n" + "udot z30.s, z19.b, z6.b[3]\n" + "udot z31.s, z19.b, z7.b[3]\n" + "5:\n" "st1w z24.s, p0, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -1126,46 +1218,48 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" - "mov z24.s, #0\n" "ptrue p7.b\n" - "mov z25.s, #0\n" "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z28.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z29.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z30.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z31.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "addvl %[b_ptr0], %[b_ptr0], #5\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z25.s, #0\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "whilelt p0.s, %[temp], %[last_width]\n" + "mov z26.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" - "udot z24.s, z16.b, z0.b[0]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "udot z25.s, z16.b, z1.b[0]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "udot z26.s, z16.b, z2.b[0]\n" + 
"mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" - "udot z27.s, z16.b, z3.b[0]\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" "udot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "udot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "udot z30.s, z16.b, z6.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "udot z31.s, z16.b, z7.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "udot z24.s, z17.b, z0.b[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #5\n" "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" "udot z27.s, z17.b, z3.b[1]\n" @@ -1205,84 +1299,79 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z20.b, z5.b[0]\n" "udot z30.s, z20.b, z6.b[0]\n" "udot z31.s, z20.b, z7.b[0]\n" - "cbz %[loops], 2f\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "b.eq 3f\n" "4:\n" "st1w z24.s, p7, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "mov z24.s, #0\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "addvl %[b_ptr0], %[b_ptr0], #5\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "mov z25.s, #0\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" "udot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" - "mov z26.s, #0\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "addvl c_ptr1, c_ptr1, #1\n" + "addvl c_ptr4, c_ptr4, #1\n" "udot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "udot z26.s, z16.b, z2.b[0]\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" "udot z27.s, z16.b, z3.b[0]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "udot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr2, c_ptr2, #1\n" - "udot z28.s, z16.b, z4.b[0]\n" "st1w z30.s, p7, [c_ptr6]\n" "mov z30.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" "udot z29.s, z16.b, z5.b[0]\n" "st1w z31.s, p7, [c_ptr7]\n" "mov z31.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" - "udot z30.s, z16.b, z6.b[0]\n" "addvl c_ptr5, c_ptr5, #1\n" + "udot z30.s, z16.b, z6.b[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" "udot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "udot z24.s, z17.b, z0.b[1]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "prfm 
PSTL1KEEP, [c_ptr2, #0x40]\n" "udot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr6, c_ptr6, #1\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z27.s, z17.b, z3.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "udot z24.s, z18.b, z0.b[2]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z25.s, z18.b, z1.b[2]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z26.s, z18.b, z2.b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z27.s, z18.b, z3.b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z28.s, z18.b, z4.b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" "udot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "udot z24.s, z19.b, z0.b[3]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" "udot z25.s, z19.b, z1.b[3]\n" @@ -1300,7 +1389,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n" "udot z24.s, z20.b, z0.b[0]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "udot z25.s, z20.b, z1.b[0]\n" "udot z26.s, z20.b, z2.b[0]\n" "udot z27.s, z20.b, z3.b[0]\n" @@ -1313,39 +1401,41 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "st1w z24.s, p7, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "mov z24.s, #0\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #5\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "udot z24.s, z16.b, z0.b[0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "addvl %[b_ptr0], %[b_ptr0], #5\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "st1w z28.s, p7, [c_ptr4]\n" + "mov z28.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "addvl c_ptr2, c_ptr2, #1\n" + "addvl c_ptr4, c_ptr4, #1\n" "udot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "udot z26.s, z16.b, z2.b[0]\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" "udot z27.s, z16.b, z3.b[0]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "udot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" - "udot z28.s, z16.b, z4.b[0]\n" "st1w z30.s, p7, [c_ptr6]\n" "mov z30.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" "udot z29.s, z16.b, z5.b[0]\n" "st1w z31.s, p7, [c_ptr7]\n" "mov z31.s, #0\n" @@ -1354,6 
+1444,8 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "addvl c_ptr6, c_ptr6, #1\n" "udot z31.s, z16.b, z7.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" "udot z27.s, z17.b, z3.b[1]\n" "udot z28.s, z17.b, z4.b[1]\n" @@ -1392,23 +1484,82 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z20.b, z5.b[0]\n" "udot z30.s, z20.b, z6.b[0]\n" "udot z31.s, z20.b, z7.b[0]\n" + "b 5f\n" "2:\n" + "mov z24.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z25.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "mov z29.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "mov z30.s, #0\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "mov z31.s, #0\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" + "udot z30.s, z16.b, z6.b[0]\n" + "udot z31.s, z16.b, z7.b[0]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "udot z26.s, z17.b, z2.b[1]\n" + "udot z27.s, z17.b, z3.b[1]\n" + "udot z28.s, z17.b, z4.b[1]\n" + "udot z29.s, z17.b, z5.b[1]\n" + "udot z30.s, z17.b, z6.b[1]\n" + "udot z31.s, z17.b, z7.b[1]\n" + "udot z24.s, z18.b, z0.b[2]\n" + "udot z25.s, z18.b, z1.b[2]\n" + "udot z26.s, z18.b, z2.b[2]\n" + "udot z27.s, z18.b, z3.b[2]\n" + "udot z28.s, z18.b, z4.b[2]\n" + "udot z29.s, z18.b, z5.b[2]\n" + "udot z30.s, z18.b, z6.b[2]\n" + "udot z31.s, z18.b, z7.b[2]\n" + "udot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" + "udot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" + "udot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n" + "udot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n" + "udot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n" + "udot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n" + "udot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n" + "udot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n" + "udot z24.s, z20.b, z0.b[0]\n" + "udot z25.s, z20.b, z1.b[0]\n" + "udot z26.s, z20.b, z2.b[0]\n" + "udot z27.s, z20.b, z3.b[0]\n" + "udot z28.s, z20.b, z4.b[0]\n" + "udot z29.s, z20.b, z5.b[0]\n" + "udot z30.s, z20.b, z6.b[0]\n" + "udot z31.s, z20.b, z7.b[0]\n" + "5:\n" "st1w z24.s, p0, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -1487,48 +1638,50 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" - "mov z24.s, #0\n" "ptrue p7.b\n" - "mov z25.s, #0\n" "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, 
#0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z28.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z29.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z30.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z31.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #6\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "whilelt p0.s, %[temp], %[last_width]\n" + "mov z25.s, #0\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" - "udot z24.s, z16.b, z0.b[0]\n" + "mov z26.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" - "udot z25.s, z16.b, z1.b[0]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "udot z26.s, z16.b, z2.b[0]\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z27.s, z16.b, z3.b[0]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, z0.b[1]\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" "udot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "udot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "udot z30.s, z16.b, z6.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "udot z31.s, z16.b, z7.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "udot z24.s, z17.b, z0.b[1]\n" "udot z25.s, z17.b, z1.b[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" "udot z26.s, z17.b, z2.b[1]\n" "udot z27.s, z17.b, z3.b[1]\n" "udot z28.s, z17.b, z4.b[1]\n" @@ -1575,85 +1728,80 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z21.b, z5.b[1]\n" "udot z30.s, z21.b, z6.b[1]\n" "udot z31.s, z21.b, z7.b[1]\n" - "cbz %[loops], 2f\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "b.eq 3f\n" "4:\n" "st1w z24.s, p7, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "mov z24.s, #0\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" - "st1w z25.s, p7, [c_ptr1]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "udot z24.s, z16.b, z0.b[0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #6\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "st1w z28.s, p7, [c_ptr4]\n" + 
"addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "addvl c_ptr1, c_ptr1, #1\n" "udot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "udot z26.s, z16.b, z2.b[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" "udot z27.s, z16.b, z3.b[0]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "udot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr2, c_ptr2, #1\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" "udot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" - "udot z29.s, z16.b, z5.b[0]\n" "st1w z31.s, p7, [c_ptr7]\n" "mov z31.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" - "udot z30.s, z16.b, z6.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr5, c_ptr5, #1\n" + "udot z30.s, z16.b, z6.b[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" "udot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "udot z24.s, z17.b, z0.b[1]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr6, c_ptr6, #1\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z27.s, z17.b, z3.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "udot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "udot z24.s, z18.b, z0.b[2]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z25.s, z18.b, z1.b[2]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z26.s, z18.b, z2.b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z27.s, z18.b, z3.b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z28.s, z18.b, z4.b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" "udot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "udot z24.s, z19.b, z0.b[3]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" "udot z25.s, z19.b, z1.b[3]\n" @@ -1671,7 +1819,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n" "udot z24.s, z20.b, z0.b[0]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "udot z25.s, z20.b, z1.b[0]\n" "udot z26.s, z20.b, z2.b[0]\n" "udot z27.s, z20.b, z3.b[0]\n" @@ -1679,7 +1826,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z20.b, z5.b[0]\n" "udot z30.s, z20.b, z6.b[0]\n" "udot z31.s, z20.b, z7.b[0]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "udot z24.s, z21.b, z0.b[1]\n" "udot z25.s, z21.b, z1.b[1]\n" "udot z26.s, z21.b, z2.b[1]\n" @@ -1693,47 +1839,52 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "st1w z24.s, p7, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "mov z24.s, #0\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" 
- "addvl %[b_ptr0], %[b_ptr0], #6\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "udot z24.s, z16.b, z0.b[0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "st1w z26.s, p7, [c_ptr2]\n" - "mov z26.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "ld1rqb z3.b, p7/z, [a_ptr3]\n" "addvl c_ptr2, c_ptr2, #1\n" - "udot z25.s, z16.b, z1.b[0]\n" + "mov z26.s, #0\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #6\n" "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" - "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "udot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z24.s, z17.b, z0.b[1]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" "mov z28.s, #0\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z27.s, z16.b, z3.b[0]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "udot z25.s, z17.b, z1.b[1]\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "udot z24.s, z16.b, z0.b[0]\n" "st1w z29.s, p7, [c_ptr5]\n" "mov z29.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" - "udot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "udot z26.s, z16.b, z2.b[0]\n" "st1w z30.s, p7, [c_ptr6]\n" "mov z30.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" - "udot z29.s, z16.b, z5.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "udot z28.s, z16.b, z4.b[0]\n" "st1w z31.s, p7, [c_ptr7]\n" "mov z31.s, #0\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr5, c_ptr5, #1\n" "udot z30.s, z16.b, z6.b[0]\n" "addvl c_ptr6, c_ptr6, #1\n" "udot z31.s, z16.b, z7.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" "udot z27.s, z17.b, z3.b[1]\n" "udot z28.s, z17.b, z4.b[1]\n" @@ -1780,23 +1931,90 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z21.b, z5.b[1]\n" "udot z30.s, z21.b, z6.b[1]\n" "udot z31.s, z21.b, z7.b[1]\n" + "b 5f\n" "2:\n" + "mov z24.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z25.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "mov z29.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "mov z30.s, #0\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "mov z31.s, #0\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" + "udot z30.s, z16.b, z6.b[0]\n" + "udot z31.s, z16.b, z7.b[0]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "udot z26.s, z17.b, z2.b[1]\n" + "udot z27.s, z17.b, z3.b[1]\n" + "udot z28.s, z17.b, z4.b[1]\n" + "udot z29.s, z17.b, z5.b[1]\n" + "udot z30.s, z17.b, z6.b[1]\n" + "udot z31.s, z17.b, z7.b[1]\n" + "udot z24.s, z18.b, z0.b[2]\n" + "udot z25.s, z18.b, z1.b[2]\n" + "udot z26.s, z18.b, z2.b[2]\n" + "udot z27.s, z18.b, z3.b[2]\n" + "udot z28.s, z18.b, z4.b[2]\n" + "udot z29.s, z18.b, z5.b[2]\n" + "udot 
z30.s, z18.b, z6.b[2]\n" + "udot z31.s, z18.b, z7.b[2]\n" + "udot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" + "udot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" + "udot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n" + "udot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n" + "udot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n" + "udot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n" + "udot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n" + "udot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n" + "udot z24.s, z20.b, z0.b[0]\n" + "udot z25.s, z20.b, z1.b[0]\n" + "udot z26.s, z20.b, z2.b[0]\n" + "udot z27.s, z20.b, z3.b[0]\n" + "udot z28.s, z20.b, z4.b[0]\n" + "udot z29.s, z20.b, z5.b[0]\n" + "udot z30.s, z20.b, z6.b[0]\n" + "udot z31.s, z20.b, z7.b[0]\n" + "udot z24.s, z21.b, z0.b[1]\n" + "udot z25.s, z21.b, z1.b[1]\n" + "udot z26.s, z21.b, z2.b[1]\n" + "udot z27.s, z21.b, z3.b[1]\n" + "udot z28.s, z21.b, z4.b[1]\n" + "udot z29.s, z21.b, z5.b[1]\n" + "udot z30.s, z21.b, z6.b[1]\n" + "udot z31.s, z21.b, z7.b[1]\n" + "5:\n" "st1w z24.s, p0, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -1875,48 +2093,50 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" - "mov z24.s, #0\n" "ptrue p7.b\n" - "mov z25.s, #0\n" "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z28.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z29.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z30.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z31.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "whilelt p0.s, %[temp], %[last_width]\n" + "addvl %[b_ptr0], %[b_ptr0], #7\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "mov z25.s, #0\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" - "udot z24.s, z16.b, z0.b[0]\n" + "mov z26.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" - "udot z25.s, z16.b, z1.b[0]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "udot z26.s, z16.b, z2.b[0]\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z27.s, z16.b, z3.b[0]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, z0.b[1]\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "subs 
%[loops], %[loops], #0x1\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" "udot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "udot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "udot z30.s, z16.b, z6.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "udot z31.s, z16.b, z7.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #7\n" + "udot z24.s, z17.b, z0.b[1]\n" "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" "udot z27.s, z17.b, z3.b[1]\n" @@ -1972,86 +2192,81 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z22.b, z5.b[2]\n" "udot z30.s, z22.b, z6.b[2]\n" "udot z31.s, z22.b, z7.b[2]\n" - "cbz %[loops], 2f\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "b.eq 3f\n" + "4:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" + "mov z24.s, #0\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "b.eq 3f\n" - "4:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z24.s, #0\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "addvl %[b_ptr0], %[b_ptr0], #7\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "mov z25.s, #0\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" "udot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" - "mov z26.s, #0\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "addvl c_ptr1, c_ptr1, #1\n" + "addvl c_ptr5, c_ptr5, #1\n" "udot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "udot z26.s, z16.b, z2.b[0]\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" "udot z27.s, z16.b, z3.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "udot z28.s, z16.b, z4.b[0]\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "udot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr2, c_ptr2, #1\n" - "udot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" "udot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" + "addvl c_ptr6, c_ptr6, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" + "addvl c_ptr7, c_ptr7, #1\n" "udot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr6, c_ptr6, #1\n" + "prfm PSTL1KEEP, [c_ptr5, 
#0x40]\n" "udot z27.s, z17.b, z3.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "udot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "udot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "udot z24.s, z18.b, z0.b[2]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z25.s, z18.b, z1.b[2]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z26.s, z18.b, z2.b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z27.s, z18.b, z3.b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z28.s, z18.b, z4.b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" "udot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "udot z24.s, z19.b, z0.b[3]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" "udot z25.s, z19.b, z1.b[3]\n" @@ -2069,7 +2284,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n" "udot z24.s, z20.b, z0.b[0]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "udot z25.s, z20.b, z1.b[0]\n" "udot z26.s, z20.b, z2.b[0]\n" "udot z27.s, z20.b, z3.b[0]\n" @@ -2077,7 +2291,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z20.b, z5.b[0]\n" "udot z30.s, z20.b, z6.b[0]\n" "udot z31.s, z20.b, z7.b[0]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "udot z24.s, z21.b, z0.b[1]\n" "udot z25.s, z21.b, z1.b[1]\n" "udot z26.s, z21.b, z2.b[1]\n" @@ -2086,7 +2299,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z21.b, z5.b[1]\n" "udot z30.s, z21.b, z6.b[1]\n" "udot z31.s, z21.b, z7.b[1]\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "udot z24.s, z22.b, z0.b[2]\n" "udot z25.s, z22.b, z1.b[2]\n" "udot z26.s, z22.b, z2.b[2]\n" @@ -2100,47 +2312,135 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "st1w z24.s, p7, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "mov z24.s, #0\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #7\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "udot z24.s, z16.b, z0.b[0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "addvl %[b_ptr0], %[b_ptr0], #7\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "mov z29.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "addvl c_ptr2, c_ptr2, #1\n" + "addvl c_ptr5, c_ptr5, #1\n" "udot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "udot 
z26.s, z16.b, z2.b[0]\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" "udot z27.s, z16.b, z3.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "udot z28.s, z16.b, z4.b[0]\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z29.s, z16.b, z5.b[0]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "udot z30.s, z16.b, z6.b[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "udot z31.s, z16.b, z7.b[0]\n" + "udot z24.s, z17.b, z0.b[1]\n" "udot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" + "udot z26.s, z17.b, z2.b[1]\n" + "udot z27.s, z17.b, z3.b[1]\n" + "udot z28.s, z17.b, z4.b[1]\n" + "udot z29.s, z17.b, z5.b[1]\n" + "udot z30.s, z17.b, z6.b[1]\n" + "udot z31.s, z17.b, z7.b[1]\n" + "udot z24.s, z18.b, z0.b[2]\n" + "udot z25.s, z18.b, z1.b[2]\n" + "udot z26.s, z18.b, z2.b[2]\n" + "udot z27.s, z18.b, z3.b[2]\n" + "udot z28.s, z18.b, z4.b[2]\n" + "udot z29.s, z18.b, z5.b[2]\n" + "udot z30.s, z18.b, z6.b[2]\n" + "udot z31.s, z18.b, z7.b[2]\n" + "udot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" + "udot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" + "udot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n" + "udot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n" + "udot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n" + "udot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n" + "udot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n" + "udot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n" + "udot z24.s, z20.b, z0.b[0]\n" + "udot z25.s, z20.b, z1.b[0]\n" + "udot z26.s, z20.b, z2.b[0]\n" + "udot z27.s, z20.b, z3.b[0]\n" + "udot z28.s, z20.b, z4.b[0]\n" + "udot z29.s, z20.b, z5.b[0]\n" + "udot z30.s, z20.b, z6.b[0]\n" + "udot z31.s, z20.b, z7.b[0]\n" + "udot z24.s, z21.b, z0.b[1]\n" + "udot z25.s, z21.b, z1.b[1]\n" + "udot z26.s, z21.b, z2.b[1]\n" + "udot z27.s, z21.b, z3.b[1]\n" + "udot z28.s, z21.b, z4.b[1]\n" + "udot z29.s, z21.b, z5.b[1]\n" + "udot z30.s, z21.b, z6.b[1]\n" + "udot z31.s, z21.b, z7.b[1]\n" + "udot z24.s, z22.b, z0.b[2]\n" + "udot z25.s, z22.b, z1.b[2]\n" + "udot z26.s, z22.b, z2.b[2]\n" + "udot z27.s, z22.b, z3.b[2]\n" + "udot z28.s, z22.b, z4.b[2]\n" + "udot z29.s, z22.b, z5.b[2]\n" + "udot z30.s, z22.b, z6.b[2]\n" + "udot z31.s, z22.b, z7.b[2]\n" + "b 5f\n" + "2:\n" + "mov z24.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z25.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" "mov z29.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" - "udot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" "mov z30.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" - "udot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" "mov z31.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" "udot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" "udot z31.s, z16.b, z7.b[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" 
"udot z27.s, z17.b, z3.b[1]\n" "udot z28.s, z17.b, z4.b[1]\n" @@ -2195,23 +2495,16 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z22.b, z5.b[2]\n" "udot z30.s, z22.b, z6.b[2]\n" "udot z31.s, z22.b, z7.b[2]\n" - "2:\n" + "5:\n" "st1w z24.s, p0, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -2290,49 +2583,51 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" - "mov z24.s, #0\n" "ptrue p7.b\n" - "mov z25.s, #0\n" "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z28.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z29.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z30.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z31.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "whilelt p0.s, %[temp], %[last_width]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "mov z25.s, #0\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" - "udot z24.s, z16.b, z0.b[0]\n" + "mov z26.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" - "udot z25.s, z16.b, z1.b[0]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "udot z26.s, z16.b, z2.b[0]\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z27.s, z16.b, z3.b[0]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, z0.b[1]\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" "udot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "udot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "udot z30.s, z16.b, z6.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" "udot z31.s, z16.b, z7.b[0]\n" + "udot z24.s, z17.b, z0.b[1]\n" "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" "udot z27.s, z17.b, z3.b[1]\n" @@ -2396,87 +2691,82 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z23.b, z5.b[3]\n" "udot z30.s, z23.b, z6.b[3]\n" "udot z31.s, z23.b, z7.b[3]\n" - "cbz %[loops], 2f\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - 
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "b.eq 3f\n" "4:\n" "st1w z24.s, p7, [%[c_ptr0]]\n" "subs %[loops], %[loops], #0x1\n" "mov z24.s, #0\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "st1w z25.s, p7, [c_ptr1]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "udot z24.s, z16.b, z0.b[0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "addvl c_ptr1, c_ptr1, #1\n" - "udot z25.s, z16.b, z1.b[0]\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" - "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "udot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z24.s, z17.b, z0.b[1]\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" "mov z28.s, #0\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z27.s, z16.b, z3.b[0]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "udot z25.s, z17.b, z1.b[1]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" - "addvl c_ptr2, c_ptr2, #1\n" - "udot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "udot z24.s, z16.b, z0.b[0]\n" "st1w z30.s, p7, [c_ptr6]\n" "mov z30.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" - "udot z29.s, z16.b, z5.b[0]\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "udot z26.s, z16.b, z2.b[0]\n" "st1w z31.s, p7, [c_ptr7]\n" "mov z31.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "udot z29.s, z16.b, z5.b[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr6, c_ptr6, #1\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z27.s, z17.b, z3.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "udot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "udot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "udot z24.s, z18.b, z0.b[2]\n" - "prfm PSTL1KEEP, 
[c_ptr3, #0x40]\n" "udot z25.s, z18.b, z1.b[2]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z26.s, z18.b, z2.b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z27.s, z18.b, z3.b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z28.s, z18.b, z4.b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" "udot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "udot z24.s, z19.b, z0.b[3]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" "udot z25.s, z19.b, z1.b[3]\n" @@ -2494,7 +2784,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n" "udot z24.s, z20.b, z0.b[0]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "udot z25.s, z20.b, z1.b[0]\n" "udot z26.s, z20.b, z2.b[0]\n" "udot z27.s, z20.b, z3.b[0]\n" @@ -2502,7 +2791,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z20.b, z5.b[0]\n" "udot z30.s, z20.b, z6.b[0]\n" "udot z31.s, z20.b, z7.b[0]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "udot z24.s, z21.b, z0.b[1]\n" "udot z25.s, z21.b, z1.b[1]\n" "udot z26.s, z21.b, z2.b[1]\n" @@ -2511,7 +2799,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z21.b, z5.b[1]\n" "udot z30.s, z21.b, z6.b[1]\n" "udot z31.s, z21.b, z7.b[1]\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "udot z24.s, z22.b, z0.b[2]\n" "udot z25.s, z22.b, z1.b[2]\n" "udot z26.s, z22.b, z2.b[2]\n" @@ -2520,7 +2807,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z22.b, z5.b[2]\n" "udot z30.s, z22.b, z6.b[2]\n" "udot z31.s, z22.b, z7.b[2]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "udot z24.s, z23.b, z0.b[3]\n" "udot z25.s, z23.b, z1.b[3]\n" "udot z26.s, z23.b, z2.b[3]\n" @@ -2534,47 +2820,144 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "st1w z24.s, p7, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "mov z24.s, #0\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "udot z24.s, z16.b, z0.b[0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "addvl c_ptr2, c_ptr2, #1\n" "udot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "udot z26.s, z16.b, z2.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" "ld1rqb z5.b, p7/z, 
[a_ptr5]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" "udot z27.s, z16.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "udot z28.s, z16.b, z4.b[0]\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "udot z29.s, z16.b, z5.b[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "udot z30.s, z16.b, z6.b[0]\n" + "udot z31.s, z16.b, z7.b[0]\n" "udot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" + "udot z26.s, z17.b, z2.b[1]\n" + "udot z27.s, z17.b, z3.b[1]\n" + "udot z28.s, z17.b, z4.b[1]\n" + "udot z29.s, z17.b, z5.b[1]\n" + "udot z30.s, z17.b, z6.b[1]\n" + "udot z31.s, z17.b, z7.b[1]\n" + "udot z24.s, z18.b, z0.b[2]\n" + "udot z25.s, z18.b, z1.b[2]\n" + "udot z26.s, z18.b, z2.b[2]\n" + "udot z27.s, z18.b, z3.b[2]\n" + "udot z28.s, z18.b, z4.b[2]\n" + "udot z29.s, z18.b, z5.b[2]\n" + "udot z30.s, z18.b, z6.b[2]\n" + "udot z31.s, z18.b, z7.b[2]\n" + "udot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n" + "udot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n" + "udot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n" + "udot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n" + "udot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n" + "udot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n" + "udot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n" + "udot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n" + "udot z24.s, z20.b, z0.b[0]\n" + "udot z25.s, z20.b, z1.b[0]\n" + "udot z26.s, z20.b, z2.b[0]\n" + "udot z27.s, z20.b, z3.b[0]\n" + "udot z28.s, z20.b, z4.b[0]\n" + "udot z29.s, z20.b, z5.b[0]\n" + "udot z30.s, z20.b, z6.b[0]\n" + "udot z31.s, z20.b, z7.b[0]\n" + "udot z24.s, z21.b, z0.b[1]\n" + "udot z25.s, z21.b, z1.b[1]\n" + "udot z26.s, z21.b, z2.b[1]\n" + "udot z27.s, z21.b, z3.b[1]\n" + "udot z28.s, z21.b, z4.b[1]\n" + "udot z29.s, z21.b, z5.b[1]\n" + "udot z30.s, z21.b, z6.b[1]\n" + "udot z31.s, z21.b, z7.b[1]\n" + "udot z24.s, z22.b, z0.b[2]\n" + "udot z25.s, z22.b, z1.b[2]\n" + "udot z26.s, z22.b, z2.b[2]\n" + "udot z27.s, z22.b, z3.b[2]\n" + "udot z28.s, z22.b, z4.b[2]\n" + "udot z29.s, z22.b, z5.b[2]\n" + "udot z30.s, z22.b, z6.b[2]\n" + "udot z31.s, z22.b, z7.b[2]\n" + "udot z24.s, z23.b, z0.b[3]\n" + "udot z25.s, z23.b, z1.b[3]\n" + "udot z26.s, z23.b, z2.b[3]\n" + "udot z27.s, z23.b, z3.b[3]\n" + "udot z28.s, z23.b, z4.b[3]\n" + "udot z29.s, z23.b, z5.b[3]\n" + "udot z30.s, z23.b, z6.b[3]\n" + "udot z31.s, z23.b, z7.b[3]\n" + "b 5f\n" + "2:\n" + "mov z24.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z25.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" "mov z29.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" - "udot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" "mov z30.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" - "udot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" "mov z31.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" "udot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr6, c_ptr6, 
#1\n" "udot z31.s, z16.b, z7.b[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" "udot z27.s, z17.b, z3.b[1]\n" "udot z28.s, z17.b, z4.b[1]\n" @@ -2637,23 +3020,16 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z23.b, z5.b[3]\n" "udot z30.s, z23.b, z6.b[3]\n" "udot z31.s, z23.b, z7.b[3]\n" - "2:\n" + "5:\n" "st1w z24.s, p0, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -2732,54 +3108,56 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" - "mov z24.s, #0\n" "ptrue p7.b\n" - "mov z25.s, #0\n" "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z28.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z29.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z30.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z31.s, #0\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "whilelt p0.s, %[temp], %[last_width]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "mov z25.s, #0\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" - "udot z24.s, z16.b, z0.b[0]\n" + "mov z26.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" - "udot z25.s, z16.b, z1.b[0]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "udot z26.s, z16.b, z2.b[0]\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z27.s, z16.b, z3.b[0]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, z0.b[1]\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" "udot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "udot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "udot z30.s, z16.b, z6.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" "udot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #1\n" "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "udot z27.s, z17.b, z3.b[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #1\n" "udot z28.s, z17.b, z4.b[1]\n" 
"udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" @@ -2856,88 +3234,84 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z16.b, z5.b[0]\n" "udot z30.s, z16.b, z6.b[0]\n" "udot z31.s, z16.b, z7.b[0]\n" - "cbz %[loops], 2f\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" "b.eq 3f\n" "4:\n" "st1w z24.s, p7, [%[c_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "mov z24.s, #0\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" "subs %[loops], %[loops], #0x1\n" - "st1w z25.s, p7, [c_ptr1]\n" + "mov z24.s, #0\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "udot z24.s, z16.b, z0.b[0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" - "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "addvl c_ptr1, c_ptr1, #1\n" - "udot z25.s, z16.b, z1.b[0]\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" - "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, z0.b[1]\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" "mov z28.s, #0\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "udot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "udot z25.s, z17.b, z1.b[1]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" - "udot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "udot z24.s, z16.b, z0.b[0]\n" "st1w z30.s, p7, [c_ptr6]\n" "mov z30.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" - "udot z29.s, z16.b, z5.b[0]\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "udot z26.s, z16.b, z2.b[0]\n" "st1w z31.s, p7, [c_ptr7]\n" "mov z31.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" - "udot z30.s, z16.b, z6.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z17.b, z0.b[1]\n" "addvl c_ptr6, c_ptr6, #1\n" + "udot z29.s, z16.b, z5.b[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "udot z30.s, z16.b, z6.b[0]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #1\n" "udot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z27.s, z17.b, 
z3.b[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #1\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "udot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "udot z24.s, z18.b, z0.b[2]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z25.s, z18.b, z1.b[2]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z26.s, z18.b, z2.b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z27.s, z18.b, z3.b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z28.s, z18.b, z4.b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" "udot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "udot z24.s, z19.b, z0.b[3]\n" "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" "udot z25.s, z19.b, z1.b[3]\n" @@ -2955,7 +3329,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" "udot z24.s, z20.b, z0.b[0]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "udot z25.s, z20.b, z1.b[0]\n" "udot z26.s, z20.b, z2.b[0]\n" "udot z27.s, z20.b, z3.b[0]\n" @@ -2963,7 +3336,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z20.b, z5.b[0]\n" "udot z30.s, z20.b, z6.b[0]\n" "udot z31.s, z20.b, z7.b[0]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "udot z24.s, z21.b, z0.b[1]\n" "udot z25.s, z21.b, z1.b[1]\n" "udot z26.s, z21.b, z2.b[1]\n" @@ -2972,7 +3344,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z21.b, z5.b[1]\n" "udot z30.s, z21.b, z6.b[1]\n" "udot z31.s, z21.b, z7.b[1]\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "udot z24.s, z22.b, z0.b[2]\n" "udot z25.s, z22.b, z1.b[2]\n" "udot z26.s, z22.b, z2.b[2]\n" @@ -2981,7 +3352,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z22.b, z5.b[2]\n" "udot z30.s, z22.b, z6.b[2]\n" "udot z31.s, z22.b, z7.b[2]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "udot z24.s, z23.b, z0.b[3]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" "udot z25.s, z23.b, z1.b[3]\n" @@ -2999,7 +3369,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z23.b, z7.b[3]\n" "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" "udot z24.s, z16.b, z0.b[0]\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" "udot z25.s, z16.b, z1.b[0]\n" "udot z26.s, z16.b, z2.b[0]\n" "udot z27.s, z16.b, z3.b[0]\n" @@ -3007,55 +3376,62 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z16.b, z5.b[0]\n" "udot z30.s, z16.b, z6.b[0]\n" "udot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "b.ne 4b\n" "3:\n" "st1w z24.s, p7, [%[c_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "mov z24.s, #0\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" + "mov z24.s, #0\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "udot z24.s, z16.b, z0.b[0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "st1w z26.s, p7, 
[c_ptr2]\n" - "mov z26.s, #0\n" - "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "ld1rqb z4.b, p7/z, [a_ptr4]\n" "addvl c_ptr2, c_ptr2, #1\n" - "udot z25.s, z16.b, z1.b[0]\n" + "mov z26.s, #0\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" - "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, z0.b[1]\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" "mov z28.s, #0\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "udot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr3, c_ptr3, #1\n" - "udot z25.s, z17.b, z1.b[1]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" - "udot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "udot z24.s, z16.b, z0.b[0]\n" "st1w z30.s, p7, [c_ptr6]\n" "mov z30.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" - "udot z29.s, z16.b, z5.b[0]\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "udot z26.s, z16.b, z2.b[0]\n" "st1w z31.s, p7, [c_ptr7]\n" "mov z31.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z17.b, z0.b[1]\n" "addvl c_ptr6, c_ptr6, #1\n" - "udot z30.s, z16.b, z6.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" + "udot z30.s, z16.b, z6.b[0]\n" "udot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "udot z26.s, z17.b, z2.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" "addvl %[b_ptr0], %[b_ptr0], #1\n" + "udot z26.s, z17.b, z2.b[1]\n" "udot z27.s, z17.b, z3.b[1]\n" "udot z28.s, z17.b, z4.b[1]\n" "udot z29.s, z17.b, z5.b[1]\n" @@ -3133,23 +3509,124 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z16.b, z5.b[0]\n" "udot z30.s, z16.b, z6.b[0]\n" "udot z31.s, z16.b, z7.b[0]\n" + "b 5f\n" "2:\n" + "mov z24.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z25.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "mov z29.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "mov z30.s, #0\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "mov z31.s, #0\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" + "udot z30.s, z16.b, z6.b[0]\n" + "udot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #1\n" + "udot z25.s, z17.b, z1.b[1]\n" + "udot z26.s, z17.b, z2.b[1]\n" + "udot z27.s, z17.b, z3.b[1]\n" + "udot z28.s, z17.b, z4.b[1]\n" + "udot z29.s, z17.b, z5.b[1]\n" + "udot z30.s, z17.b, z6.b[1]\n" + "udot z31.s, z17.b, z7.b[1]\n" + "udot z24.s, z18.b, z0.b[2]\n" + "udot z25.s, z18.b, z1.b[2]\n" + "udot z26.s, z18.b, z2.b[2]\n" + "udot z27.s, z18.b, z3.b[2]\n" + "udot z28.s, z18.b, z4.b[2]\n" + "udot z29.s, z18.b, z5.b[2]\n" + "udot z30.s, z18.b, z6.b[2]\n" + "udot z31.s, 
z18.b, z7.b[2]\n" + "udot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" + "udot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n" + "udot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n" + "udot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n" + "udot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n" + "udot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n" + "udot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n" + "udot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" + "udot z24.s, z20.b, z0.b[0]\n" + "udot z25.s, z20.b, z1.b[0]\n" + "udot z26.s, z20.b, z2.b[0]\n" + "udot z27.s, z20.b, z3.b[0]\n" + "udot z28.s, z20.b, z4.b[0]\n" + "udot z29.s, z20.b, z5.b[0]\n" + "udot z30.s, z20.b, z6.b[0]\n" + "udot z31.s, z20.b, z7.b[0]\n" + "udot z24.s, z21.b, z0.b[1]\n" + "udot z25.s, z21.b, z1.b[1]\n" + "udot z26.s, z21.b, z2.b[1]\n" + "udot z27.s, z21.b, z3.b[1]\n" + "udot z28.s, z21.b, z4.b[1]\n" + "udot z29.s, z21.b, z5.b[1]\n" + "udot z30.s, z21.b, z6.b[1]\n" + "udot z31.s, z21.b, z7.b[1]\n" + "udot z24.s, z22.b, z0.b[2]\n" + "udot z25.s, z22.b, z1.b[2]\n" + "udot z26.s, z22.b, z2.b[2]\n" + "udot z27.s, z22.b, z3.b[2]\n" + "udot z28.s, z22.b, z4.b[2]\n" + "udot z29.s, z22.b, z5.b[2]\n" + "udot z30.s, z22.b, z6.b[2]\n" + "udot z31.s, z22.b, z7.b[2]\n" + "udot z24.s, z23.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" + "udot z25.s, z23.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n" + "udot z26.s, z23.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n" + "udot z27.s, z23.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n" + "udot z28.s, z23.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n" + "udot z29.s, z23.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n" + "udot z30.s, z23.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n" + "udot z31.s, z23.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" + "udot z30.s, z16.b, z6.b[0]\n" + "udot z31.s, z16.b, z7.b[0]\n" + "5:\n" "st1w z24.s, p0, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -3228,52 +3705,998 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" - "mov z24.s, #0\n" "ptrue p7.b\n" - "mov z25.s, #0\n" "whilelt p6.b, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + 
"prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z25.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "mov z29.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "mov z30.s, #0\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "mov z31.s, #0\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" + "udot z30.s, z16.b, z6.b[0]\n" + "udot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "udot z26.s, z17.b, z2.b[1]\n" + "udot z27.s, z17.b, z3.b[1]\n" + "udot z28.s, z17.b, z4.b[1]\n" + "udot z29.s, z17.b, z5.b[1]\n" + "udot z30.s, z17.b, z6.b[1]\n" + "udot z31.s, z17.b, z7.b[1]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "udot z24.s, z18.b, z0.b[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #2\n" + "udot z25.s, z18.b, z1.b[2]\n" + "udot z26.s, z18.b, z2.b[2]\n" + "udot z27.s, z18.b, z3.b[2]\n" + "udot z28.s, z18.b, z4.b[2]\n" + "udot z29.s, z18.b, z5.b[2]\n" + "udot z30.s, z18.b, z6.b[2]\n" + "udot z31.s, z18.b, z7.b[2]\n" + "udot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" + "udot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n" + "udot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n" + "udot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n" + "udot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n" + "udot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n" + "udot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n" + "udot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" + "udot z24.s, z20.b, z0.b[0]\n" + "udot z25.s, z20.b, z1.b[0]\n" + "udot z26.s, z20.b, z2.b[0]\n" + "udot z27.s, z20.b, z3.b[0]\n" + "udot z28.s, z20.b, z4.b[0]\n" + "udot z29.s, z20.b, z5.b[0]\n" + "udot z30.s, z20.b, z6.b[0]\n" + "udot z31.s, z20.b, z7.b[0]\n" + "udot z24.s, z21.b, z0.b[1]\n" + "udot z25.s, z21.b, z1.b[1]\n" + "udot z26.s, z21.b, z2.b[1]\n" + "udot z27.s, z21.b, z3.b[1]\n" + "udot z28.s, z21.b, z4.b[1]\n" + "udot z29.s, z21.b, z5.b[1]\n" + "udot z30.s, z21.b, z6.b[1]\n" + "udot z31.s, z21.b, z7.b[1]\n" + "udot z24.s, z22.b, z0.b[2]\n" + "udot z25.s, z22.b, z1.b[2]\n" + "udot z26.s, z22.b, z2.b[2]\n" + "udot z27.s, z22.b, z3.b[2]\n" + "udot z28.s, z22.b, z4.b[2]\n" + "udot z29.s, z22.b, z5.b[2]\n" + "udot z30.s, z22.b, z6.b[2]\n" + "udot z31.s, z22.b, z7.b[2]\n" + "udot z24.s, z23.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" + "udot z25.s, z23.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n" + "udot z26.s, z23.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n" + "udot z27.s, z23.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n" + "udot z28.s, z23.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n" + "udot z29.s, z23.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n" + "udot z30.s, z23.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n" + "udot z31.s, z23.b, 
z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" + "udot z30.s, z16.b, z6.b[0]\n" + "udot z31.s, z16.b, z7.b[0]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "udot z26.s, z17.b, z2.b[1]\n" + "udot z27.s, z17.b, z3.b[1]\n" + "udot z28.s, z17.b, z4.b[1]\n" + "udot z29.s, z17.b, z5.b[1]\n" + "udot z30.s, z17.b, z6.b[1]\n" + "udot z31.s, z17.b, z7.b[1]\n" + "b.eq 3f\n" + "4:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z24.s, #0\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" "mov z26.s, #0\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "udot z29.s, z16.b, z5.b[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "udot z30.s, z16.b, z6.b[0]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "udot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + "udot z26.s, z17.b, z2.b[1]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + "udot z27.s, z17.b, z3.b[1]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" + "udot z28.s, z17.b, z4.b[1]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" + "udot z29.s, z17.b, z5.b[1]\n" + "udot z30.s, z17.b, z6.b[1]\n" + "udot z31.s, z17.b, z7.b[1]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "udot z24.s, z18.b, z0.b[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #2\n" + "udot z25.s, z18.b, z1.b[2]\n" + "udot z26.s, z18.b, z2.b[2]\n" + "udot z27.s, z18.b, z3.b[2]\n" + "udot z28.s, z18.b, z4.b[2]\n" + "udot z29.s, z18.b, z5.b[2]\n" + "udot z30.s, z18.b, z6.b[2]\n" + "udot z31.s, z18.b, z7.b[2]\n" + "udot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" + "udot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n" + "udot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n" + "udot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n" + "udot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, 
p7/z, [a_ptr4, #0x10]\n" + "udot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n" + "udot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n" + "udot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" + "udot z24.s, z20.b, z0.b[0]\n" + "udot z25.s, z20.b, z1.b[0]\n" + "udot z26.s, z20.b, z2.b[0]\n" + "udot z27.s, z20.b, z3.b[0]\n" + "udot z28.s, z20.b, z4.b[0]\n" + "udot z29.s, z20.b, z5.b[0]\n" + "udot z30.s, z20.b, z6.b[0]\n" + "udot z31.s, z20.b, z7.b[0]\n" + "udot z24.s, z21.b, z0.b[1]\n" + "udot z25.s, z21.b, z1.b[1]\n" + "udot z26.s, z21.b, z2.b[1]\n" + "udot z27.s, z21.b, z3.b[1]\n" + "udot z28.s, z21.b, z4.b[1]\n" + "udot z29.s, z21.b, z5.b[1]\n" + "udot z30.s, z21.b, z6.b[1]\n" + "udot z31.s, z21.b, z7.b[1]\n" + "udot z24.s, z22.b, z0.b[2]\n" + "udot z25.s, z22.b, z1.b[2]\n" + "udot z26.s, z22.b, z2.b[2]\n" + "udot z27.s, z22.b, z3.b[2]\n" + "udot z28.s, z22.b, z4.b[2]\n" + "udot z29.s, z22.b, z5.b[2]\n" + "udot z30.s, z22.b, z6.b[2]\n" + "udot z31.s, z22.b, z7.b[2]\n" + "udot z24.s, z23.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" + "udot z25.s, z23.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n" + "udot z26.s, z23.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n" + "udot z27.s, z23.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n" + "udot z28.s, z23.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n" + "udot z29.s, z23.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n" + "udot z30.s, z23.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n" + "udot z31.s, z23.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" + "udot z30.s, z16.b, z6.b[0]\n" + "udot z31.s, z16.b, z7.b[0]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "udot z26.s, z17.b, z2.b[1]\n" + "udot z27.s, z17.b, z3.b[1]\n" + "udot z28.s, z17.b, z4.b[1]\n" + "udot z29.s, z17.b, z5.b[1]\n" + "udot z30.s, z17.b, z6.b[1]\n" + "udot z31.s, z17.b, z7.b[1]\n" + "b.ne 4b\n" + "3:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "mov z24.s, #0\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "ld1rqb z6.b, 
p7/z, [a_ptr6]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "udot z29.s, z16.b, z5.b[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "udot z30.s, z16.b, z6.b[0]\n" + "udot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "udot z26.s, z17.b, z2.b[1]\n" + "udot z27.s, z17.b, z3.b[1]\n" + "udot z28.s, z17.b, z4.b[1]\n" + "udot z29.s, z17.b, z5.b[1]\n" + "udot z30.s, z17.b, z6.b[1]\n" + "udot z31.s, z17.b, z7.b[1]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "udot z24.s, z18.b, z0.b[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #2\n" + "udot z25.s, z18.b, z1.b[2]\n" + "udot z26.s, z18.b, z2.b[2]\n" + "udot z27.s, z18.b, z3.b[2]\n" + "udot z28.s, z18.b, z4.b[2]\n" + "udot z29.s, z18.b, z5.b[2]\n" + "udot z30.s, z18.b, z6.b[2]\n" + "udot z31.s, z18.b, z7.b[2]\n" + "udot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" + "udot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n" + "udot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n" + "udot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n" + "udot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n" + "udot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n" + "udot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n" + "udot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" + "udot z24.s, z20.b, z0.b[0]\n" + "udot z25.s, z20.b, z1.b[0]\n" + "udot z26.s, z20.b, z2.b[0]\n" + "udot z27.s, z20.b, z3.b[0]\n" + "udot z28.s, z20.b, z4.b[0]\n" + "udot z29.s, z20.b, z5.b[0]\n" + "udot z30.s, z20.b, z6.b[0]\n" + "udot z31.s, z20.b, z7.b[0]\n" + "udot z24.s, z21.b, z0.b[1]\n" + "udot z25.s, z21.b, z1.b[1]\n" + "udot z26.s, z21.b, z2.b[1]\n" + "udot z27.s, z21.b, z3.b[1]\n" + "udot z28.s, z21.b, z4.b[1]\n" + "udot z29.s, z21.b, z5.b[1]\n" + "udot z30.s, z21.b, z6.b[1]\n" + "udot z31.s, z21.b, z7.b[1]\n" + "udot z24.s, z22.b, z0.b[2]\n" + "udot z25.s, z22.b, z1.b[2]\n" + "udot z26.s, z22.b, z2.b[2]\n" + "udot z27.s, z22.b, z3.b[2]\n" + "udot z28.s, z22.b, z4.b[2]\n" + "udot z29.s, z22.b, z5.b[2]\n" + "udot z30.s, z22.b, z6.b[2]\n" + "udot z31.s, z22.b, z7.b[2]\n" + "udot z24.s, z23.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" + "udot z25.s, z23.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n" + "udot z26.s, z23.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n" + "udot z27.s, z23.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n" + "udot z28.s, z23.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n" + "udot z29.s, z23.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n" + "udot z30.s, z23.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n" + "udot z31.s, z23.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" + "udot z30.s, z16.b, z6.b[0]\n" + "udot z31.s, z16.b, z7.b[0]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "udot z26.s, z17.b, z2.b[1]\n" + "udot z27.s, z17.b, z3.b[1]\n" + "udot z28.s, z17.b, z4.b[1]\n" + "udot z29.s, z17.b, z5.b[1]\n" + "udot z30.s, z17.b, z6.b[1]\n" + "udot z31.s, z17.b, z7.b[1]\n" + "b 5f\n" + "2:\n" + "mov z24.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z25.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z26.s, 
#0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "mov z29.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "mov z30.s, #0\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "mov z31.s, #0\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" + "udot z30.s, z16.b, z6.b[0]\n" + "udot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "udot z26.s, z17.b, z2.b[1]\n" + "udot z27.s, z17.b, z3.b[1]\n" + "udot z28.s, z17.b, z4.b[1]\n" + "udot z29.s, z17.b, z5.b[1]\n" + "udot z30.s, z17.b, z6.b[1]\n" + "udot z31.s, z17.b, z7.b[1]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "udot z24.s, z18.b, z0.b[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #2\n" + "udot z25.s, z18.b, z1.b[2]\n" + "udot z26.s, z18.b, z2.b[2]\n" + "udot z27.s, z18.b, z3.b[2]\n" + "udot z28.s, z18.b, z4.b[2]\n" + "udot z29.s, z18.b, z5.b[2]\n" + "udot z30.s, z18.b, z6.b[2]\n" + "udot z31.s, z18.b, z7.b[2]\n" + "udot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" + "udot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n" + "udot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n" + "udot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n" + "udot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n" + "udot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n" + "udot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n" + "udot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" + "udot z24.s, z20.b, z0.b[0]\n" + "udot z25.s, z20.b, z1.b[0]\n" + "udot z26.s, z20.b, z2.b[0]\n" + "udot z27.s, z20.b, z3.b[0]\n" + "udot z28.s, z20.b, z4.b[0]\n" + "udot z29.s, z20.b, z5.b[0]\n" + "udot z30.s, z20.b, z6.b[0]\n" + "udot z31.s, z20.b, z7.b[0]\n" + "udot z24.s, z21.b, z0.b[1]\n" + "udot z25.s, z21.b, z1.b[1]\n" + "udot z26.s, z21.b, z2.b[1]\n" + "udot z27.s, z21.b, z3.b[1]\n" + "udot z28.s, z21.b, z4.b[1]\n" + "udot z29.s, z21.b, z5.b[1]\n" + "udot z30.s, z21.b, z6.b[1]\n" + "udot z31.s, z21.b, z7.b[1]\n" + "udot z24.s, z22.b, z0.b[2]\n" + "udot z25.s, z22.b, z1.b[2]\n" + "udot z26.s, z22.b, z2.b[2]\n" + "udot z27.s, z22.b, z3.b[2]\n" + "udot z28.s, z22.b, z4.b[2]\n" + "udot z29.s, z22.b, z5.b[2]\n" + "udot z30.s, z22.b, z6.b[2]\n" + "udot z31.s, z22.b, z7.b[2]\n" + "udot z24.s, z23.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" + "udot z25.s, z23.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n" + "udot z26.s, z23.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n" + "udot z27.s, z23.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n" + "udot z28.s, z23.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n" + "udot z29.s, z23.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n" + "udot z30.s, z23.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n" + "udot z31.s, z23.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" + "udot z30.s, z16.b, z6.b[0]\n" + "udot z31.s, z16.b, z7.b[0]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "udot z26.s, 
z17.b, z2.b[1]\n" + "udot z27.s, z17.b, z3.b[1]\n" + "udot z28.s, z17.b, z4.b[1]\n" + "udot z29.s, z17.b, z5.b[1]\n" + "udot z30.s, z17.b, z6.b[1]\n" + "udot z31.s, z17.b, z7.b[1]\n" + "5:\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p0, [c_ptr1]\n" + "st1w z26.s, p0, [c_ptr2]\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 11: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.b\n" + "whilelt p6.b, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1b z20.b, p7/z, [%[b_ptr0], 
#4, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z25.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "mov z29.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "mov z30.s, #0\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "mov z31.s, #0\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" + "udot z30.s, z16.b, z6.b[0]\n" + "udot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "udot z26.s, z17.b, z2.b[1]\n" + "udot z27.s, z17.b, z3.b[1]\n" + "udot z28.s, z17.b, z4.b[1]\n" + "udot z29.s, z17.b, z5.b[1]\n" + "udot z30.s, z17.b, z6.b[1]\n" + "udot z31.s, z17.b, z7.b[1]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "udot z24.s, z18.b, z0.b[2]\n" + "udot z25.s, z18.b, z1.b[2]\n" + "udot z26.s, z18.b, z2.b[2]\n" + "udot z27.s, z18.b, z3.b[2]\n" + "udot z28.s, z18.b, z4.b[2]\n" + "udot z29.s, z18.b, z5.b[2]\n" + "udot z30.s, z18.b, z6.b[2]\n" + "udot z31.s, z18.b, z7.b[2]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "udot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" + "udot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n" + "udot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n" + "udot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n" + "udot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n" + "udot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n" + "udot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n" + "udot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" + "udot z24.s, z20.b, z0.b[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" + "udot z25.s, z20.b, z1.b[0]\n" + "udot z26.s, z20.b, z2.b[0]\n" + "udot z27.s, z20.b, z3.b[0]\n" + "udot z28.s, z20.b, z4.b[0]\n" + "udot z29.s, z20.b, z5.b[0]\n" + "udot z30.s, z20.b, z6.b[0]\n" + "udot z31.s, z20.b, z7.b[0]\n" + "udot z24.s, z21.b, z0.b[1]\n" + "udot z25.s, z21.b, z1.b[1]\n" + "udot z26.s, z21.b, z2.b[1]\n" + "udot z27.s, z21.b, z3.b[1]\n" + "udot z28.s, z21.b, z4.b[1]\n" + "udot z29.s, z21.b, z5.b[1]\n" + "udot z30.s, z21.b, z6.b[1]\n" + "udot z31.s, z21.b, z7.b[1]\n" + "udot z24.s, z22.b, z0.b[2]\n" + "udot z25.s, z22.b, z1.b[2]\n" + "udot z26.s, z22.b, z2.b[2]\n" + "udot z27.s, z22.b, z3.b[2]\n" + "udot z28.s, z22.b, z4.b[2]\n" + "udot z29.s, z22.b, z5.b[2]\n" + "udot z30.s, z22.b, z6.b[2]\n" + "udot z31.s, z22.b, z7.b[2]\n" + "udot z24.s, z23.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" + "udot z25.s, z23.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n" + "udot z26.s, z23.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n" + "udot z27.s, z23.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n" + "udot z28.s, z23.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n" + "udot z29.s, z23.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n" + "udot z30.s, z23.b, 
z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n" + "udot z31.s, z23.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" + "udot z30.s, z16.b, z6.b[0]\n" + "udot z31.s, z16.b, z7.b[0]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "udot z26.s, z17.b, z2.b[1]\n" + "udot z27.s, z17.b, z3.b[1]\n" + "udot z28.s, z17.b, z4.b[1]\n" + "udot z29.s, z17.b, z5.b[1]\n" + "udot z30.s, z17.b, z6.b[1]\n" + "udot z31.s, z17.b, z7.b[1]\n" + "udot z24.s, z18.b, z0.b[2]\n" + "udot z25.s, z18.b, z1.b[2]\n" + "udot z26.s, z18.b, z2.b[2]\n" + "udot z27.s, z18.b, z3.b[2]\n" + "udot z28.s, z18.b, z4.b[2]\n" + "udot z29.s, z18.b, z5.b[2]\n" + "udot z30.s, z18.b, z6.b[2]\n" + "udot z31.s, z18.b, z7.b[2]\n" + "b.eq 3f\n" + "4:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" + "mov z24.s, #0\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" + "udot z29.s, z16.b, z5.b[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "udot z30.s, z16.b, z6.b[0]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" + "udot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" + "udot z26.s, z17.b, z2.b[1]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" + "udot z27.s, z17.b, z3.b[1]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" + "udot z28.s, z17.b, z4.b[1]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" + "udot z29.s, z17.b, z5.b[1]\n" + "udot z30.s, z17.b, z6.b[1]\n" + "udot z31.s, z17.b, z7.b[1]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "udot z24.s, z18.b, z0.b[2]\n" + "udot z25.s, z18.b, z1.b[2]\n" + "udot z26.s, z18.b, z2.b[2]\n" + "udot z27.s, z18.b, z3.b[2]\n" + "udot z28.s, z18.b, z4.b[2]\n" + "udot z29.s, z18.b, z5.b[2]\n" + "udot z30.s, z18.b, z6.b[2]\n" + "udot z31.s, z18.b, z7.b[2]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL 
VL]\n" + "udot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" + "udot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n" + "udot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n" + "udot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n" + "udot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n" + "udot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n" + "udot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n" + "udot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" + "udot z24.s, z20.b, z0.b[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" + "udot z25.s, z20.b, z1.b[0]\n" + "udot z26.s, z20.b, z2.b[0]\n" + "udot z27.s, z20.b, z3.b[0]\n" + "udot z28.s, z20.b, z4.b[0]\n" + "udot z29.s, z20.b, z5.b[0]\n" + "udot z30.s, z20.b, z6.b[0]\n" + "udot z31.s, z20.b, z7.b[0]\n" + "udot z24.s, z21.b, z0.b[1]\n" + "udot z25.s, z21.b, z1.b[1]\n" + "udot z26.s, z21.b, z2.b[1]\n" + "udot z27.s, z21.b, z3.b[1]\n" + "udot z28.s, z21.b, z4.b[1]\n" + "udot z29.s, z21.b, z5.b[1]\n" + "udot z30.s, z21.b, z6.b[1]\n" + "udot z31.s, z21.b, z7.b[1]\n" + "udot z24.s, z22.b, z0.b[2]\n" + "udot z25.s, z22.b, z1.b[2]\n" + "udot z26.s, z22.b, z2.b[2]\n" + "udot z27.s, z22.b, z3.b[2]\n" + "udot z28.s, z22.b, z4.b[2]\n" + "udot z29.s, z22.b, z5.b[2]\n" + "udot z30.s, z22.b, z6.b[2]\n" + "udot z31.s, z22.b, z7.b[2]\n" + "udot z24.s, z23.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" + "udot z25.s, z23.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n" + "udot z26.s, z23.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n" + "udot z27.s, z23.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n" + "udot z28.s, z23.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n" + "udot z29.s, z23.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n" + "udot z30.s, z23.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n" + "udot z31.s, z23.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" + "udot z30.s, z16.b, z6.b[0]\n" + "udot z31.s, z16.b, z7.b[0]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "udot z26.s, z17.b, z2.b[1]\n" + "udot z27.s, z17.b, z3.b[1]\n" + "udot z28.s, z17.b, z4.b[1]\n" + "udot z29.s, z17.b, z5.b[1]\n" + "udot z30.s, z17.b, z6.b[1]\n" + "udot z31.s, z17.b, z7.b[1]\n" + "udot z24.s, z18.b, z0.b[2]\n" + "udot z25.s, z18.b, z1.b[2]\n" + "udot z26.s, z18.b, z2.b[2]\n" + "udot z27.s, z18.b, z3.b[2]\n" + "udot z28.s, z18.b, z4.b[2]\n" + "udot z29.s, z18.b, z5.b[2]\n" + "udot z30.s, z18.b, z6.b[2]\n" + "udot z31.s, z18.b, z7.b[2]\n" + "b.ne 4b\n" + "3:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "mov z24.s, #0\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z28.s, #0\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z29.s, #0\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z30.s, #0\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z31.s, #0\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" "ld1b z22.b, 
p7/z, [%[b_ptr0], #6, MUL VL]\n" - "whilelt p0.s, %[temp], %[last_width]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" - "udot z24.s, z16.b, z0.b[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "udot z25.s, z16.b, z1.b[0]\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "udot z26.s, z16.b, z2.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" "udot z27.s, z16.b, z3.b[0]\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" "udot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" "udot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "addvl c_ptr7, c_ptr7, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" "udot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "udot z27.s, z17.b, z3.b[1]\n" "udot z28.s, z17.b, z4.b[1]\n" "udot z29.s, z17.b, z5.b[1]\n" @@ -3281,7 +4704,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z17.b, z7.b[1]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "udot z24.s, z18.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #2\n" "udot z25.s, z18.b, z1.b[2]\n" "udot z26.s, z18.b, z2.b[2]\n" "udot z27.s, z18.b, z3.b[2]\n" @@ -3289,6 +4711,7 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" "udot z31.s, z18.b, z7.b[2]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "udot z24.s, z19.b, z0.b[3]\n" "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" "udot z25.s, z19.b, z1.b[3]\n" @@ -3306,6 +4729,7 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" "udot z24.s, z20.b, z0.b[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" "udot z25.s, z20.b, z1.b[0]\n" "udot z26.s, z20.b, z2.b[0]\n" "udot z27.s, z20.b, z3.b[0]\n" @@ -3361,84 +4785,55 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" - "cbz %[loops], 2f\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "b.eq 3f\n" - "4:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" + "udot z24.s, z18.b, z0.b[2]\n" + "udot z25.s, z18.b, z1.b[2]\n" + "udot z26.s, z18.b, z2.b[2]\n" + "udot z27.s, z18.b, z3.b[2]\n" + "udot z28.s, z18.b, z4.b[2]\n" + "udot z29.s, z18.b, 
z5.b[2]\n" + "udot z30.s, z18.b, z6.b[2]\n" + "udot z31.s, z18.b, z7.b[2]\n" + "b 5f\n" + "2:\n" "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "subs %[loops], %[loops], #0x1\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" "mov z25.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "udot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "addvl c_ptr1, c_ptr1, #1\n" - "udot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z26.s, z16.b, z2.b[0]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" "udot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "udot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" "udot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" "udot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" "udot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" "udot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "udot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z31.s, z17.b, z7.b[1]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "udot z24.s, z18.b, z0.b[2]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z25.s, z18.b, z1.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #2\n" "udot z26.s, z18.b, z2.b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z27.s, z18.b, z3.b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z28.s, z18.b, z4.b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" "udot z31.s, z18.b, z7.b[2]\n" @@ -3460,7 +4855,7 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" "udot z24.s, z20.b, z0.b[0]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #3\n" "udot z25.s, z20.b, z1.b[0]\n" "udot z26.s, z20.b, z2.b[0]\n" "udot z27.s, z20.b, z3.b[0]\n" @@ -3468,7 +4863,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z20.b, z5.b[0]\n" "udot z30.s, z20.b, z6.b[0]\n" "udot z31.s, z20.b, z7.b[0]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "udot z24.s, z21.b, z0.b[1]\n" "udot z25.s, z21.b, z1.b[1]\n" "udot z26.s, z21.b, z2.b[1]\n" @@ -3477,7 +4871,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z21.b, z5.b[1]\n" "udot z30.s, z21.b, z6.b[1]\n" "udot z31.s, z21.b, z7.b[1]\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "udot z24.s, z22.b, z0.b[2]\n" "udot z25.s, z22.b, z1.b[2]\n" 
"udot z26.s, z22.b, z2.b[2]\n" @@ -3486,7 +4879,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z22.b, z5.b[2]\n" "udot z30.s, z22.b, z6.b[2]\n" "udot z31.s, z22.b, z7.b[2]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "udot z24.s, z23.b, z0.b[3]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" "udot z25.s, z23.b, z1.b[3]\n" @@ -3504,7 +4896,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z23.b, z7.b[3]\n" "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" "udot z24.s, z16.b, z0.b[0]\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" "udot z25.s, z16.b, z1.b[0]\n" "udot z26.s, z16.b, z2.b[0]\n" "udot z27.s, z16.b, z3.b[0]\n" @@ -3512,7 +4903,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z16.b, z5.b[0]\n" "udot z30.s, z16.b, z6.b[0]\n" "udot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "udot z24.s, z17.b, z0.b[1]\n" "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" @@ -3521,53 +4911,149 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" + "udot z24.s, z18.b, z0.b[2]\n" + "udot z25.s, z18.b, z1.b[2]\n" + "udot z26.s, z18.b, z2.b[2]\n" + "udot z27.s, z18.b, z3.b[2]\n" + "udot z28.s, z18.b, z4.b[2]\n" + "udot z29.s, z18.b, z5.b[2]\n" + "udot z30.s, z18.b, z6.b[2]\n" + "udot z31.s, z18.b, z7.b[2]\n" + "5:\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p0, [c_ptr1]\n" + "st1w z26.s, p0, [c_ptr2]\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 12: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + 
"add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.b\n" + "whilelt p6.b, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "b.ne 4b\n" - "3:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "udot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "udot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z26.s, z16.b, z2.b[0]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" "udot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr3, c_ptr3, #1\n" - "udot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" "udot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" "udot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr6, c_ptr6, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" "udot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" "udot z27.s, z17.b, z3.b[1]\n" "udot z28.s, z17.b, z4.b[1]\n" @@ -3576,7 +5062,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z17.b, z7.b[1]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "udot z24.s, z18.b, z0.b[2]\n" - 
"addvl %[b_ptr0], %[b_ptr0], #2\n" "udot z25.s, z18.b, z1.b[2]\n" "udot z26.s, z18.b, z2.b[2]\n" "udot z27.s, z18.b, z3.b[2]\n" @@ -3584,6 +5069,7 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" "udot z31.s, z18.b, z7.b[2]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "udot z24.s, z19.b, z0.b[3]\n" "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" "udot z25.s, z19.b, z1.b[3]\n" @@ -3601,7 +5087,9 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" "udot z24.s, z20.b, z0.b[0]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "udot z25.s, z20.b, z1.b[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" "udot z26.s, z20.b, z2.b[0]\n" "udot z27.s, z20.b, z3.b[0]\n" "udot z28.s, z20.b, z4.b[0]\n" @@ -3656,149 +5144,88 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" - "2:\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" - "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" - "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" - "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 11: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - 
"subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" + "udot z24.s, z18.b, z0.b[2]\n" + "udot z25.s, z18.b, z1.b[2]\n" + "udot z26.s, z18.b, z2.b[2]\n" + "udot z27.s, z18.b, z3.b[2]\n" + "udot z28.s, z18.b, z4.b[2]\n" + "udot z29.s, z18.b, z5.b[2]\n" + "udot z30.s, z18.b, z6.b[2]\n" + "udot z31.s, z18.b, z7.b[2]\n" + "udot z24.s, z19.b, z0.b[3]\n" + "udot z25.s, z19.b, z1.b[3]\n" + "udot z26.s, z19.b, z2.b[3]\n" + "udot z27.s, z19.b, z3.b[3]\n" + "udot z28.s, z19.b, z4.b[3]\n" + "udot z29.s, z19.b, z5.b[3]\n" + "udot z30.s, z19.b, z6.b[3]\n" + "udot z31.s, z19.b, z7.b[3]\n" + "b.eq 3f\n" + "4:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" "mov z24.s, #0\n" - "ptrue p7.b\n" - "mov z25.s, #0\n" - "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z28.s, #0\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z29.s, #0\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z30.s, #0\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z31.s, #0\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "whilelt p0.s, %[temp], %[last_width]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" - "udot z24.s, z16.b, z0.b[0]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "udot z25.s, z16.b, z1.b[0]\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "udot z26.s, z16.b, z2.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" "udot z27.s, z16.b, z3.b[0]\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" "udot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" "udot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "addvl c_ptr7, c_ptr7, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z31.s, z16.b, 
z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "udot z25.s, z17.b, z1.b[1]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z26.s, z17.b, z2.b[1]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z27.s, z17.b, z3.b[1]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" @@ -3829,8 +5256,9 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" "udot z24.s, z20.b, z0.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #3\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "udot z25.s, z20.b, z1.b[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" "udot z26.s, z20.b, z2.b[0]\n" "udot z27.s, z20.b, z3.b[0]\n" "udot z28.s, z20.b, z4.b[0]\n" @@ -3893,82 +5321,79 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" "udot z31.s, z18.b, z7.b[2]\n" - "cbz %[loops], 2f\n" + "udot z24.s, z19.b, z0.b[3]\n" + "udot z25.s, z19.b, z1.b[3]\n" + "udot z26.s, z19.b, z2.b[3]\n" + "udot z27.s, z19.b, z3.b[3]\n" + "udot z28.s, z19.b, z4.b[3]\n" + "udot z29.s, z19.b, z5.b[3]\n" + "udot z30.s, z19.b, z6.b[3]\n" + "udot z31.s, z19.b, z7.b[3]\n" + "b.ne 4b\n" + "3:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "mov z24.s, #0\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "b.eq 3f\n" - "4:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" "addvl %[b_ptr0], %[b_ptr0], #8\n" - "mov z24.s, #0\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "subs %[loops], %[loops], #0x1\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "mov z25.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "udot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" - "mov z26.s, #0\n" - "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "addvl c_ptr1, c_ptr1, #1\n" - "udot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" - "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, z0.b[1]\n" "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "udot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "udot z25.s, z17.b, z1.b[1]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" - "udot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "udot z24.s, z16.b, z0.b[0]\n" "st1w z30.s, p7, [c_ptr6]\n" "mov z30.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" - "udot z29.s, z16.b, z5.b[0]\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "udot z25.s, 
z16.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "udot z26.s, z16.b, z2.b[0]\n" "st1w z31.s, p7, [c_ptr7]\n" "mov z31.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" - "udot z30.s, z16.b, z6.b[0]\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z17.b, z0.b[1]\n" "addvl c_ptr6, c_ptr6, #1\n" + "udot z29.s, z16.b, z5.b[0]\n" + "addvl c_ptr7, c_ptr7, #1\n" + "udot z30.s, z16.b, z6.b[0]\n" "udot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" "udot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "udot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z31.s, z17.b, z7.b[1]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "udot z24.s, z18.b, z0.b[2]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z25.s, z18.b, z1.b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z26.s, z18.b, z2.b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z27.s, z18.b, z3.b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z28.s, z18.b, z4.b[2]\n" "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" @@ -3991,16 +5416,15 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" "udot z24.s, z20.b, z0.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #3\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "udot z25.s, z20.b, z1.b[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" "udot z26.s, z20.b, z2.b[0]\n" "udot z27.s, z20.b, z3.b[0]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "udot z28.s, z20.b, z4.b[0]\n" "udot z29.s, z20.b, z5.b[0]\n" "udot z30.s, z20.b, z6.b[0]\n" "udot z31.s, z20.b, z7.b[0]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "udot z24.s, z21.b, z0.b[1]\n" "udot z25.s, z21.b, z1.b[1]\n" "udot z26.s, z21.b, z2.b[1]\n" @@ -4009,7 +5433,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z21.b, z5.b[1]\n" "udot z30.s, z21.b, z6.b[1]\n" "udot z31.s, z21.b, z7.b[1]\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "udot z24.s, z22.b, z0.b[2]\n" "udot z25.s, z22.b, z1.b[2]\n" "udot z26.s, z22.b, z2.b[2]\n" @@ -4018,7 +5441,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z22.b, z5.b[2]\n" "udot z30.s, z22.b, z6.b[2]\n" "udot z31.s, z22.b, z7.b[2]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "udot z24.s, z23.b, z0.b[3]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" "udot z25.s, z23.b, z1.b[3]\n" @@ -4036,7 +5458,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z23.b, z7.b[3]\n" "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" "udot z24.s, z16.b, z0.b[0]\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" "udot z25.s, z16.b, z1.b[0]\n" "udot z26.s, z16.b, z2.b[0]\n" "udot z27.s, z16.b, z3.b[0]\n" @@ -4044,7 +5465,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z16.b, z5.b[0]\n" "udot z30.s, z16.b, z6.b[0]\n" "udot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "udot z24.s, z17.b, z0.b[1]\n" "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" @@ -4053,7 +5473,6 @@ void 
sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "udot z24.s, z18.b, z0.b[2]\n" "udot z25.s, z18.b, z1.b[2]\n" "udot z26.s, z18.b, z2.b[2]\n" @@ -4062,53 +5481,43 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" "udot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "b.ne 4b\n" - "3:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" + "udot z24.s, z19.b, z0.b[3]\n" + "udot z25.s, z19.b, z1.b[3]\n" + "udot z26.s, z19.b, z2.b[3]\n" + "udot z27.s, z19.b, z3.b[3]\n" + "udot z28.s, z19.b, z4.b[3]\n" + "udot z29.s, z19.b, z5.b[3]\n" + "udot z30.s, z19.b, z6.b[3]\n" + "udot z31.s, z19.b, z7.b[3]\n" + "b 5f\n" + "2:\n" "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "udot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "udot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z26.s, z16.b, z2.b[0]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" "udot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr3, c_ptr3, #1\n" - "udot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" "udot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" "udot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr6, c_ptr6, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" "udot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" "udot z27.s, z17.b, z3.b[1]\n" "udot z28.s, z17.b, z4.b[1]\n" @@ -4142,8 +5551,9 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" "udot z24.s, z20.b, z0.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #3\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "udot z25.s, z20.b, z1.b[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #4\n" "udot z26.s, z20.b, z2.b[0]\n" "udot z27.s, z20.b, z3.b[0]\n" "udot z28.s, z20.b, z4.b[0]\n" @@ -4206,23 +5616,24 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" "udot z31.s, z18.b, z7.b[2]\n" - "2:\n" + "udot z24.s, z19.b, z0.b[3]\n" + "udot z25.s, z19.b, z1.b[3]\n" + "udot z26.s, z19.b, z2.b[3]\n" + "udot z27.s, z19.b, z3.b[3]\n" + "udot z28.s, z19.b, z4.b[3]\n" + "udot z29.s, z19.b, z5.b[3]\n" + "udot z30.s, z19.b, z6.b[3]\n" + "udot z31.s, z19.b, z7.b[3]\n" + "5:\n" "st1w z24.s, p0, 
[%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -4242,7 +5653,7 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" ); break; - case 12: + case 13: __asm __volatile ( "a_ptr1 .req X0\n" "a_ptr2 .req X1\n" @@ -4301,54 +5712,242 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "ptrue p7.b\n" + "whilelt p6.b, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z25.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "mov z29.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "mov z30.s, #0\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "mov z31.s, #0\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" + "udot z30.s, z16.b, z6.b[0]\n" + "udot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "udot z26.s, z17.b, z2.b[1]\n" + "udot z27.s, z17.b, z3.b[1]\n" + "udot z28.s, z17.b, z4.b[1]\n" + "udot z29.s, z17.b, z5.b[1]\n" + "udot z30.s, z17.b, z6.b[1]\n" + "udot z31.s, z17.b, z7.b[1]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "udot z24.s, z18.b, z0.b[2]\n" + "udot z25.s, z18.b, z1.b[2]\n" + "udot z26.s, z18.b, z2.b[2]\n" + "udot z27.s, z18.b, z3.b[2]\n" + "udot z28.s, z18.b, z4.b[2]\n" + "udot z29.s, z18.b, z5.b[2]\n" + "udot z30.s, z18.b, z6.b[2]\n" + "udot z31.s, z18.b, z7.b[2]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "udot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" + "udot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n" + "udot z26.s, z19.b, z2.b[3]\n" 
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n" + "udot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n" + "udot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n" + "udot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n" + "udot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n" + "udot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" + "udot z24.s, z20.b, z0.b[0]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "udot z25.s, z20.b, z1.b[0]\n" + "udot z26.s, z20.b, z2.b[0]\n" + "udot z27.s, z20.b, z3.b[0]\n" + "udot z28.s, z20.b, z4.b[0]\n" + "udot z29.s, z20.b, z5.b[0]\n" + "udot z30.s, z20.b, z6.b[0]\n" + "udot z31.s, z20.b, z7.b[0]\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "udot z24.s, z21.b, z0.b[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #5\n" + "udot z25.s, z21.b, z1.b[1]\n" + "udot z26.s, z21.b, z2.b[1]\n" + "udot z27.s, z21.b, z3.b[1]\n" + "udot z28.s, z21.b, z4.b[1]\n" + "udot z29.s, z21.b, z5.b[1]\n" + "udot z30.s, z21.b, z6.b[1]\n" + "udot z31.s, z21.b, z7.b[1]\n" + "udot z24.s, z22.b, z0.b[2]\n" + "udot z25.s, z22.b, z1.b[2]\n" + "udot z26.s, z22.b, z2.b[2]\n" + "udot z27.s, z22.b, z3.b[2]\n" + "udot z28.s, z22.b, z4.b[2]\n" + "udot z29.s, z22.b, z5.b[2]\n" + "udot z30.s, z22.b, z6.b[2]\n" + "udot z31.s, z22.b, z7.b[2]\n" + "udot z24.s, z23.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n" + "udot z25.s, z23.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n" + "udot z26.s, z23.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n" + "udot z27.s, z23.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n" + "udot z28.s, z23.b, z4.b[3]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n" + "udot z29.s, z23.b, z5.b[3]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n" + "udot z30.s, z23.b, z6.b[3]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n" + "udot z31.s, z23.b, z7.b[3]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" + "udot z30.s, z16.b, z6.b[0]\n" + "udot z31.s, z16.b, z7.b[0]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "udot z26.s, z17.b, z2.b[1]\n" + "udot z27.s, z17.b, z3.b[1]\n" + "udot z28.s, z17.b, z4.b[1]\n" + "udot z29.s, z17.b, z5.b[1]\n" + "udot z30.s, z17.b, z6.b[1]\n" + "udot z31.s, z17.b, z7.b[1]\n" + "udot z24.s, z18.b, z0.b[2]\n" + "udot z25.s, z18.b, z1.b[2]\n" + "udot z26.s, z18.b, z2.b[2]\n" + "udot z27.s, z18.b, z3.b[2]\n" + "udot z28.s, z18.b, z4.b[2]\n" + "udot z29.s, z18.b, z5.b[2]\n" + "udot z30.s, z18.b, z6.b[2]\n" + "udot z31.s, z18.b, z7.b[2]\n" + "udot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n" + "udot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n" + "udot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n" + "udot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n" + "udot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n" + "udot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n" + "udot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n" + "udot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n" + "udot z24.s, z20.b, z0.b[0]\n" + "udot z25.s, z20.b, z1.b[0]\n" + "udot z26.s, z20.b, z2.b[0]\n" + "udot z27.s, z20.b, z3.b[0]\n" + "udot z28.s, z20.b, z4.b[0]\n" + "udot z29.s, z20.b, z5.b[0]\n" + "udot z30.s, z20.b, z6.b[0]\n" + 
"udot z31.s, z20.b, z7.b[0]\n" + "b.eq 3f\n" + "4:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" "mov z24.s, #0\n" - "ptrue p7.b\n" - "mov z25.s, #0\n" - "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z28.s, #0\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z29.s, #0\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z30.s, #0\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z31.s, #0\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "whilelt p0.s, %[temp], %[last_width]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" - "udot z24.s, z16.b, z0.b[0]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "udot z25.s, z16.b, z1.b[0]\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "udot z26.s, z16.b, z2.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" "udot z27.s, z16.b, z3.b[0]\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" "udot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" "udot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "addvl c_ptr7, c_ptr7, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "udot z25.s, z17.b, z1.b[1]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z26.s, z17.b, z2.b[1]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z27.s, z17.b, z3.b[1]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" @@ -4381,14 +5980,15 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z24.s, z20.b, z0.b[0]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "udot z25.s, z20.b, z1.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" "udot z26.s, z20.b, z2.b[0]\n" "udot z27.s, z20.b, z3.b[0]\n" "udot z28.s, z20.b, z4.b[0]\n" "udot z29.s, z20.b, z5.b[0]\n" "udot z30.s, z20.b, z6.b[0]\n" "udot z31.s, z20.b, z7.b[0]\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "udot z24.s, z21.b, z0.b[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #5\n" "udot z25.s, z21.b, z1.b[1]\n" "udot z26.s, z21.b, z2.b[1]\n" "udot z27.s, z21.b, z3.b[1]\n" @@ -4405,21 
+6005,21 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z30.s, z22.b, z6.b[2]\n" "udot z31.s, z22.b, z7.b[2]\n" "udot z24.s, z23.b, z0.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n" "udot z25.s, z23.b, z1.b[3]\n" - "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n" "udot z26.s, z23.b, z2.b[3]\n" - "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n" "udot z27.s, z23.b, z3.b[3]\n" - "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n" "udot z28.s, z23.b, z4.b[3]\n" - "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n" "udot z29.s, z23.b, z5.b[3]\n" - "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n" "udot z30.s, z23.b, z6.b[3]\n" - "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n" "udot z31.s, z23.b, z7.b[3]\n" - "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n" "udot z24.s, z16.b, z0.b[0]\n" "udot z25.s, z16.b, z1.b[0]\n" "udot z26.s, z16.b, z2.b[0]\n" @@ -4445,89 +6045,94 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z30.s, z18.b, z6.b[2]\n" "udot z31.s, z18.b, z7.b[2]\n" "udot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n" "udot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n" "udot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n" "udot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n" "udot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n" "udot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n" "udot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n" "udot z31.s, z19.b, z7.b[3]\n" - "cbz %[loops], 2f\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n" + "udot z24.s, z20.b, z0.b[0]\n" + "udot z25.s, z20.b, z1.b[0]\n" + "udot z26.s, z20.b, z2.b[0]\n" + "udot z27.s, z20.b, z3.b[0]\n" + "udot z28.s, z20.b, z4.b[0]\n" + "udot z29.s, z20.b, z5.b[0]\n" + "udot z30.s, z20.b, z6.b[0]\n" + "udot z31.s, z20.b, z7.b[0]\n" + "b.ne 4b\n" + "3:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "mov z24.s, #0\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "b.eq 3f\n" - "4:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" "addvl %[b_ptr0], %[b_ptr0], #8\n" - "mov z24.s, #0\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "subs %[loops], %[loops], #0x1\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "mov z25.s, #0\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" "udot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" - "mov z26.s, #0\n" + "st1w z30.s, p7, [c_ptr6]\n" + 
"mov z30.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "addvl c_ptr1, c_ptr1, #1\n" "udot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" - "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" "udot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" "udot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "udot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" "udot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" "udot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" + "addvl c_ptr7, c_ptr7, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" "udot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" "udot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "udot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z31.s, z17.b, z7.b[1]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "udot z24.s, z18.b, z0.b[2]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z25.s, z18.b, z1.b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z26.s, z18.b, z2.b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z27.s, z18.b, z3.b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z28.s, z18.b, z4.b[2]\n" "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" @@ -4552,7 +6157,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z24.s, z20.b, z0.b[0]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "udot z25.s, z20.b, z1.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" "udot z26.s, z20.b, z2.b[0]\n" "udot z27.s, z20.b, z3.b[0]\n" "udot z28.s, z20.b, z4.b[0]\n" @@ -4561,6 +6165,7 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z20.b, z7.b[0]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "udot z24.s, z21.b, z0.b[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #5\n" "udot z25.s, z21.b, z1.b[1]\n" "udot z26.s, z21.b, z2.b[1]\n" "udot z27.s, z21.b, z3.b[1]\n" @@ -4568,7 +6173,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z21.b, z5.b[1]\n" "udot z30.s, z21.b, z6.b[1]\n" "udot z31.s, z21.b, z7.b[1]\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "udot z24.s, z22.b, z0.b[2]\n" "udot z25.s, z22.b, z1.b[2]\n" "udot z26.s, z22.b, z2.b[2]\n" @@ -4577,25 +6181,23 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z22.b, z5.b[2]\n" "udot z30.s, z22.b, z6.b[2]\n" "udot z31.s, z22.b, z7.b[2]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "udot z24.s, z23.b, z0.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n" "udot z25.s, z23.b, z1.b[3]\n" - "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n" "udot z26.s, 
z23.b, z2.b[3]\n" - "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n" "udot z27.s, z23.b, z3.b[3]\n" - "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n" "udot z28.s, z23.b, z4.b[3]\n" - "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n" "udot z29.s, z23.b, z5.b[3]\n" - "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n" "udot z30.s, z23.b, z6.b[3]\n" - "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n" "udot z31.s, z23.b, z7.b[3]\n" - "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n" "udot z24.s, z16.b, z0.b[0]\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" "udot z25.s, z16.b, z1.b[0]\n" "udot z26.s, z16.b, z2.b[0]\n" "udot z27.s, z16.b, z3.b[0]\n" @@ -4603,7 +6205,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z16.b, z5.b[0]\n" "udot z30.s, z16.b, z6.b[0]\n" "udot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "udot z24.s, z17.b, z0.b[1]\n" "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" @@ -4612,7 +6213,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "udot z24.s, z18.b, z0.b[2]\n" "udot z25.s, z18.b, z1.b[2]\n" "udot z26.s, z18.b, z2.b[2]\n" @@ -4621,62 +6221,59 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" "udot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "udot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n" "udot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n" "udot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n" "udot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n" "udot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n" "udot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n" "udot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n" "udot z31.s, z19.b, z7.b[3]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "b.ne 4b\n" - "3:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n" + "udot z24.s, z20.b, z0.b[0]\n" + "udot z25.s, z20.b, z1.b[0]\n" + "udot z26.s, z20.b, z2.b[0]\n" + "udot z27.s, z20.b, z3.b[0]\n" + "udot z28.s, z20.b, z4.b[0]\n" + "udot z29.s, z20.b, z5.b[0]\n" + "udot z30.s, z20.b, z6.b[0]\n" + "udot z31.s, z20.b, z7.b[0]\n" + "b 5f\n" + "2:\n" "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "udot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "udot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z26.s, z16.b, z2.b[0]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" 
- "mov z28.s, #0\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" "udot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr3, c_ptr3, #1\n" - "udot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" "udot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" "udot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr6, c_ptr6, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" "udot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" "udot z27.s, z17.b, z3.b[1]\n" "udot z28.s, z17.b, z4.b[1]\n" @@ -4712,14 +6309,15 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z24.s, z20.b, z0.b[0]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "udot z25.s, z20.b, z1.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #4\n" "udot z26.s, z20.b, z2.b[0]\n" "udot z27.s, z20.b, z3.b[0]\n" "udot z28.s, z20.b, z4.b[0]\n" "udot z29.s, z20.b, z5.b[0]\n" "udot z30.s, z20.b, z6.b[0]\n" "udot z31.s, z20.b, z7.b[0]\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "udot z24.s, z21.b, z0.b[1]\n" + "addvl %[b_ptr0], %[b_ptr0], #5\n" "udot z25.s, z21.b, z1.b[1]\n" "udot z26.s, z21.b, z2.b[1]\n" "udot z27.s, z21.b, z3.b[1]\n" @@ -4736,21 +6334,21 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z30.s, z22.b, z6.b[2]\n" "udot z31.s, z22.b, z7.b[2]\n" "udot z24.s, z23.b, z0.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n" "udot z25.s, z23.b, z1.b[3]\n" - "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n" "udot z26.s, z23.b, z2.b[3]\n" - "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n" "udot z27.s, z23.b, z3.b[3]\n" - "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n" "udot z28.s, z23.b, z4.b[3]\n" - "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n" "udot z29.s, z23.b, z5.b[3]\n" - "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n" "udot z30.s, z23.b, z6.b[3]\n" - "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n" "udot z31.s, z23.b, z7.b[3]\n" - "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n" "udot z24.s, z16.b, z0.b[0]\n" "udot z25.s, z16.b, z1.b[0]\n" "udot z26.s, z16.b, z2.b[0]\n" @@ -4776,30 +6374,39 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z30.s, z18.b, z6.b[2]\n" "udot z31.s, z18.b, z7.b[2]\n" "udot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n" "udot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n" "udot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n" "udot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n" "udot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n" "udot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n" "udot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n" "udot z31.s, z19.b, z7.b[3]\n" - "2:\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n" + "udot z24.s, z20.b, z0.b[0]\n" + "udot z25.s, z20.b, z1.b[0]\n" + "udot z26.s, z20.b, z2.b[0]\n" + "udot z27.s, z20.b, 
z3.b[0]\n" + "udot z28.s, z20.b, z4.b[0]\n" + "udot z29.s, z20.b, z5.b[0]\n" + "udot z30.s, z20.b, z6.b[0]\n" + "udot z31.s, z20.b, z7.b[0]\n" + "5:\n" "st1w z24.s, p0, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -4819,7 +6426,7 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" ); break; - case 13: + case 14: __asm __volatile ( "a_ptr1 .req X0\n" "a_ptr2 .req X1\n" @@ -4878,250 +6485,64 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" - "mov z24.s, #0\n" "ptrue p7.b\n" - "mov z25.s, #0\n" "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z28.s, #0\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z29.s, #0\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z30.s, #0\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z31.s, #0\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "whilelt p0.s, %[temp], %[last_width]\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" - "udot z24.s, z16.b, z0.b[0]\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" - "udot z25.s, z16.b, z1.b[0]\n" - "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "udot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z27.s, z16.b, z3.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "udot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" - "udot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" - "udot z30.s, z16.b, z6.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "udot z31.s, z16.b, z7.b[0]\n" - "udot z25.s, z17.b, z1.b[1]\n" - "udot z26.s, z17.b, z2.b[1]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "udot z27.s, z17.b, z3.b[1]\n" - "udot z28.s, z17.b, z4.b[1]\n" - "udot z29.s, z17.b, z5.b[1]\n" - "udot z30.s, z17.b, z6.b[1]\n" - "udot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "udot z24.s, z18.b, z0.b[2]\n" - "udot z25.s, z18.b, z1.b[2]\n" - "udot z26.s, z18.b, z2.b[2]\n" - "udot z27.s, z18.b, z3.b[2]\n" - "udot z28.s, z18.b, z4.b[2]\n" - "udot z29.s, z18.b, z5.b[2]\n" - "udot z30.s, z18.b, z6.b[2]\n" - "udot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "udot z24.s, z19.b, z0.b[3]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" - 
"udot z25.s, z19.b, z1.b[3]\n" - "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n" - "udot z26.s, z19.b, z2.b[3]\n" - "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n" - "udot z27.s, z19.b, z3.b[3]\n" - "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n" - "udot z28.s, z19.b, z4.b[3]\n" - "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n" - "udot z29.s, z19.b, z5.b[3]\n" - "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n" - "udot z30.s, z19.b, z6.b[3]\n" - "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n" - "udot z31.s, z19.b, z7.b[3]\n" - "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" - "udot z24.s, z20.b, z0.b[0]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z25.s, z20.b, z1.b[0]\n" - "udot z26.s, z20.b, z2.b[0]\n" - "udot z27.s, z20.b, z3.b[0]\n" - "udot z28.s, z20.b, z4.b[0]\n" - "udot z29.s, z20.b, z5.b[0]\n" - "udot z30.s, z20.b, z6.b[0]\n" - "udot z31.s, z20.b, z7.b[0]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z24.s, z21.b, z0.b[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #5\n" - "udot z25.s, z21.b, z1.b[1]\n" - "udot z26.s, z21.b, z2.b[1]\n" - "udot z27.s, z21.b, z3.b[1]\n" - "udot z28.s, z21.b, z4.b[1]\n" - "udot z29.s, z21.b, z5.b[1]\n" - "udot z30.s, z21.b, z6.b[1]\n" - "udot z31.s, z21.b, z7.b[1]\n" - "udot z24.s, z22.b, z0.b[2]\n" - "udot z25.s, z22.b, z1.b[2]\n" - "udot z26.s, z22.b, z2.b[2]\n" - "udot z27.s, z22.b, z3.b[2]\n" - "udot z28.s, z22.b, z4.b[2]\n" - "udot z29.s, z22.b, z5.b[2]\n" - "udot z30.s, z22.b, z6.b[2]\n" - "udot z31.s, z22.b, z7.b[2]\n" - "udot z24.s, z23.b, z0.b[3]\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n" - "udot z25.s, z23.b, z1.b[3]\n" - "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n" - "udot z26.s, z23.b, z2.b[3]\n" - "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n" - "udot z27.s, z23.b, z3.b[3]\n" - "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n" - "udot z28.s, z23.b, z4.b[3]\n" - "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n" - "udot z29.s, z23.b, z5.b[3]\n" - "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n" - "udot z30.s, z23.b, z6.b[3]\n" - "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n" - "udot z31.s, z23.b, z7.b[3]\n" - "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n" - "udot z24.s, z16.b, z0.b[0]\n" - "udot z25.s, z16.b, z1.b[0]\n" - "udot z26.s, z16.b, z2.b[0]\n" - "udot z27.s, z16.b, z3.b[0]\n" - "udot z28.s, z16.b, z4.b[0]\n" - "udot z29.s, z16.b, z5.b[0]\n" - "udot z30.s, z16.b, z6.b[0]\n" - "udot z31.s, z16.b, z7.b[0]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "udot z25.s, z17.b, z1.b[1]\n" - "udot z26.s, z17.b, z2.b[1]\n" - "udot z27.s, z17.b, z3.b[1]\n" - "udot z28.s, z17.b, z4.b[1]\n" - "udot z29.s, z17.b, z5.b[1]\n" - "udot z30.s, z17.b, z6.b[1]\n" - "udot z31.s, z17.b, z7.b[1]\n" - "udot z24.s, z18.b, z0.b[2]\n" - "udot z25.s, z18.b, z1.b[2]\n" - "udot z26.s, z18.b, z2.b[2]\n" - "udot z27.s, z18.b, z3.b[2]\n" - "udot z28.s, z18.b, z4.b[2]\n" - "udot z29.s, z18.b, z5.b[2]\n" - "udot z30.s, z18.b, z6.b[2]\n" - "udot z31.s, z18.b, z7.b[2]\n" - "udot z24.s, z19.b, z0.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n" - "udot z25.s, z19.b, z1.b[3]\n" - "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n" - "udot z26.s, z19.b, z2.b[3]\n" - "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n" - "udot z27.s, z19.b, z3.b[3]\n" - "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n" - "udot z28.s, z19.b, z4.b[3]\n" - "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n" - "udot z29.s, z19.b, z5.b[3]\n" - "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n" - "udot z30.s, z19.b, z6.b[3]\n" - "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n" - "udot z31.s, z19.b, z7.b[3]\n" - "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n" - "udot z24.s, z20.b, z0.b[0]\n" - "udot z25.s, z20.b, z1.b[0]\n" - "udot z26.s, z20.b, z2.b[0]\n" - "udot z27.s, z20.b, z3.b[0]\n" 
- "udot z28.s, z20.b, z4.b[0]\n" - "udot z29.s, z20.b, z5.b[0]\n" - "udot z30.s, z20.b, z6.b[0]\n" - "udot z31.s, z20.b, z7.b[0]\n" - "cbz %[loops], 2f\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "b.eq 3f\n" - "4:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "subs %[loops], %[loops], #0x1\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" "mov z25.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "udot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "addvl c_ptr1, c_ptr1, #1\n" - "udot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z26.s, z16.b, z2.b[0]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" "udot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "udot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" "udot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" "udot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" "udot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" "udot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "udot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z31.s, z17.b, z7.b[1]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "udot z24.s, z18.b, z0.b[2]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z25.s, z18.b, z1.b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z26.s, z18.b, z2.b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z27.s, z18.b, z3.b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z28.s, z18.b, z4.b[2]\n" "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" @@ -5154,7 +6575,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z20.b, z7.b[0]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "udot z24.s, z21.b, z0.b[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #5\n" "udot z25.s, z21.b, z1.b[1]\n" "udot z26.s, 
z21.b, z2.b[1]\n" "udot z27.s, z21.b, z3.b[1]\n" @@ -5164,6 +6584,7 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z21.b, z7.b[1]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "udot z24.s, z22.b, z0.b[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #6\n" "udot z25.s, z22.b, z1.b[2]\n" "udot z26.s, z22.b, z2.b[2]\n" "udot z27.s, z22.b, z3.b[2]\n" @@ -5171,7 +6592,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z22.b, z5.b[2]\n" "udot z30.s, z22.b, z6.b[2]\n" "udot z31.s, z22.b, z7.b[2]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "udot z24.s, z23.b, z0.b[3]\n" "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n" "udot z25.s, z23.b, z1.b[3]\n" @@ -5189,7 +6609,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z23.b, z7.b[3]\n" "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n" "udot z24.s, z16.b, z0.b[0]\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" "udot z25.s, z16.b, z1.b[0]\n" "udot z26.s, z16.b, z2.b[0]\n" "udot z27.s, z16.b, z3.b[0]\n" @@ -5197,7 +6616,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z16.b, z5.b[0]\n" "udot z30.s, z16.b, z6.b[0]\n" "udot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "udot z24.s, z17.b, z0.b[1]\n" "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" @@ -5206,7 +6624,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "udot z24.s, z18.b, z0.b[2]\n" "udot z25.s, z18.b, z1.b[2]\n" "udot z26.s, z18.b, z2.b[2]\n" @@ -5215,7 +6632,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" "udot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "udot z24.s, z19.b, z0.b[3]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n" "udot z25.s, z19.b, z1.b[3]\n" @@ -5233,7 +6649,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n" "udot z24.s, z20.b, z0.b[0]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "udot z25.s, z20.b, z1.b[0]\n" "udot z26.s, z20.b, z2.b[0]\n" "udot z27.s, z20.b, z3.b[0]\n" @@ -5241,56 +6656,80 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z20.b, z5.b[0]\n" "udot z30.s, z20.b, z6.b[0]\n" "udot z31.s, z20.b, z7.b[0]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "b.ne 4b\n" - "3:\n" + "udot z24.s, z21.b, z0.b[1]\n" + "udot z25.s, z21.b, z1.b[1]\n" + "udot z26.s, z21.b, z2.b[1]\n" + "udot z27.s, z21.b, z3.b[1]\n" + "udot z28.s, z21.b, z4.b[1]\n" + "udot z29.s, z21.b, z5.b[1]\n" + "udot z30.s, z21.b, z6.b[1]\n" + "udot z31.s, z21.b, z7.b[1]\n" + "b.eq 3f\n" + "4:\n" "st1w z24.s, p7, [%[c_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" + "subs %[loops], %[loops], #0x1\n" "mov z24.s, #0\n" - "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p7, [c_ptr1]\n" "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "udot z24.s, z16.b, z0.b[0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm 
PSTL1KEEP, [%[c_ptr0], #0x40]\n" "st1w z26.s, p7, [c_ptr2]\n" - "mov z26.s, #0\n" - "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "ld1rqb z4.b, p7/z, [a_ptr4]\n" "addvl c_ptr2, c_ptr2, #1\n" - "udot z25.s, z16.b, z1.b[0]\n" + "mov z26.s, #0\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" "mov z27.s, #0\n" - "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z26.s, z16.b, z2.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, z0.b[1]\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" "mov z28.s, #0\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "udot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr3, c_ptr3, #1\n" - "udot z25.s, z17.b, z1.b[1]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" "mov z29.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" - "udot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "udot z24.s, z16.b, z0.b[0]\n" "st1w z30.s, p7, [c_ptr6]\n" "mov z30.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" - "udot z29.s, z16.b, z5.b[0]\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "udot z26.s, z16.b, z2.b[0]\n" "st1w z31.s, p7, [c_ptr7]\n" "mov z31.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z17.b, z0.b[1]\n" "addvl c_ptr6, c_ptr6, #1\n" - "udot z30.s, z16.b, z6.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" "addvl c_ptr7, c_ptr7, #1\n" + "udot z30.s, z16.b, z6.b[0]\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z26.s, z17.b, z2.b[1]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z27.s, z17.b, z3.b[1]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" @@ -5331,7 +6770,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z20.b, z7.b[0]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "udot z24.s, z21.b, z0.b[1]\n" - "addvl %[b_ptr0], %[b_ptr0], #5\n" "udot z25.s, z21.b, z1.b[1]\n" "udot z26.s, z21.b, z2.b[1]\n" "udot z27.s, z21.b, z3.b[1]\n" @@ -5339,7 +6777,9 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z21.b, z5.b[1]\n" "udot z30.s, z21.b, z6.b[1]\n" "udot z31.s, z21.b, z7.b[1]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "udot z24.s, z22.b, z0.b[2]\n" + "addvl %[b_ptr0], %[b_ptr0], #6\n" "udot z25.s, z22.b, z1.b[2]\n" "udot z26.s, z22.b, z2.b[2]\n" "udot z27.s, z22.b, z3.b[2]\n" @@ -5380,178 +6820,100 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" "udot z24.s, z18.b, z0.b[2]\n" - "udot z25.s, z18.b, z1.b[2]\n" - "udot z26.s, z18.b, z2.b[2]\n" - "udot z27.s, z18.b, z3.b[2]\n" - "udot z28.s, z18.b, z4.b[2]\n" - "udot z29.s, z18.b, z5.b[2]\n" - "udot z30.s, z18.b, z6.b[2]\n" - "udot z31.s, z18.b, z7.b[2]\n" - "udot z24.s, 
z19.b, z0.b[3]\n" - "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n" - "udot z25.s, z19.b, z1.b[3]\n" - "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n" - "udot z26.s, z19.b, z2.b[3]\n" - "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n" - "udot z27.s, z19.b, z3.b[3]\n" - "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n" - "udot z28.s, z19.b, z4.b[3]\n" - "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n" - "udot z29.s, z19.b, z5.b[3]\n" - "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n" - "udot z30.s, z19.b, z6.b[3]\n" - "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n" - "udot z31.s, z19.b, z7.b[3]\n" - "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n" - "udot z24.s, z20.b, z0.b[0]\n" - "udot z25.s, z20.b, z1.b[0]\n" - "udot z26.s, z20.b, z2.b[0]\n" - "udot z27.s, z20.b, z3.b[0]\n" - "udot z28.s, z20.b, z4.b[0]\n" - "udot z29.s, z20.b, z5.b[0]\n" - "udot z30.s, z20.b, z6.b[0]\n" - "udot z31.s, z20.b, z7.b[0]\n" - "2:\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" - "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" - "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" - "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds) - : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 14: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 
1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" + "udot z25.s, z18.b, z1.b[2]\n" + "udot z26.s, z18.b, z2.b[2]\n" + "udot z27.s, z18.b, z3.b[2]\n" + "udot z28.s, z18.b, z4.b[2]\n" + "udot z29.s, z18.b, z5.b[2]\n" + "udot z30.s, z18.b, z6.b[2]\n" + "udot z31.s, z18.b, z7.b[2]\n" + "udot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n" + "udot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n" + "udot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n" + "udot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n" + "udot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n" + "udot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n" + "udot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n" + "udot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n" + "udot z24.s, z20.b, z0.b[0]\n" + "udot z25.s, z20.b, z1.b[0]\n" + "udot z26.s, z20.b, z2.b[0]\n" + "udot z27.s, z20.b, z3.b[0]\n" + "udot z28.s, z20.b, z4.b[0]\n" + "udot z29.s, z20.b, z5.b[0]\n" + "udot z30.s, z20.b, z6.b[0]\n" + "udot z31.s, z20.b, z7.b[0]\n" + "udot z24.s, z21.b, z0.b[1]\n" + "udot z25.s, z21.b, z1.b[1]\n" + "udot z26.s, z21.b, z2.b[1]\n" + "udot z27.s, z21.b, z3.b[1]\n" + "udot z28.s, z21.b, z4.b[1]\n" + "udot z29.s, z21.b, z5.b[1]\n" + "udot z30.s, z21.b, z6.b[1]\n" + "udot z31.s, z21.b, z7.b[1]\n" + "b.ne 4b\n" + "3:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" "mov z24.s, #0\n" - "ptrue p7.b\n" - "mov z25.s, #0\n" - "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z28.s, #0\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z29.s, #0\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z30.s, #0\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z31.s, #0\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "whilelt p0.s, %[temp], %[last_width]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" - "udot z24.s, z16.b, z0.b[0]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "udot z25.s, z16.b, z1.b[0]\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "udot z26.s, z16.b, z2.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" "udot z27.s, z16.b, z3.b[0]\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, 
z0.b[1]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" "udot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" "udot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "addvl c_ptr7, c_ptr7, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" "udot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "udot z27.s, z17.b, z3.b[1]\n" "udot z28.s, z17.b, z4.b[1]\n" "udot z29.s, z17.b, z5.b[1]\n" @@ -5683,82 +7045,46 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z21.b, z5.b[1]\n" "udot z30.s, z21.b, z6.b[1]\n" "udot z31.s, z21.b, z7.b[1]\n" - "cbz %[loops], 2f\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "b.eq 3f\n" - "4:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" + "b 5f\n" + "2:\n" "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "subs %[loops], %[loops], #0x1\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" "mov z25.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "udot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "addvl c_ptr1, c_ptr1, #1\n" - "udot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z26.s, z16.b, z2.b[0]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" "udot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "udot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" "udot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" "udot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" "udot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" "udot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "udot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z31.s, z17.b, z7.b[1]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "udot z24.s, z18.b, z0.b[2]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z25.s, z18.b, z1.b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z26.s, z18.b, z2.b[2]\n" - 
"prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z27.s, z18.b, z3.b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z28.s, z18.b, z4.b[2]\n" "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" @@ -5808,7 +7134,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z22.b, z5.b[2]\n" "udot z30.s, z22.b, z6.b[2]\n" "udot z31.s, z22.b, z7.b[2]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "udot z24.s, z23.b, z0.b[3]\n" "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n" "udot z25.s, z23.b, z1.b[3]\n" @@ -5826,7 +7151,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z23.b, z7.b[3]\n" "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n" "udot z24.s, z16.b, z0.b[0]\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" "udot z25.s, z16.b, z1.b[0]\n" "udot z26.s, z16.b, z2.b[0]\n" "udot z27.s, z16.b, z3.b[0]\n" @@ -5834,7 +7158,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z16.b, z5.b[0]\n" "udot z30.s, z16.b, z6.b[0]\n" "udot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "udot z24.s, z17.b, z0.b[1]\n" "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" @@ -5843,7 +7166,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "udot z24.s, z18.b, z0.b[2]\n" "udot z25.s, z18.b, z1.b[2]\n" "udot z26.s, z18.b, z2.b[2]\n" @@ -5852,7 +7174,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" "udot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "udot z24.s, z19.b, z0.b[3]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n" "udot z25.s, z19.b, z1.b[3]\n" @@ -5870,70 +7191,156 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n" "udot z24.s, z20.b, z0.b[0]\n" + "udot z25.s, z20.b, z1.b[0]\n" + "udot z26.s, z20.b, z2.b[0]\n" + "udot z27.s, z20.b, z3.b[0]\n" + "udot z28.s, z20.b, z4.b[0]\n" + "udot z29.s, z20.b, z5.b[0]\n" + "udot z30.s, z20.b, z6.b[0]\n" + "udot z31.s, z20.b, z7.b[0]\n" + "udot z24.s, z21.b, z0.b[1]\n" + "udot z25.s, z21.b, z1.b[1]\n" + "udot z26.s, z21.b, z2.b[1]\n" + "udot z27.s, z21.b, z3.b[1]\n" + "udot z28.s, z21.b, z4.b[1]\n" + "udot z29.s, z21.b, z5.b[1]\n" + "udot z30.s, z21.b, z6.b[1]\n" + "udot z31.s, z21.b, z7.b[1]\n" + "5:\n" + "st1w z24.s, p0, [%[c_ptr0]]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p0, [c_ptr1]\n" + "st1w z26.s, p0, [c_ptr2]\n" + "st1w z27.s, p0, [c_ptr3]\n" + "st1w z28.s, p0, [c_ptr4]\n" + "st1w z29.s, p0, [c_ptr5]\n" + "st1w z30.s, p0, [c_ptr6]\n" + "st1w z31.s, p0, [c_ptr7]\n" + ".unreq a_ptr1\n" + ".unreq a_ptr2\n" + ".unreq a_ptr3\n" + ".unreq a_ptr4\n" + ".unreq a_ptr5\n" + ".unreq a_ptr6\n" + ".unreq a_ptr7\n" + ".unreq c_ptr1\n" + ".unreq c_ptr2\n" + ".unreq c_ptr3\n" + ".unreq c_ptr4\n" + ".unreq c_ptr5\n" + ".unreq c_ptr6\n" + ".unreq c_ptr7\n" + : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds) + : [lda] "r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", 
"z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" + ); + break; + case 15: + __asm __volatile ( + "a_ptr1 .req X0\n" + "a_ptr2 .req X1\n" + "a_ptr3 .req X2\n" + "a_ptr4 .req X3\n" + "a_ptr5 .req X4\n" + "a_ptr6 .req X5\n" + "a_ptr7 .req X6\n" + "c_ptr1 .req X7\n" + "c_ptr2 .req X8\n" + "c_ptr3 .req X9\n" + "c_ptr4 .req X10\n" + "c_ptr5 .req X11\n" + "c_ptr6 .req X12\n" + "c_ptr7 .req X13\n" + "add a_ptr1, %[a_ptr0], %[lda]\n" + "add c_ptr1, %[c_ptr0], %[ldc]\n" + "add a_ptr2, a_ptr1, %[lda]\n" + "add c_ptr2, c_ptr1, %[ldc]\n" + "add a_ptr3, a_ptr2, %[lda]\n" + "add c_ptr3, c_ptr2, %[ldc]\n" + "add a_ptr4, a_ptr3, %[lda]\n" + "add c_ptr4, c_ptr3, %[ldc]\n" + "add a_ptr5, a_ptr4, %[lda]\n" + "add c_ptr5, c_ptr4, %[ldc]\n" + "add a_ptr6, a_ptr5, %[lda]\n" + "add c_ptr6, c_ptr5, %[ldc]\n" + "add a_ptr7, a_ptr6, %[lda]\n" + "add c_ptr7, c_ptr6, %[ldc]\n" + "cbz %[oob_rows], 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr7, %[c_ptr0], #0x0\n" + "add a_ptr7, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr6, %[c_ptr0], #0x0\n" + "add a_ptr6, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr5, %[c_ptr0], #0x0\n" + "add a_ptr5, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr4, %[c_ptr0], #0x0\n" + "add a_ptr4, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr3, %[c_ptr0], #0x0\n" + "add a_ptr3, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr2, %[c_ptr0], #0x0\n" + "add a_ptr2, %[a_ptr0], #0x0\n" + "b.eq 1f\n" + "subs %[oob_rows], %[oob_rows], #0x1\n" + "add c_ptr1, %[c_ptr0], #0x0\n" + "add a_ptr1, %[a_ptr0], #0x0\n" + "1:\n" + "ptrue p7.b\n" + "whilelt p6.b, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "udot z25.s, z20.b, z1.b[0]\n" - "udot z26.s, z20.b, z2.b[0]\n" - "udot z27.s, z20.b, z3.b[0]\n" - "udot z28.s, z20.b, z4.b[0]\n" - "udot z29.s, z20.b, z5.b[0]\n" - "udot z30.s, z20.b, z6.b[0]\n" - "udot z31.s, z20.b, z7.b[0]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "udot z24.s, z21.b, z0.b[1]\n" - "udot z25.s, z21.b, z1.b[1]\n" - "udot z26.s, z21.b, z2.b[1]\n" - "udot z27.s, z21.b, z3.b[1]\n" - "udot z28.s, z21.b, z4.b[1]\n" - "udot z29.s, z21.b, z5.b[1]\n" - "udot z30.s, z21.b, z6.b[1]\n" - "udot z31.s, z21.b, z7.b[1]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" - "b.ne 4b\n" - "3:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "udot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" 
"mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "udot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z26.s, z16.b, z2.b[0]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" "udot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr3, c_ptr3, #1\n" - "udot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" "udot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" "udot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr6, c_ptr6, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" "udot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" "udot z27.s, z17.b, z3.b[1]\n" "udot z28.s, z17.b, z4.b[1]\n" @@ -5986,7 +7393,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z21.b, z7.b[1]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "udot z24.s, z22.b, z0.b[2]\n" - "addvl %[b_ptr0], %[b_ptr0], #6\n" "udot z25.s, z22.b, z1.b[2]\n" "udot z26.s, z22.b, z2.b[2]\n" "udot z27.s, z22.b, z3.b[2]\n" @@ -5994,6 +7400,7 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z22.b, z5.b[2]\n" "udot z30.s, z22.b, z6.b[2]\n" "udot z31.s, z22.b, z7.b[2]\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "udot z24.s, z23.b, z0.b[3]\n" "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n" "udot z25.s, z23.b, z1.b[3]\n" @@ -6011,6 +7418,7 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z23.b, z7.b[3]\n" "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n" "udot z24.s, z16.b, z0.b[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #7\n" "udot z25.s, z16.b, z1.b[0]\n" "udot z26.s, z16.b, z2.b[0]\n" "udot z27.s, z16.b, z3.b[0]\n" @@ -6066,149 +7474,80 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z21.b, z5.b[1]\n" "udot z30.s, z21.b, z6.b[1]\n" "udot z31.s, z21.b, z7.b[1]\n" - "2:\n" - "st1w z24.s, p0, [%[c_ptr0]]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" - "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" - "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" - "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" - "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" - "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" - ".unreq a_ptr1\n" - ".unreq a_ptr2\n" - ".unreq a_ptr3\n" - ".unreq a_ptr4\n" - ".unreq a_ptr5\n" - ".unreq a_ptr6\n" - ".unreq a_ptr7\n" - ".unreq c_ptr1\n" - ".unreq c_ptr2\n" - ".unreq c_ptr3\n" - ".unreq c_ptr4\n" - ".unreq c_ptr5\n" - ".unreq c_ptr6\n" - ".unreq c_ptr7\n" - : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [temp] "+r" (temp), [odds] "+r" (odds) - : [lda] 
"r" (ldab), [ldc] "r" (ldcb), [odd_depth] "r" (odd_depth), [last_width] "r" (last_width) - : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" - ); - break; - case 15: - __asm __volatile ( - "a_ptr1 .req X0\n" - "a_ptr2 .req X1\n" - "a_ptr3 .req X2\n" - "a_ptr4 .req X3\n" - "a_ptr5 .req X4\n" - "a_ptr6 .req X5\n" - "a_ptr7 .req X6\n" - "c_ptr1 .req X7\n" - "c_ptr2 .req X8\n" - "c_ptr3 .req X9\n" - "c_ptr4 .req X10\n" - "c_ptr5 .req X11\n" - "c_ptr6 .req X12\n" - "c_ptr7 .req X13\n" - "add a_ptr1, %[a_ptr0], %[lda]\n" - "add c_ptr1, %[c_ptr0], %[ldc]\n" - "add a_ptr2, a_ptr1, %[lda]\n" - "add c_ptr2, c_ptr1, %[ldc]\n" - "add a_ptr3, a_ptr2, %[lda]\n" - "add c_ptr3, c_ptr2, %[ldc]\n" - "add a_ptr4, a_ptr3, %[lda]\n" - "add c_ptr4, c_ptr3, %[ldc]\n" - "add a_ptr5, a_ptr4, %[lda]\n" - "add c_ptr5, c_ptr4, %[ldc]\n" - "add a_ptr6, a_ptr5, %[lda]\n" - "add c_ptr6, c_ptr5, %[ldc]\n" - "add a_ptr7, a_ptr6, %[lda]\n" - "add c_ptr7, c_ptr6, %[ldc]\n" - "cbz %[oob_rows], 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr7, %[c_ptr0], #0x0\n" - "add a_ptr7, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr6, %[c_ptr0], #0x0\n" - "add a_ptr6, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr5, %[c_ptr0], #0x0\n" - "add a_ptr5, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr4, %[c_ptr0], #0x0\n" - "add a_ptr4, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr3, %[c_ptr0], #0x0\n" - "add a_ptr3, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr2, %[c_ptr0], #0x0\n" - "add a_ptr2, %[a_ptr0], #0x0\n" - "b.eq 1f\n" - "subs %[oob_rows], %[oob_rows], #0x1\n" - "add c_ptr1, %[c_ptr0], #0x0\n" - "add a_ptr1, %[a_ptr0], #0x0\n" - "1:\n" + "udot z24.s, z22.b, z0.b[2]\n" + "udot z25.s, z22.b, z1.b[2]\n" + "udot z26.s, z22.b, z2.b[2]\n" + "udot z27.s, z22.b, z3.b[2]\n" + "udot z28.s, z22.b, z4.b[2]\n" + "udot z29.s, z22.b, z5.b[2]\n" + "udot z30.s, z22.b, z6.b[2]\n" + "udot z31.s, z22.b, z7.b[2]\n" + "b.eq 3f\n" + "4:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" "mov z24.s, #0\n" - "ptrue p7.b\n" - "mov z25.s, #0\n" - "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z28.s, #0\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z29.s, #0\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z30.s, #0\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z31.s, #0\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "whilelt p0.s, %[temp], %[last_width]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "st1w z28.s, p7, [c_ptr4]\n" + 
"addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" - "udot z24.s, z16.b, z0.b[0]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "udot z25.s, z16.b, z1.b[0]\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "udot z26.s, z16.b, z2.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" "udot z27.s, z16.b, z3.b[0]\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" "udot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" "udot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "addvl c_ptr7, c_ptr7, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "udot z25.s, z17.b, z1.b[1]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z26.s, z17.b, z2.b[1]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z27.s, z17.b, z3.b[1]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" @@ -6347,82 +7686,71 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z22.b, z5.b[2]\n" "udot z30.s, z22.b, z6.b[2]\n" "udot z31.s, z22.b, z7.b[2]\n" - "cbz %[loops], 2f\n" + "b.ne 4b\n" + "3:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "mov z24.s, #0\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "b.eq 3f\n" - "4:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" "addvl %[b_ptr0], %[b_ptr0], #8\n" - "mov z24.s, #0\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "subs %[loops], %[loops], #0x1\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "mov z25.s, #0\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" "udot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" - "mov z26.s, #0\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "addvl c_ptr1, c_ptr1, #1\n" "udot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" - "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" "udot z26.s, 
z16.b, z2.b[0]\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" "udot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "udot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" "udot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" "udot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" + "addvl c_ptr7, c_ptr7, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" "udot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" "udot z27.s, z17.b, z3.b[1]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "udot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z31.s, z17.b, z7.b[1]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "udot z24.s, z18.b, z0.b[2]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z25.s, z18.b, z1.b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z26.s, z18.b, z2.b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z27.s, z18.b, z3.b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z28.s, z18.b, z4.b[2]\n" "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" @@ -6493,12 +7821,10 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z25.s, z16.b, z1.b[0]\n" "udot z26.s, z16.b, z2.b[0]\n" "udot z27.s, z16.b, z3.b[0]\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" "udot z28.s, z16.b, z4.b[0]\n" "udot z29.s, z16.b, z5.b[0]\n" "udot z30.s, z16.b, z6.b[0]\n" "udot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "udot z24.s, z17.b, z0.b[1]\n" "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" @@ -6507,7 +7833,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "udot z24.s, z18.b, z0.b[2]\n" "udot z25.s, z18.b, z1.b[2]\n" "udot z26.s, z18.b, z2.b[2]\n" @@ -6516,7 +7841,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" "udot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "udot z24.s, z19.b, z0.b[3]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n" "udot z25.s, z19.b, z1.b[3]\n" @@ -6534,7 +7858,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n" "udot z24.s, z20.b, z0.b[0]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "udot z25.s, z20.b, z1.b[0]\n" "udot z26.s, z20.b, z2.b[0]\n" "udot z27.s, z20.b, z3.b[0]\n" @@ -6542,7 +7865,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z20.b, z5.b[0]\n" "udot z30.s, z20.b, z6.b[0]\n" "udot z31.s, z20.b, z7.b[0]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "udot z24.s, 
z21.b, z0.b[1]\n" "udot z25.s, z21.b, z1.b[1]\n" "udot z26.s, z21.b, z2.b[1]\n" @@ -6551,7 +7873,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z21.b, z5.b[1]\n" "udot z30.s, z21.b, z6.b[1]\n" "udot z31.s, z21.b, z7.b[1]\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "udot z24.s, z22.b, z0.b[2]\n" "udot z25.s, z22.b, z1.b[2]\n" "udot z26.s, z22.b, z2.b[2]\n" @@ -6560,53 +7881,35 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z22.b, z5.b[2]\n" "udot z30.s, z22.b, z6.b[2]\n" "udot z31.s, z22.b, z7.b[2]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "b.ne 4b\n" - "3:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" + "b 5f\n" + "2:\n" "mov z24.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" - "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "udot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "udot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z26.s, z16.b, z2.b[0]\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" "udot z27.s, z16.b, z3.b[0]\n" - "addvl c_ptr3, c_ptr3, #1\n" - "udot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" "udot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" "udot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr6, c_ptr6, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr7, c_ptr7, #1\n" "udot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" "udot z27.s, z17.b, z3.b[1]\n" "udot z28.s, z17.b, z4.b[1]\n" @@ -6748,23 +8051,16 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z22.b, z5.b[2]\n" "udot z30.s, z22.b, z6.b[2]\n" "udot z31.s, z22.b, z7.b[2]\n" - "2:\n" + "5:\n" "st1w z24.s, p0, [%[c_ptr0]]\n" "addvl %[c_ptr0], %[c_ptr0], #1\n" "st1w z25.s, p0, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "st1w z26.s, p0, [c_ptr2]\n" - "addvl c_ptr2, c_ptr2, #1\n" "st1w z27.s, p0, [c_ptr3]\n" - "addvl c_ptr3, c_ptr3, #1\n" "st1w z28.s, p0, [c_ptr4]\n" - "addvl c_ptr4, c_ptr4, #1\n" "st1w z29.s, p0, [c_ptr5]\n" - "addvl c_ptr5, c_ptr5, #1\n" "st1w z30.s, p0, [c_ptr6]\n" - "addvl c_ptr6, c_ptr6, #1\n" "st1w z31.s, p0, [c_ptr7]\n" - "addvl c_ptr7, c_ptr7, #1\n" ".unreq a_ptr1\n" ".unreq a_ptr2\n" ".unreq a_ptr3\n" @@ -6844,54 +8140,269 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "add c_ptr1, %[c_ptr0], #0x0\n" "add a_ptr1, %[a_ptr0], #0x0\n" "1:\n" + "ptrue p7.b\n" + "whilelt p6.b, %[temp], %[odd_depth]\n" + "whilelt p0.s, %[temp], %[last_width]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + 
"prfm PLDL1KEEP, [a_ptr7, #0x80]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "cbz %[loops], 2f\n" + "mov z24.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" + "mov z25.s, #0\n" + "ld1rqb z1.b, p7/z, [a_ptr1]\n" + "mov z26.s, #0\n" + "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "mov z27.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" + "ld1rqb z4.b, p7/z, [a_ptr4]\n" + "mov z29.s, #0\n" + "ld1rqb z5.b, p7/z, [a_ptr5]\n" + "mov z30.s, #0\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" + "mov z31.s, #0\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "subs %[loops], %[loops], #0x1\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" + "udot z30.s, z16.b, z6.b[0]\n" + "udot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "udot z26.s, z17.b, z2.b[1]\n" + "udot z27.s, z17.b, z3.b[1]\n" + "udot z28.s, z17.b, z4.b[1]\n" + "udot z29.s, z17.b, z5.b[1]\n" + "udot z30.s, z17.b, z6.b[1]\n" + "udot z31.s, z17.b, z7.b[1]\n" + "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "udot z24.s, z18.b, z0.b[2]\n" + "udot z25.s, z18.b, z1.b[2]\n" + "udot z26.s, z18.b, z2.b[2]\n" + "udot z27.s, z18.b, z3.b[2]\n" + "udot z28.s, z18.b, z4.b[2]\n" + "udot z29.s, z18.b, z5.b[2]\n" + "udot z30.s, z18.b, z6.b[2]\n" + "udot z31.s, z18.b, z7.b[2]\n" + "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" + "udot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n" + "udot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n" + "udot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n" + "udot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n" + "udot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n" + "udot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n" + "udot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n" + "udot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n" + "udot z24.s, z20.b, z0.b[0]\n" + "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "udot z25.s, z20.b, z1.b[0]\n" + "udot z26.s, z20.b, z2.b[0]\n" + "udot z27.s, z20.b, z3.b[0]\n" + "udot z28.s, z20.b, z4.b[0]\n" + "udot z29.s, z20.b, z5.b[0]\n" + "udot z30.s, z20.b, z6.b[0]\n" + "udot z31.s, z20.b, z7.b[0]\n" + "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" + "udot z24.s, z21.b, z0.b[1]\n" + "udot z25.s, z21.b, z1.b[1]\n" + "udot z26.s, z21.b, z2.b[1]\n" + "udot z27.s, z21.b, z3.b[1]\n" + "udot z28.s, z21.b, z4.b[1]\n" + "udot z29.s, z21.b, z5.b[1]\n" + "udot z30.s, z21.b, z6.b[1]\n" + "udot z31.s, z21.b, z7.b[1]\n" + "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "udot z24.s, z22.b, z0.b[2]\n" + "udot z25.s, z22.b, z1.b[2]\n" + "udot z26.s, z22.b, z2.b[2]\n" + "udot z27.s, z22.b, z3.b[2]\n" + "udot z28.s, z22.b, z4.b[2]\n" + "udot z29.s, z22.b, z5.b[2]\n" + "udot z30.s, z22.b, z6.b[2]\n" + "udot z31.s, z22.b, z7.b[2]\n" + "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL 
VL]\n" + "udot z24.s, z23.b, z0.b[3]\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n" + "udot z25.s, z23.b, z1.b[3]\n" + "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n" + "udot z26.s, z23.b, z2.b[3]\n" + "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n" + "udot z27.s, z23.b, z3.b[3]\n" + "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n" + "udot z28.s, z23.b, z4.b[3]\n" + "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n" + "udot z29.s, z23.b, z5.b[3]\n" + "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n" + "udot z30.s, z23.b, z6.b[3]\n" + "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n" + "udot z31.s, z23.b, z7.b[3]\n" + "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" + "udot z28.s, z16.b, z4.b[0]\n" + "udot z29.s, z16.b, z5.b[0]\n" + "udot z30.s, z16.b, z6.b[0]\n" + "udot z31.s, z16.b, z7.b[0]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" + "udot z26.s, z17.b, z2.b[1]\n" + "udot z27.s, z17.b, z3.b[1]\n" + "udot z28.s, z17.b, z4.b[1]\n" + "udot z29.s, z17.b, z5.b[1]\n" + "udot z30.s, z17.b, z6.b[1]\n" + "udot z31.s, z17.b, z7.b[1]\n" + "udot z24.s, z18.b, z0.b[2]\n" + "udot z25.s, z18.b, z1.b[2]\n" + "udot z26.s, z18.b, z2.b[2]\n" + "udot z27.s, z18.b, z3.b[2]\n" + "udot z28.s, z18.b, z4.b[2]\n" + "udot z29.s, z18.b, z5.b[2]\n" + "udot z30.s, z18.b, z6.b[2]\n" + "udot z31.s, z18.b, z7.b[2]\n" + "udot z24.s, z19.b, z0.b[3]\n" + "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n" + "udot z25.s, z19.b, z1.b[3]\n" + "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n" + "udot z26.s, z19.b, z2.b[3]\n" + "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n" + "udot z27.s, z19.b, z3.b[3]\n" + "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n" + "udot z28.s, z19.b, z4.b[3]\n" + "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n" + "udot z29.s, z19.b, z5.b[3]\n" + "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n" + "udot z30.s, z19.b, z6.b[3]\n" + "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n" + "udot z31.s, z19.b, z7.b[3]\n" + "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n" + "udot z24.s, z20.b, z0.b[0]\n" + "udot z25.s, z20.b, z1.b[0]\n" + "udot z26.s, z20.b, z2.b[0]\n" + "udot z27.s, z20.b, z3.b[0]\n" + "udot z28.s, z20.b, z4.b[0]\n" + "udot z29.s, z20.b, z5.b[0]\n" + "udot z30.s, z20.b, z6.b[0]\n" + "udot z31.s, z20.b, z7.b[0]\n" + "udot z24.s, z21.b, z0.b[1]\n" + "udot z25.s, z21.b, z1.b[1]\n" + "udot z26.s, z21.b, z2.b[1]\n" + "udot z27.s, z21.b, z3.b[1]\n" + "udot z28.s, z21.b, z4.b[1]\n" + "udot z29.s, z21.b, z5.b[1]\n" + "udot z30.s, z21.b, z6.b[1]\n" + "udot z31.s, z21.b, z7.b[1]\n" + "udot z24.s, z22.b, z0.b[2]\n" + "udot z25.s, z22.b, z1.b[2]\n" + "udot z26.s, z22.b, z2.b[2]\n" + "udot z27.s, z22.b, z3.b[2]\n" + "udot z28.s, z22.b, z4.b[2]\n" + "udot z29.s, z22.b, z5.b[2]\n" + "udot z30.s, z22.b, z6.b[2]\n" + "udot z31.s, z22.b, z7.b[2]\n" + "udot z24.s, z23.b, z0.b[3]\n" + "udot z25.s, z23.b, z1.b[3]\n" + "udot z26.s, z23.b, z2.b[3]\n" + "udot z27.s, z23.b, z3.b[3]\n" + "udot z28.s, z23.b, z4.b[3]\n" + "udot z29.s, z23.b, z5.b[3]\n" + "udot z30.s, z23.b, z6.b[3]\n" + "udot z31.s, z23.b, z7.b[3]\n" + "b.eq 3f\n" + "4:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "subs %[loops], %[loops], #0x1\n" "mov z24.s, #0\n" - "ptrue p7.b\n" - "mov z25.s, #0\n" - "whilelt p6.b, %[temp], %[odd_depth]\n" - "mov z26.s, #0\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "mov z27.s, #0\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" - "mov z28.s, #0\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, 
#1\n" + "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" - "mov z29.s, #0\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" - "mov z30.s, #0\n" + "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" - "mov z31.s, #0\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "whilelt p0.s, %[temp], %[last_width]\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x40]\n" + "addvl %[b_ptr0], %[b_ptr0], #8\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x80]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n" - "udot z24.s, z16.b, z0.b[0]\n" + "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x100]\n" "udot z25.s, z16.b, z1.b[0]\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "udot z26.s, z16.b, z2.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" "udot z27.s, z16.b, z3.b[0]\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" "udot z28.s, z16.b, z4.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x140]\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" "udot z29.s, z16.b, z5.b[0]\n" - "prfm PLDL1KEEP, [a_ptr7, #0x180]\n" + "addvl c_ptr7, c_ptr7, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" + "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z31.s, z16.b, z7.b[0]\n" + "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "udot z25.s, z17.b, z1.b[1]\n" + "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z26.s, z17.b, z2.b[1]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z27.s, z17.b, z3.b[1]\n" + "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z28.s, z17.b, z4.b[1]\n" + "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" @@ -7039,83 +8550,72 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z23.b, z5.b[3]\n" "udot z30.s, z23.b, z6.b[3]\n" "udot z31.s, z23.b, z7.b[3]\n" - "cbz %[loops], 2f\n" + "b.ne 4b\n" + "3:\n" + "st1w z24.s, p7, [%[c_ptr0]]\n" + "addvl %[c_ptr0], %[c_ptr0], #1\n" + "mov z24.s, #0\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" + "st1w z25.s, p7, [c_ptr1]\n" + "addvl c_ptr1, c_ptr1, #1\n" + "mov z25.s, #0\n" "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" + "st1w z26.s, p7, [c_ptr2]\n" + "addvl c_ptr2, c_ptr2, #1\n" + "mov z26.s, #0\n" "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" + "st1w z27.s, p7, [c_ptr3]\n" + "addvl c_ptr3, c_ptr3, #1\n" + "mov z27.s, #0\n" "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" - "b.eq 3f\n" - "4:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "subs %[loops], %[loops], #0x1\n" - "mov z24.s, #0\n" "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" - "ld1rqb z0.b, p7/z, 
[%[a_ptr0]]\n" "addvl %[b_ptr0], %[b_ptr0], #8\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" - "mov z25.s, #0\n" + "st1w z28.s, p7, [c_ptr4]\n" + "addvl c_ptr4, c_ptr4, #1\n" + "mov z28.s, #0\n" + "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "udot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" - "mov z26.s, #0\n" + "st1w z29.s, p7, [c_ptr5]\n" + "addvl c_ptr5, c_ptr5, #1\n" + "mov z29.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" + "udot z24.s, z16.b, z0.b[0]\n" + "st1w z30.s, p7, [c_ptr6]\n" + "mov z30.s, #0\n" "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "addvl c_ptr1, c_ptr1, #1\n" "udot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" - "mov z27.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" "udot z26.s, z16.b, z2.b[0]\n" + "st1w z31.s, p7, [c_ptr7]\n" + "mov z31.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" - "ld1rqb z6.b, p7/z, [a_ptr6]\n" "udot z27.s, z16.b, z3.b[0]\n" - "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "udot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr2, c_ptr2, #1\n" + "ld1rqb z6.b, p7/z, [a_ptr6]\n" "udot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" + "ld1rqb z7.b, p7/z, [a_ptr7]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "addvl c_ptr6, c_ptr6, #1\n" "udot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" + "addvl c_ptr7, c_ptr7, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr5, c_ptr5, #1\n" "udot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr6, c_ptr6, #1\n" "udot z27.s, z17.b, z3.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" "udot z28.s, z17.b, z4.b[1]\n" - "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n" "udot z29.s, z17.b, z5.b[1]\n" - "prfm PSTL1KEEP, [c_ptr1, #0x40]\n" "udot z30.s, z17.b, z6.b[1]\n" - "prfm PSTL1KEEP, [c_ptr2, #0x40]\n" "udot z31.s, z17.b, z7.b[1]\n" "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "udot z24.s, z18.b, z0.b[2]\n" - "prfm PSTL1KEEP, [c_ptr3, #0x40]\n" "udot z25.s, z18.b, z1.b[2]\n" - "prfm PSTL1KEEP, [c_ptr4, #0x40]\n" "udot z26.s, z18.b, z2.b[2]\n" - "prfm PSTL1KEEP, [c_ptr5, #0x40]\n" "udot z27.s, z18.b, z3.b[2]\n" - "prfm PSTL1KEEP, [c_ptr6, #0x40]\n" "udot z28.s, z18.b, z4.b[2]\n" - "prfm PSTL1KEEP, [c_ptr7, #0x40]\n" "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" "udot z31.s, z18.b, z7.b[2]\n" @@ -7190,7 +8690,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z16.b, z5.b[0]\n" "udot z30.s, z16.b, z6.b[0]\n" "udot z31.s, z16.b, z7.b[0]\n" - "ld1b z16.b, p7/z, [%[b_ptr0]]\n" "udot z24.s, z17.b, z0.b[1]\n" "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" @@ -7199,7 +8698,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z17.b, z5.b[1]\n" "udot z30.s, z17.b, z6.b[1]\n" "udot z31.s, z17.b, z7.b[1]\n" - "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n" "udot z24.s, z18.b, z0.b[2]\n" "udot z25.s, z18.b, z1.b[2]\n" "udot z26.s, z18.b, z2.b[2]\n" @@ -7208,7 +8706,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z18.b, z5.b[2]\n" "udot z30.s, z18.b, z6.b[2]\n" "udot z31.s, z18.b, z7.b[2]\n" - "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n" "udot z24.s, z19.b, z0.b[3]\n" "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n" "udot z25.s, z19.b, 
z1.b[3]\n" @@ -7226,7 +8723,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z31.s, z19.b, z7.b[3]\n" "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n" "udot z24.s, z20.b, z0.b[0]\n" - "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n" "udot z25.s, z20.b, z1.b[0]\n" "udot z26.s, z20.b, z2.b[0]\n" "udot z27.s, z20.b, z3.b[0]\n" @@ -7234,7 +8730,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z20.b, z5.b[0]\n" "udot z30.s, z20.b, z6.b[0]\n" "udot z31.s, z20.b, z7.b[0]\n" - "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n" "udot z24.s, z21.b, z0.b[1]\n" "udot z25.s, z21.b, z1.b[1]\n" "udot z26.s, z21.b, z2.b[1]\n" @@ -7243,7 +8738,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z21.b, z5.b[1]\n" "udot z30.s, z21.b, z6.b[1]\n" "udot z31.s, z21.b, z7.b[1]\n" - "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n" "udot z24.s, z22.b, z0.b[2]\n" "udot z25.s, z22.b, z1.b[2]\n" "udot z26.s, z22.b, z2.b[2]\n" @@ -7252,7 +8746,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z22.b, z5.b[2]\n" "udot z30.s, z22.b, z6.b[2]\n" "udot z31.s, z22.b, z7.b[2]\n" - "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n" "udot z24.s, z23.b, z0.b[3]\n" "udot z25.s, z23.b, z1.b[3]\n" "udot z26.s, z23.b, z2.b[3]\n" @@ -7261,54 +8754,36 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z23.b, z5.b[3]\n" "udot z30.s, z23.b, z6.b[3]\n" "udot z31.s, z23.b, z7.b[3]\n" - "b.ne 4b\n" - "3:\n" - "st1w z24.s, p7, [%[c_ptr0]]\n" - "addvl %[c_ptr0], %[c_ptr0], #1\n" + "b 5f\n" + "2:\n" "mov z24.s, #0\n" - "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n" "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n" - "addvl %[b_ptr0], %[b_ptr0], #8\n" - "st1w z25.s, p7, [c_ptr1]\n" - "addvl c_ptr1, c_ptr1, #1\n" "mov z25.s, #0\n" "ld1rqb z1.b, p7/z, [a_ptr1]\n" - "udot z24.s, z16.b, z0.b[0]\n" - "st1w z26.s, p7, [c_ptr2]\n" "mov z26.s, #0\n" "ld1rqb z2.b, p7/z, [a_ptr2]\n" - "ld1rqb z3.b, p7/z, [a_ptr3]\n" - "addvl c_ptr2, c_ptr2, #1\n" - "udot z25.s, z16.b, z1.b[0]\n" - "st1w z27.s, p7, [c_ptr3]\n" "mov z27.s, #0\n" + "ld1rqb z3.b, p7/z, [a_ptr3]\n" + "mov z28.s, #0\n" "ld1rqb z4.b, p7/z, [a_ptr4]\n" - "udot z26.s, z16.b, z2.b[0]\n" + "mov z29.s, #0\n" "ld1rqb z5.b, p7/z, [a_ptr5]\n" - "udot z24.s, z17.b, z0.b[1]\n" - "st1w z28.s, p7, [c_ptr4]\n" - "mov z28.s, #0\n" + "mov z30.s, #0\n" "ld1rqb z6.b, p7/z, [a_ptr6]\n" - "udot z27.s, z16.b, z3.b[0]\n" + "mov z31.s, #0\n" "ld1rqb z7.b, p7/z, [a_ptr7]\n" - "udot z25.s, z17.b, z1.b[1]\n" - "st1w z29.s, p7, [c_ptr5]\n" - "mov z29.s, #0\n" - "addvl c_ptr3, c_ptr3, #1\n" + "udot z24.s, z16.b, z0.b[0]\n" + "udot z25.s, z16.b, z1.b[0]\n" + "udot z26.s, z16.b, z2.b[0]\n" + "udot z27.s, z16.b, z3.b[0]\n" "udot z28.s, z16.b, z4.b[0]\n" - "st1w z30.s, p7, [c_ptr6]\n" - "mov z30.s, #0\n" - "addvl c_ptr4, c_ptr4, #1\n" "udot z29.s, z16.b, z5.b[0]\n" - "st1w z31.s, p7, [c_ptr7]\n" - "mov z31.s, #0\n" - "addvl c_ptr5, c_ptr5, #1\n" "udot z30.s, z16.b, z6.b[0]\n" - "addvl c_ptr6, c_ptr6, #1\n" "udot z31.s, z16.b, z7.b[0]\n" "ld1b z16.b, p7/z, [%[b_ptr0]]\n" + "udot z24.s, z17.b, z0.b[1]\n" + "udot z25.s, z17.b, z1.b[1]\n" "udot z26.s, z17.b, z2.b[1]\n" - "addvl c_ptr7, c_ptr7, #1\n" "udot z27.s, z17.b, z3.b[1]\n" "udot z28.s, z17.b, z4.b[1]\n" "udot z29.s, z17.b, z5.b[1]\n" @@ -7458,23 +8933,16 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t "udot z29.s, z23.b, z5.b[3]\n" "udot z30.s, z23.b, 
z6.b[3]\n"
"udot z31.s, z23.b, z7.b[3]\n"
- "2:\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
index bea455ca67..8fdd2c920d 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
@@ -22,6 +22,9 @@
 * SOFTWARE.
 */
 #pragma once
+#if (defined(__GNUC__) && (__GNUC__ >= 7))
+#pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
+#endif
 #ifdef __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
index eec842d09f..fdb4f584d8 100644
--- a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
+++ b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
@@ -110,7 +110,7 @@ class QuantizeWrapper : public GemmCommon<To, Tr> {
 QuantizeWrapper operator=(const QuantizeWrapper &) = delete;
 QuantizeWrapper(const GemmArgs &args, const Requantize32 &qp) : _params(qp), _args(args), _barrier(args._maxthreads) {
- GemmArgs newargs = GemmArgs(args._ci, args._Msize, args._Nsize, args._Ksize, args._nbatches, args._nmulti, Activation(), args._maxthreads, nullptr);
+ GemmArgs newargs = GemmArgs(args._ci, args._Msize, args._Nsize, args._Ksize, args._Ksections, args._nbatches, args._nmulti, args._indirect_input, Activation(), args._maxthreads);
 _subgemm = gemm<To, Tgemm>(newargs);
 if (_subgemm == nullptr) {
diff --git a/src/core/NEON/kernels/arm_gemm/quantized.cpp b/src/core/NEON/kernels/arm_gemm/quantized.cpp
index e50dca7f1f..111d01ed3a 100644
--- a/src/core/NEON/kernels/arm_gemm/quantized.cpp
+++ b/src/core/NEON/kernels/arm_gemm/quantized.cpp
@@ -55,15 +55,16 @@ namespace {
 * column is set up in any case (and it is hoped that the compiler can elide
 * the needless movs in the per-layer case). */
-template<bool do_shift_correction, bool per_channel>
+template<bool do_shift_correction, bool per_channel, bool do_left_shift>
 void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigned int height,
 const int32_t *input, unsigned int in_stride, int8_t *output, unsigned int out_stride,
 const int32_t *row_bias, const int32_t *col_bias, const unsigned int start_col) {
- const int32x4_t v_mul = vdupq_n_s32(qp.per_layer_mul);
- const int32x4_t v_shift = vdupq_n_s32(qp.per_layer_shift);
- const int32x4_t v_minval = vdupq_n_s32(qp.minval);
- const int32x4_t v_maxval = vdupq_n_s32(qp.maxval);
- const int32x4_t v_c_offset = vdupq_n_s32(qp.c_offset);
+ const int32x4_t v_mul = vdupq_n_s32(qp.per_layer_mul);
+ const int32x4_t v_right_shift = vdupq_n_s32(qp.per_layer_right_shift);
+ const int32x4_t v_left_shift = vdupq_n_s32(qp.per_layer_left_shift);
+ const int32x4_t v_minval = vdupq_n_s32(qp.minval);
+ const int32x4_t v_maxval = vdupq_n_s32(qp.maxval);
+ const int32x4_t v_c_offset = vdupq_n_s32(qp.c_offset);
 /* To make sure we have plenty of accumulators, compute two rows at a
 * time.
If the number of rows is odd, compute the bottom row twice to @@ -77,8 +78,9 @@ void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigne unsigned int odds=(width % 4); const int32_t *colptr = col_bias; - const int32_t *perch_mul_ptr = qp.per_channel_muls + start_col; - const int32_t *perch_shift_ptr = qp.per_channel_shifts + start_col; + const int32_t *perch_mul_ptr = qp.per_channel_muls + start_col; + const int32_t *perch_shift_ptr = qp.per_channel_right_shifts + start_col; + const int32_t *perch_shiftl_ptr = qp.per_channel_left_shifts + start_col; const int32_t *in_ptr = input + (row * in_stride); int8_t *out_ptr = output + (row * out_stride); @@ -112,6 +114,11 @@ void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigne int32x4_t v_shf2; int32x4_t v_shf3; + int32x4_t v_shf0l; + int32x4_t v_shf1l; + int32x4_t v_shf2l; + int32x4_t v_shf3l; + if (per_channel) { v_mul0 = vld1q_s32(perch_mul_ptr); v_mul1 = vld1q_s32(perch_mul_ptr + 4); @@ -124,9 +131,18 @@ void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigne v_shf2 = vld1q_s32(perch_shift_ptr + 8); v_shf3 = vld1q_s32(perch_shift_ptr + 12); perch_shift_ptr += 16; + + if (do_left_shift) { + v_shf0l = vld1q_s32(perch_shiftl_ptr); + v_shf1l = vld1q_s32(perch_shiftl_ptr + 4); + v_shf2l = vld1q_s32(perch_shiftl_ptr + 8); + v_shf3l = vld1q_s32(perch_shiftl_ptr + 12); + perch_shiftl_ptr += 16; + } } else { v_mul0=v_mul1=v_mul2=v_mul3=v_mul; - v_shf0=v_shf1=v_shf2=v_shf3=v_shift; + v_shf0=v_shf1=v_shf2=v_shf3=v_right_shift; + v_shf0l=v_shf1l=v_shf2l=v_shf3l=v_left_shift; } // Load column pointers @@ -171,7 +187,22 @@ void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigne v_in12 = vaddq_s32(v_in12, v_col2); v_in13 = vaddq_s32(v_in13, v_col3); - // Quantize - start with multiply + // Quantize + + // If a left shift is needed it needs to happen first. + if (do_left_shift) { + v_in00 = vrshlq_s32(v_in00, v_shf0l); + v_in01 = vrshlq_s32(v_in01, v_shf1l); + v_in02 = vrshlq_s32(v_in02, v_shf2l); + v_in03 = vrshlq_s32(v_in03, v_shf3l); + + v_in10 = vrshlq_s32(v_in10, v_shf0l); + v_in11 = vrshlq_s32(v_in11, v_shf1l); + v_in12 = vrshlq_s32(v_in12, v_shf2l); + v_in13 = vrshlq_s32(v_in13, v_shf3l); + } + + // Multiply v_in00 = vqrdmulhq_s32(v_in00, v_mul0); v_in01 = vqrdmulhq_s32(v_in01, v_mul1); v_in02 = vqrdmulhq_s32(v_in02, v_mul2); @@ -270,9 +301,183 @@ void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigne out_ptr1 += 16; } + // We are often quantizing one block of interleaved kernel output at a time - these are three registers + // wide. Special case that here. 
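+ // (Three registers are twelve int32 lanes per row, matching the 12-wide
+ // output block produced by the AArch64 interleaved quantized GEMM kernels.)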
+ if (regs==3) { + regs -= 3; + + int32x4_t v_mul0; + int32x4_t v_mul1; + int32x4_t v_mul2; + + int32x4_t v_shf0; + int32x4_t v_shf1; + int32x4_t v_shf2; + + int32x4_t v_shf0l; + int32x4_t v_shf1l; + int32x4_t v_shf2l; + + if (per_channel) { + v_mul0 = vld1q_s32(perch_mul_ptr); + v_mul1 = vld1q_s32(perch_mul_ptr + 4); + v_mul2 = vld1q_s32(perch_mul_ptr + 8); + perch_mul_ptr += 12; + + v_shf0 = vld1q_s32(perch_shift_ptr); + v_shf1 = vld1q_s32(perch_shift_ptr + 4); + v_shf2 = vld1q_s32(perch_shift_ptr + 8); + perch_shift_ptr += 12; + + if (do_left_shift) { + v_shf0l = vld1q_s32(perch_shiftl_ptr); + v_shf1l = vld1q_s32(perch_shiftl_ptr + 4); + v_shf2l = vld1q_s32(perch_shiftl_ptr + 8); + perch_shiftl_ptr += 12; + } + } else { + v_mul0=v_mul1=v_mul2=v_mul; + v_shf0=v_shf1=v_shf2=v_right_shift; + v_shf0l=v_shf1l=v_shf2l=v_left_shift; + } + + // Load column pointers + int32x4_t v_col0 = vld1q_s32(colptr); + int32x4_t v_col1 = vld1q_s32(colptr + 4); + int32x4_t v_col2 = vld1q_s32(colptr + 8); + colptr += 12; + + // Load input data (row 0); + int32x4_t v_in00 = vld1q_s32(in_ptr); + int32x4_t v_in01 = vld1q_s32(in_ptr + 4); + int32x4_t v_in02 = vld1q_s32(in_ptr + 8); + in_ptr += 12; + + // Load input data (row 1); + int32x4_t v_in10 = vld1q_s32(in_ptr1); + int32x4_t v_in11 = vld1q_s32(in_ptr1 + 4); + int32x4_t v_in12 = vld1q_s32(in_ptr1 + 8); + in_ptr1 += 12; + + // Add on row bias and column bias + v_in00 = vaddq_s32(v_in00, v_row_sum); + v_in01 = vaddq_s32(v_in01, v_row_sum); + v_in02 = vaddq_s32(v_in02, v_row_sum); + + v_in10 = vaddq_s32(v_in10, v_row_sum1); + v_in11 = vaddq_s32(v_in11, v_row_sum1); + v_in12 = vaddq_s32(v_in12, v_row_sum1); + + v_in00 = vaddq_s32(v_in00, v_col0); + v_in01 = vaddq_s32(v_in01, v_col1); + v_in02 = vaddq_s32(v_in02, v_col2); + + v_in10 = vaddq_s32(v_in10, v_col0); + v_in11 = vaddq_s32(v_in11, v_col1); + v_in12 = vaddq_s32(v_in12, v_col2); + + // Quantize + + // If a left shift is needed it needs to happen first. 
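+ // (Both shift stages use vrshlq_s32, which shifts each lane by a signed amount:
+ // the left-shift vectors hold non-negative values, while the right shifts applied
+ // after the vqrdmulhq_s32 multiply below are stored as negative amounts.)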
+ if (do_left_shift) {
+ v_in00 = vrshlq_s32(v_in00, v_shf0l);
+ v_in01 = vrshlq_s32(v_in01, v_shf1l);
+ v_in02 = vrshlq_s32(v_in02, v_shf2l);
+
+ v_in10 = vrshlq_s32(v_in10, v_shf0l);
+ v_in11 = vrshlq_s32(v_in11, v_shf1l);
+ v_in12 = vrshlq_s32(v_in12, v_shf2l);
+ }
+
+ // Multiply
+ v_in00 = vqrdmulhq_s32(v_in00, v_mul0);
+ v_in01 = vqrdmulhq_s32(v_in01, v_mul1);
+ v_in02 = vqrdmulhq_s32(v_in02, v_mul2);
+
+ v_in10 = vqrdmulhq_s32(v_in10, v_mul0);
+ v_in11 = vqrdmulhq_s32(v_in11, v_mul1);
+ v_in12 = vqrdmulhq_s32(v_in12, v_mul2);
+
+ // Compute and add on corrective offset
+ if (do_shift_correction) {
+ int32x4_t v_temp00 = vandq_s32(v_in00, v_shf0);
+ int32x4_t v_temp01 = vandq_s32(v_in01, v_shf1);
+ int32x4_t v_temp02 = vandq_s32(v_in02, v_shf2);
+
+ int32x4_t v_temp10 = vandq_s32(v_in10, v_shf0);
+ int32x4_t v_temp11 = vandq_s32(v_in11, v_shf1);
+ int32x4_t v_temp12 = vandq_s32(v_in12, v_shf2);
+
+ v_temp00 = vshrq_n_s32(v_temp00, 31);
+ v_temp01 = vshrq_n_s32(v_temp01, 31);
+ v_temp02 = vshrq_n_s32(v_temp02, 31);
+
+ v_temp10 = vshrq_n_s32(v_temp10, 31);
+ v_temp11 = vshrq_n_s32(v_temp11, 31);
+ v_temp12 = vshrq_n_s32(v_temp12, 31);
+
+ v_in00 = vqaddq_s32(v_in00, v_temp00);
+ v_in01 = vqaddq_s32(v_in01, v_temp01);
+ v_in02 = vqaddq_s32(v_in02, v_temp02);
+
+ v_in10 = vqaddq_s32(v_in10, v_temp10);
+ v_in11 = vqaddq_s32(v_in11, v_temp11);
+ v_in12 = vqaddq_s32(v_in12, v_temp12);
+ }
+
+ v_in00 = vrshlq_s32(v_in00, v_shf0);
+ v_in01 = vrshlq_s32(v_in01, v_shf1);
+ v_in02 = vrshlq_s32(v_in02, v_shf2);
+
+ v_in10 = vrshlq_s32(v_in10, v_shf0);
+ v_in11 = vrshlq_s32(v_in11, v_shf1);
+ v_in12 = vrshlq_s32(v_in12, v_shf2);
+
+ v_in00 = vaddq_s32(v_in00, v_c_offset);
+ v_in01 = vaddq_s32(v_in01, v_c_offset);
+ v_in02 = vaddq_s32(v_in02, v_c_offset);
+
+ v_in10 = vaddq_s32(v_in10, v_c_offset);
+ v_in11 = vaddq_s32(v_in11, v_c_offset);
+ v_in12 = vaddq_s32(v_in12, v_c_offset);
+
+ v_in00 = vmaxq_s32(v_in00, v_minval);
+ v_in01 = vmaxq_s32(v_in01, v_minval);
+ v_in02 = vmaxq_s32(v_in02, v_minval);
+
+ v_in10 = vmaxq_s32(v_in10, v_minval);
+ v_in11 = vmaxq_s32(v_in11, v_minval);
+ v_in12 = vmaxq_s32(v_in12, v_minval);
+
+ v_in00 = vminq_s32(v_in00, v_maxval);
+ v_in01 = vminq_s32(v_in01, v_maxval);
+ v_in02 = vminq_s32(v_in02, v_maxval);
+
+ v_in10 = vminq_s32(v_in10, v_maxval);
+ v_in11 = vminq_s32(v_in11, v_maxval);
+ v_in12 = vminq_s32(v_in12, v_maxval);
+
+ int16x8_t v_uz00 = vuzp1q_s16(vreinterpretq_s16_s32(v_in00), vreinterpretq_s16_s32(v_in01));
+ int16x8_t v_uz01 = vuzp1q_s16(vreinterpretq_s16_s32(v_in02), vreinterpretq_s16_s32(v_in02));
+
+ int16x8_t v_uz10 = vuzp1q_s16(vreinterpretq_s16_s32(v_in10), vreinterpretq_s16_s32(v_in11));
+ int16x8_t v_uz11 = vuzp1q_s16(vreinterpretq_s16_s32(v_in12), vreinterpretq_s16_s32(v_in12));
+
+ int8x16_t v_uz0 = vuzp1q_s8(vreinterpretq_s8_s16(v_uz00), vreinterpretq_s8_s16(v_uz01));
+ int8x16_t v_uz1 = vuzp1q_s8(vreinterpretq_s8_s16(v_uz10), vreinterpretq_s8_s16(v_uz11));
+
+ vst1q_lane_s64(reinterpret_cast<int64_t *>(out_ptr), vreinterpretq_s64_s8(v_uz0), 0);
+ vst1q_lane_s32(reinterpret_cast<int32_t *>(out_ptr + 8), vreinterpretq_s32_s8(v_uz0), 2);
+ out_ptr += 12;
+ vst1q_lane_s64(reinterpret_cast<int64_t *>(out_ptr1), vreinterpretq_s64_s8(v_uz1), 0);
+ vst1q_lane_s32(reinterpret_cast<int32_t *>(out_ptr1 + 8), vreinterpretq_s32_s8(v_uz1), 2);
+ out_ptr1 += 12;
+ }
+
 while (regs--) {
 int32x4_t v_mul0;
 int32x4_t v_shf0;
+ int32x4_t v_shf0l;
 if (per_channel) {
 v_mul0 = vld1q_s32(perch_mul_ptr);
@@ -280,9 +485,15 @@ void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigne
 v_shf0 = vld1q_s32(perch_shift_ptr);
 perch_shift_ptr += 4;
+
+ if (do_left_shift) {
+ v_shf0l = vld1q_s32(perch_shiftl_ptr);
+ perch_shiftl_ptr += 4;
+ }
 } else {
 v_mul0=v_mul;
- v_shf0=v_shift;
+ v_shf0=v_right_shift;
+ v_shf0l=v_left_shift;
 }
 // Load column pointers
 int32x4_t v_col0 = vld1q_s32(colptr);
@@ -306,7 +517,14 @@ void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigne
 v_in10 = vaddq_s32(v_in10, v_col0);
- // Quantize - start with multiply
+ // Quantize - start with (optional) left shift
+ if (do_left_shift) {
+ v_in00 = vrshlq_s32(v_in00, v_shf0l);
+
+ v_in10 = vrshlq_s32(v_in10, v_shf0l);
+ }
+
+ // Then multiply
 v_in00 = vqrdmulhq_s32(v_in00, v_mul0);
 v_in10 = vqrdmulhq_s32(v_in10, v_mul0);
@@ -358,10 +576,12 @@ void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigne
 int32x4_t v_in10 = vdupq_n_s32(0);
 int32x4_t v_mul0 = vdupq_n_s32(0);
 int32x4_t v_shf0 = vdupq_n_s32(0);
+ int32x4_t v_shf0l = vdupq_n_s32(0);
 if (!per_channel) {
 v_mul0 = v_mul;
- v_shf0 = v_shift;
+ v_shf0 = v_right_shift;
+ v_shf0l = v_left_shift;
 }
 do {
@@ -371,6 +591,9 @@ void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigne
 if (per_channel) {
 v_mul0 = vld1q_lane_s32(perch_mul_ptr, v_mul0, 0);
 v_shf0 = vld1q_lane_s32(perch_shift_ptr, v_shf0, 0);
+ if (do_left_shift) {
+ v_shf0l = vld1q_lane_s32(perch_shiftl_ptr, v_shf0l, 0);
+ }
 }
 if (odds == 1) { break; }
@@ -380,6 +603,9 @@ void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigne
 if (per_channel) {
 v_mul0 = vld1q_lane_s32(perch_mul_ptr + 1, v_mul0, 1);
 v_shf0 = vld1q_lane_s32(perch_shift_ptr + 1, v_shf0, 1);
+ if (do_left_shift) {
+ v_shf0l = vld1q_lane_s32(perch_shiftl_ptr + 1, v_shf0l, 1);
+ }
 }
 if (odds == 2) { break; }
@@ -389,6 +615,9 @@ void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigne
 if (per_channel) {
 v_mul0 = vld1q_lane_s32(perch_mul_ptr + 2, v_mul0, 2);
 v_shf0 = vld1q_lane_s32(perch_shift_ptr + 2, v_shf0, 2);
+ if (do_left_shift) {
+ v_shf0l = vld1q_lane_s32(perch_shiftl_ptr + 2, v_shf0l, 2);
+ }
 }
 } while (0);
@@ -402,7 +631,14 @@ void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigne
 v_in10 = vaddq_s32(v_in10, v_col0);
- // Quantize - start with multiply
+ // Quantize - start with (optional) left shift
+ if (do_left_shift) {
+ v_in00 = vrshlq_s32(v_in00, v_shf0l);
+
+ v_in10 = vrshlq_s32(v_in10, v_shf0l);
+ }
+
+ // Then multiply
 v_in00 = vqrdmulhq_s32(v_in00, v_mul0);
 v_in10 = vqrdmulhq_s32(v_in10, v_mul0);
@@ -464,19 +700,39 @@ void requantize_block_32(const Requantize32 &qp, unsigned int width, unsigned in
 const int32_t *row_bias, const int32_t *col_bias, unsigned int start_col) {
 if (qp.per_channel_requant) {
 if (qp.minval >= qp.c_offset) {
- requantize_block_32_int<false, true>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
- reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
+ if (qp.per_channel_left_shifts) {
+ requantize_block_32_int<false, true, true>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
+ reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
+ } else {
+ requantize_block_32_int<false, true, false>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
+ reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
+ }
 } else {
- requantize_block_32_int<true, true>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
- reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
+ if (qp.per_channel_left_shifts) {
+ requantize_block_32_int<true, true, true>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
+ reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
+ } else {
+ requantize_block_32_int<true, true, false>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
+ reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
+ }
 }
 } else {
 if (qp.minval >= qp.c_offset) {
- requantize_block_32_int<false, false>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
- reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
+ if (qp.per_layer_left_shift > 0) {
+ requantize_block_32_int<false, false, true>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
+ reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
+ } else {
+ requantize_block_32_int<false, false, false>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
+ reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
+ }
 } else {
- requantize_block_32_int<true, false>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
- reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
+ if (qp.per_layer_left_shift > 0) {
+ requantize_block_32_int<true, false, true>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
+ reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
+ } else {
+ requantize_block_32_int<true, false, false>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
+ reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
+ }
 }
 }
 }
diff --git a/src/core/NEON/kernels/arm_gemm/quantized.hpp b/src/core/NEON/kernels/arm_gemm/quantized.hpp
index b0e0c3b580..3f3443025c 100644
--- a/src/core/NEON/kernels/arm_gemm/quantized.hpp
+++ b/src/core/NEON/kernels/arm_gemm/quantized.hpp
@@ -23,6 +23,8 @@
 */
 #pragma once
+#include "utils.hpp" // IndirectInputArg
+
 namespace arm_gemm {
 template<typename Tin, typename Tout>
@@ -39,4 +41,8 @@ void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int h
 const T *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth,
 unsigned int multi, unsigned int first_col);
+template<typename T>
+void row_sums_indirect(unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<T> A_arg,
+ size_t M, int32_t *output_ptr, const Requantize32 *qp);
+
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp
new file mode 100644
index 0000000000..5433676558
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp
@@ -0,0 +1,1160 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "quantized.hpp"
+#include "utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+template<>
+void row_sums_indirect<int8_t>(
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, int32_t *out_ptr, const Requantize32 *qp
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings;
+ const unsigned int *string_lengths;
+ unsigned int input_initial_col;
+ } ka;
+
+ unsigned long flags=0;
+ void *input_ptr;
+ size_t input_offset;
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ input_offset=A_arg.direct.stride;
+ }
+
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+
+ __asm__ __volatile__(
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1r { v2.4s }, [x19]\n"
+ "neg v2.4s, v2.4s\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 86f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 69f\n"
+ "beq 52f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 35f\n"
+ "beq 18f\n"
+ "movi v1.8h, #0x0\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "movi v0.4s, #0x0\n"
+ "mov x9, #0x0\n"
+ "mov x28, #0x0\n"
+ "2:" // Height 1: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x19, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 3f\n"
+ "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x19, x19, %x[input_offset], LSL #3\n"
+ "ldr x26, [x19, #0x0]\n"
+ "cbnz x28, 4f\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "b 4f\n"
+ "3:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "4:" // Height 1: input setup done
+ "cmp x27, #0x10\n"
+ "blt 8f\n"
+ "cmp x27, #0x20\n"
+ "blt 7f\n"
+ "5:" // Height 1: Multiply loop: Main loop head
+ "ldr q31, [x26, #0x0]\n"
+ "cmp x9, #0x7e\n"
+ "add x26, x26, #0x10\n"
+ "blt 6f\n"
+ "sadalp v0.4s, v1.8h\n"
+ "movi v1.8h, #0x0\n"
+ "mov x9, #0x0\n"
+ "6:" // Height 1: Multiply loop: unique 1: no collapse
+ "sadalp v1.8h, v31.16b\n"
+ "add x9, x9, #0x1\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x20\n"
+ "bge 5b\n"
+ "7:" // Height 1: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q31, [x26, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "sadalp v1.8h, v31.16b\n"
+ "8:" // Height 1: Multiply loop: Main loop skip
+ "cbz x27, 17f\n"
+ "tbz x27, #3, 12f\n"
+ "ldr d31, [x26], #0x8\n"
+ "tbz x27, #2, 10f\n"
+ "ld1 { v31.s }[2], [x26], #0x4\n"
+ "tbz x27, #1, 9f\n"
+ "ld1 { v31.h }[6], [x26], #0x2\n"
+ "tbz x27, #0, 16f\n"
+ "ld1 { v31.b }[14], [x26]\n"
+ "b 16f\n"
+ "9:" // Height 1: Multiply loop: Ragged operand read: partial_1_12
+ "tbz x27, #0, 16f\n"
+ "ld1 { v31.b }[12], [x26]\n"
+ "b 16f\n"
+ "10:" // Height 1: Multiply loop: Ragged operand read: partial_2_8
+ "tbz x27, #1, 11f\n"
+ "ld1 { v31.h }[4], [x26], #0x2\n"
+ "tbz x27, #0, 16f\n"
+ "ld1 { v31.b }[10], [x26]\n"
+ "b 16f\n"
+ "11:" // Height 1: Multiply loop: Ragged operand read: partial_1_8
+ "tbz x27, #0, 16f\n"
+ "ld1 { v31.b }[8], [x26]\n"
+ "b 16f\n"
+ "12:" // Height 1: Multiply loop: Ragged operand read: partial_4_0
+ "tbz x27, #2, 14f\n"
+ "ldr s31, [x26], #0x4\n"
"tbz x27, #1, 13f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "tbz x27, #0, 16f\n" + "ld1 { v31.b }[6], [x26]\n" + "b 16f\n" + "13:" // Height 1: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 16f\n" + "ld1 { v31.b }[4], [x26]\n" + "b 16f\n" + "14:" // Height 1: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 15f\n" + "ldr h31, [x26], #0x2\n" + "tbz x27, #0, 16f\n" + "ld1 { v31.b }[2], [x26]\n" + "b 16f\n" + "15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b31, [x26, #0x0]\n" + "16:" // Height 1: Multiply loop: Ragged operand read: Done + "sadalp v1.8h, v31.16b\n" + "17:" // Height 1: Multiply loop: No odd multiplies + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 2b\n" + "sadalp v0.4s, v1.8h\n" + "addp v0.4s, v0.4s, v0.4s\n" + "addp v0.4s, v0.4s, v0.4s\n" + "mul v0.4s, v0.4s, v2.4s\n" + "str s0, [%x[out_ptr]], #0x4\n" + "b 104f\n" + "18:" // Height 2 + "movi v1.8h, #0x0\n" + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "mov x9, #0x0\n" + "movi v0.4s, #0x0\n" + "mov x28, #0x0\n" + "movi v30.8h, #0x0\n" + "movi v29.4s, #0x0\n" + "19:" // Height 2: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x19, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 20f\n" + "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n" + "add x19, x19, %x[input_offset], LSL #3\n" + "ldr x26, [x19, #0x0]\n" + "ldr x25, [x19, #0x8]\n" + "cbnz x28, 21f\n" + "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x25, x25, x19\n" + "b 21f\n" + "20:" // Height 2: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, %x[input_offset]\n" + "21:" // Height 2: input setup done + "cmp x27, #0x10\n" + "blt 25f\n" + "cmp x27, #0x20\n" + "blt 24f\n" + "22:" // Height 2: Multiply loop: Main loop head + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "cmp x9, #0x7e\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "blt 23f\n" + "sadalp v0.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "sadalp v29.4s, v30.8h\n" + "movi v30.8h, #0x0\n" + "mov x9, #0x0\n" + "23:" // Height 2: Multiply loop: unique 2: no collapse + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "add x9, x9, #0x1\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 22b\n" + "24:" // Height 2: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "25:" // Height 2: Multiply loop: Main loop skip + "cbz x27, 34f\n" + "tbz x27, #3, 29f\n" + "ldr d31, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "tbz x27, #2, 27f\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "tbz x27, #1, 26f\n" + "ld1 { v31.h }[6], [x26], #0x2\n" + "ld1 { v28.h }[6], [x25], #0x2\n" + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[14], [x26]\n" + "ld1 { v28.b }[14], [x25]\n" + "b 33f\n" + "26:" // Height 2: Multiply loop: Ragged operand read: partial_1_12 + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[12], [x26]\n" + "ld1 { v28.b }[12], [x25]\n" + "b 33f\n" + "27:" // Height 2: Multiply loop: Ragged operand read: partial_2_8 + "tbz x27, #1, 28f\n" + "ld1 { v31.h }[4], [x26], #0x2\n" + "ld1 { v28.h }[4], [x25], #0x2\n" + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[10], [x26]\n" + "ld1 { v28.b }[10], [x25]\n" + "b 33f\n" + "28:" // Height 2: Multiply loop: Ragged operand read: partial_1_8 + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[8], [x26]\n" + "ld1 { v28.b }[8], [x25]\n" + "b 33f\n" + 
"29:" // Height 2: Multiply loop: Ragged operand read: partial_4_0 + "tbz x27, #2, 31f\n" + "ldr s31, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "tbz x27, #1, 30f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "b 33f\n" + "30:" // Height 2: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "b 33f\n" + "31:" // Height 2: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 32f\n" + "ldr h31, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "b 33f\n" + "32:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b31, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "33:" // Height 2: Multiply loop: Ragged operand read: Done + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "34:" // Height 2: Multiply loop: No odd multiplies + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 19b\n" + "sadalp v0.4s, v1.8h\n" + "sadalp v29.4s, v30.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "addp v0.4s, v0.4s, v0.4s\n" + "mul v0.4s, v0.4s, v2.4s\n" + "str d0, [%x[out_ptr]], #0x8\n" + "b 104f\n" + "35:" // Height 3 + "movi v1.8h, #0x0\n" + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "mov x9, #0x0\n" + "movi v0.4s, #0x0\n" + "mov x28, #0x0\n" + "movi v30.8h, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v27.8h, #0x0\n" + "movi v26.4s, #0x0\n" + "36:" // Height 3: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x19, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 37f\n" + "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n" + "add x19, x19, %x[input_offset], LSL #3\n" + "ldr x26, [x19, #0x0]\n" + "ldr x25, [x19, #0x8]\n" + "ldr x24, [x19, #0x10]\n" + "cbnz x28, 38f\n" + "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "b 38f\n" + "37:" // Height 3: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, %x[input_offset]\n" + "add x24, x25, %x[input_offset]\n" + "38:" // Height 3: input setup done + "cmp x27, #0x10\n" + "blt 42f\n" + "cmp x27, #0x20\n" + "blt 41f\n" + "39:" // Height 3: Multiply loop: Main loop head + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "cmp x9, #0x7e\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "blt 40f\n" + "sadalp v0.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "sadalp v29.4s, v30.8h\n" + "movi v30.8h, #0x0\n" + "sadalp v26.4s, v27.8h\n" + "movi v27.8h, #0x0\n" + "mov x9, #0x0\n" + "40:" // Height 3: Multiply loop: unique 3: no collapse + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "sadalp v27.8h, v25.16b\n" + "add x9, x9, #0x1\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 39b\n" + "41:" // Height 3: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "sadalp v27.8h, v25.16b\n" + "add x24, x24, #0x10\n" + "42:" // Height 3: Multiply loop: Main loop skip + "cbz x27, 51f\n" + "tbz x27, #3, 46f\n" + "ldr d31, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "ldr d25, [x24], #0x8\n" + "tbz x27, #2, 44f\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "ld1 { v25.s }[2], 
[x24], #0x4\n" + "tbz x27, #1, 43f\n" + "ld1 { v31.h }[6], [x26], #0x2\n" + "ld1 { v28.h }[6], [x25], #0x2\n" + "ld1 { v25.h }[6], [x24], #0x2\n" + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[14], [x26]\n" + "ld1 { v28.b }[14], [x25]\n" + "ld1 { v25.b }[14], [x24]\n" + "b 50f\n" + "43:" // Height 3: Multiply loop: Ragged operand read: partial_1_12 + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[12], [x26]\n" + "ld1 { v28.b }[12], [x25]\n" + "ld1 { v25.b }[12], [x24]\n" + "b 50f\n" + "44:" // Height 3: Multiply loop: Ragged operand read: partial_2_8 + "tbz x27, #1, 45f\n" + "ld1 { v31.h }[4], [x26], #0x2\n" + "ld1 { v28.h }[4], [x25], #0x2\n" + "ld1 { v25.h }[4], [x24], #0x2\n" + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[10], [x26]\n" + "ld1 { v28.b }[10], [x25]\n" + "ld1 { v25.b }[10], [x24]\n" + "b 50f\n" + "45:" // Height 3: Multiply loop: Ragged operand read: partial_1_8 + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[8], [x26]\n" + "ld1 { v28.b }[8], [x25]\n" + "ld1 { v25.b }[8], [x24]\n" + "b 50f\n" + "46:" // Height 3: Multiply loop: Ragged operand read: partial_4_0 + "tbz x27, #2, 48f\n" + "ldr s31, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s25, [x24], #0x4\n" + "tbz x27, #1, 47f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v25.h }[2], [x24], #0x2\n" + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v25.b }[6], [x24]\n" + "b 50f\n" + "47:" // Height 3: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "ld1 { v25.b }[4], [x24]\n" + "b 50f\n" + "48:" // Height 3: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 49f\n" + "ldr h31, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "ldr h25, [x24], #0x2\n" + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v25.b }[2], [x24]\n" + "b 50f\n" + "49:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b31, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "ldr b25, [x24, #0x0]\n" + "50:" // Height 3: Multiply loop: Ragged operand read: Done + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "sadalp v27.8h, v25.16b\n" + "51:" // Height 3: Multiply loop: No odd multiplies + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 36b\n" + "sadalp v0.4s, v1.8h\n" + "sadalp v29.4s, v30.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "sadalp v26.4s, v27.8h\n" + "addp v0.4s, v0.4s, v0.4s\n" + "addp v26.4s, v26.4s, v26.4s\n" + "mul v0.4s, v0.4s, v2.4s\n" + "str d0, [%x[out_ptr]], #0x8\n" + "addp v26.4s, v26.4s, v26.4s\n" + "mul v26.4s, v26.4s, v2.4s\n" + "str s26, [%x[out_ptr]], #0x4\n" + "b 104f\n" + "52:" // Height 4 + "movi v1.8h, #0x0\n" + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "mov x9, #0x0\n" + "movi v0.4s, #0x0\n" + "mov x28, #0x0\n" + "movi v30.8h, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v27.8h, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v24.8h, #0x0\n" + "movi v23.4s, #0x0\n" + "53:" // Height 4: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x19, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 54f\n" + "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n" + "add x19, x19, %x[input_offset], LSL #3\n" + "ldr x26, [x19, #0x0]\n" + "ldr x25, [x19, #0x8]\n" + "ldr x24, [x19, #0x10]\n" + "ldr x23, [x19, #0x18]\n" + "cbnz x28, 55f\n" + "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "b 55f\n" + "54:" // Height 4: 
setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, %x[input_offset]\n" + "add x24, x25, %x[input_offset]\n" + "add x23, x24, %x[input_offset]\n" + "55:" // Height 4: input setup done + "cmp x27, #0x10\n" + "blt 59f\n" + "cmp x27, #0x20\n" + "blt 58f\n" + "56:" // Height 4: Multiply loop: Main loop head + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "cmp x9, #0x7e\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "blt 57f\n" + "sadalp v0.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "sadalp v29.4s, v30.8h\n" + "movi v30.8h, #0x0\n" + "sadalp v26.4s, v27.8h\n" + "movi v27.8h, #0x0\n" + "sadalp v23.4s, v24.8h\n" + "movi v24.8h, #0x0\n" + "mov x9, #0x0\n" + "57:" // Height 4: Multiply loop: unique 4: no collapse + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "sadalp v27.8h, v25.16b\n" + "sadalp v24.8h, v22.16b\n" + "add x9, x9, #0x1\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 56b\n" + "58:" // Height 4: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "add x26, x26, #0x10\n" + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "sadalp v27.8h, v25.16b\n" + "sadalp v24.8h, v22.16b\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "59:" // Height 4: Multiply loop: Main loop skip + "cbz x27, 68f\n" + "tbz x27, #3, 63f\n" + "ldr d31, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "ldr d25, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "tbz x27, #2, 61f\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "ld1 { v25.s }[2], [x24], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" + "tbz x27, #1, 60f\n" + "ld1 { v31.h }[6], [x26], #0x2\n" + "ld1 { v28.h }[6], [x25], #0x2\n" + "ld1 { v25.h }[6], [x24], #0x2\n" + "ld1 { v22.h }[6], [x23], #0x2\n" + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[14], [x26]\n" + "ld1 { v28.b }[14], [x25]\n" + "ld1 { v25.b }[14], [x24]\n" + "ld1 { v22.b }[14], [x23]\n" + "b 67f\n" + "60:" // Height 4: Multiply loop: Ragged operand read: partial_1_12 + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[12], [x26]\n" + "ld1 { v28.b }[12], [x25]\n" + "ld1 { v25.b }[12], [x24]\n" + "ld1 { v22.b }[12], [x23]\n" + "b 67f\n" + "61:" // Height 4: Multiply loop: Ragged operand read: partial_2_8 + "tbz x27, #1, 62f\n" + "ld1 { v31.h }[4], [x26], #0x2\n" + "ld1 { v28.h }[4], [x25], #0x2\n" + "ld1 { v25.h }[4], [x24], #0x2\n" + "ld1 { v22.h }[4], [x23], #0x2\n" + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[10], [x26]\n" + "ld1 { v28.b }[10], [x25]\n" + "ld1 { v25.b }[10], [x24]\n" + "ld1 { v22.b }[10], [x23]\n" + "b 67f\n" + "62:" // Height 4: Multiply loop: Ragged operand read: partial_1_8 + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[8], [x26]\n" + "ld1 { v28.b }[8], [x25]\n" + "ld1 { v25.b }[8], [x24]\n" + "ld1 { v22.b }[8], [x23]\n" + "b 67f\n" + "63:" // Height 4: Multiply loop: Ragged operand read: partial_4_0 + "tbz x27, #2, 65f\n" + "ldr s31, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s25, [x24], #0x4\n" + "ldr s22, [x23], #0x4\n" + "tbz x27, #1, 64f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v25.h }[2], [x24], #0x2\n" + "ld1 { v22.h }[2], [x23], #0x2\n" + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v25.b }[6], [x24]\n" + "ld1 { v22.b }[6], [x23]\n" + "b 67f\n" + "64:" // Height 4: Multiply 
loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "ld1 { v25.b }[4], [x24]\n" + "ld1 { v22.b }[4], [x23]\n" + "b 67f\n" + "65:" // Height 4: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 66f\n" + "ldr h31, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "ldr h25, [x24], #0x2\n" + "ldr h22, [x23], #0x2\n" + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v25.b }[2], [x24]\n" + "ld1 { v22.b }[2], [x23]\n" + "b 67f\n" + "66:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b31, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "ldr b25, [x24, #0x0]\n" + "ldr b22, [x23, #0x0]\n" + "67:" // Height 4: Multiply loop: Ragged operand read: Done + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "sadalp v27.8h, v25.16b\n" + "sadalp v24.8h, v22.16b\n" + "68:" // Height 4: Multiply loop: No odd multiplies + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 53b\n" + "sadalp v0.4s, v1.8h\n" + "sadalp v29.4s, v30.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "sadalp v26.4s, v27.8h\n" + "sadalp v23.4s, v24.8h\n" + "addp v29.4s, v26.4s, v23.4s\n" + "addp v0.4s, v0.4s, v29.4s\n" + "mul v0.4s, v0.4s, v2.4s\n" + "st1 { v0.4s }, [%x[out_ptr]], #0x10\n" + "b 104f\n" + "69:" // Height 5 + "movi v1.8h, #0x0\n" + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "mov x9, #0x0\n" + "movi v0.4s, #0x0\n" + "mov x28, #0x0\n" + "movi v30.8h, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v27.8h, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v24.8h, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v21.8h, #0x0\n" + "movi v20.4s, #0x0\n" + "70:" // Height 5: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x19, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 71f\n" + "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n" + "add x19, x19, %x[input_offset], LSL #3\n" + "ldr x26, [x19, #0x0]\n" + "ldr x25, [x19, #0x8]\n" + "ldr x24, [x19, #0x10]\n" + "ldr x23, [x19, #0x18]\n" + "ldr x22, [x19, #0x20]\n" + "cbnz x28, 72f\n" + "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 72f\n" + "71:" // Height 5: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, %x[input_offset]\n" + "add x24, x25, %x[input_offset]\n" + "add x23, x24, %x[input_offset]\n" + "add x22, x23, %x[input_offset]\n" + "72:" // Height 5: input setup done + "cmp x27, #0x10\n" + "blt 76f\n" + "cmp x27, #0x20\n" + "blt 75f\n" + "73:" // Height 5: Multiply loop: Main loop head + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "ldr q19, [x22, #0x0]\n" + "cmp x9, #0x7e\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "add x22, x22, #0x10\n" + "blt 74f\n" + "sadalp v0.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "sadalp v29.4s, v30.8h\n" + "movi v30.8h, #0x0\n" + "sadalp v26.4s, v27.8h\n" + "movi v27.8h, #0x0\n" + "sadalp v23.4s, v24.8h\n" + "movi v24.8h, #0x0\n" + "sadalp v20.4s, v21.8h\n" + "movi v21.8h, #0x0\n" + "mov x9, #0x0\n" + "74:" // Height 5: Multiply loop: unique 5: no collapse + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "sadalp v27.8h, v25.16b\n" + "sadalp v24.8h, v22.16b\n" + "sadalp v21.8h, v19.16b\n" + "add x9, x9, #0x1\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 73b\n" + "75:" // Height 5: Multiply loop: Single iteration only + 
"sub x27, x27, #0x10\n" + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "ldr q19, [x22, #0x0]\n" + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "sadalp v27.8h, v25.16b\n" + "sadalp v24.8h, v22.16b\n" + "sadalp v21.8h, v19.16b\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "add x22, x22, #0x10\n" + "76:" // Height 5: Multiply loop: Main loop skip + "cbz x27, 85f\n" + "tbz x27, #3, 80f\n" + "ldr d31, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "ldr d25, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "tbz x27, #2, 78f\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "ld1 { v25.s }[2], [x24], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" + "ld1 { v19.s }[2], [x22], #0x4\n" + "tbz x27, #1, 77f\n" + "ld1 { v31.h }[6], [x26], #0x2\n" + "ld1 { v28.h }[6], [x25], #0x2\n" + "ld1 { v25.h }[6], [x24], #0x2\n" + "ld1 { v22.h }[6], [x23], #0x2\n" + "ld1 { v19.h }[6], [x22], #0x2\n" + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[14], [x26]\n" + "ld1 { v28.b }[14], [x25]\n" + "ld1 { v25.b }[14], [x24]\n" + "ld1 { v22.b }[14], [x23]\n" + "ld1 { v19.b }[14], [x22]\n" + "b 84f\n" + "77:" // Height 5: Multiply loop: Ragged operand read: partial_1_12 + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[12], [x26]\n" + "ld1 { v28.b }[12], [x25]\n" + "ld1 { v25.b }[12], [x24]\n" + "ld1 { v22.b }[12], [x23]\n" + "ld1 { v19.b }[12], [x22]\n" + "b 84f\n" + "78:" // Height 5: Multiply loop: Ragged operand read: partial_2_8 + "tbz x27, #1, 79f\n" + "ld1 { v31.h }[4], [x26], #0x2\n" + "ld1 { v28.h }[4], [x25], #0x2\n" + "ld1 { v25.h }[4], [x24], #0x2\n" + "ld1 { v22.h }[4], [x23], #0x2\n" + "ld1 { v19.h }[4], [x22], #0x2\n" + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[10], [x26]\n" + "ld1 { v28.b }[10], [x25]\n" + "ld1 { v25.b }[10], [x24]\n" + "ld1 { v22.b }[10], [x23]\n" + "ld1 { v19.b }[10], [x22]\n" + "b 84f\n" + "79:" // Height 5: Multiply loop: Ragged operand read: partial_1_8 + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[8], [x26]\n" + "ld1 { v28.b }[8], [x25]\n" + "ld1 { v25.b }[8], [x24]\n" + "ld1 { v22.b }[8], [x23]\n" + "ld1 { v19.b }[8], [x22]\n" + "b 84f\n" + "80:" // Height 5: Multiply loop: Ragged operand read: partial_4_0 + "tbz x27, #2, 82f\n" + "ldr s31, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s25, [x24], #0x4\n" + "ldr s22, [x23], #0x4\n" + "ldr s19, [x22], #0x4\n" + "tbz x27, #1, 81f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v25.h }[2], [x24], #0x2\n" + "ld1 { v22.h }[2], [x23], #0x2\n" + "ld1 { v19.h }[2], [x22], #0x2\n" + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v25.b }[6], [x24]\n" + "ld1 { v22.b }[6], [x23]\n" + "ld1 { v19.b }[6], [x22]\n" + "b 84f\n" + "81:" // Height 5: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "ld1 { v25.b }[4], [x24]\n" + "ld1 { v22.b }[4], [x23]\n" + "ld1 { v19.b }[4], [x22]\n" + "b 84f\n" + "82:" // Height 5: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 83f\n" + "ldr h31, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "ldr h25, [x24], #0x2\n" + "ldr h22, [x23], #0x2\n" + "ldr h19, [x22], #0x2\n" + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v25.b }[2], [x24]\n" + "ld1 { v22.b }[2], [x23]\n" + "ld1 { v19.b }[2], [x22]\n" + "b 84f\n" + "83:" // Height 5: Multiply loop: 
Ragged operand read: partial_1_0 + "ldr b31, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "ldr b25, [x24, #0x0]\n" + "ldr b22, [x23, #0x0]\n" + "ldr b19, [x22, #0x0]\n" + "84:" // Height 5: Multiply loop: Ragged operand read: Done + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "sadalp v27.8h, v25.16b\n" + "sadalp v24.8h, v22.16b\n" + "sadalp v21.8h, v19.16b\n" + "85:" // Height 5: Multiply loop: No odd multiplies + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 70b\n" + "sadalp v0.4s, v1.8h\n" + "sadalp v29.4s, v30.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "sadalp v26.4s, v27.8h\n" + "sadalp v23.4s, v24.8h\n" + "addp v29.4s, v26.4s, v23.4s\n" + "sadalp v20.4s, v21.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "addp v20.4s, v20.4s, v20.4s\n" + "mul v0.4s, v0.4s, v2.4s\n" + "st1 { v0.4s }, [%x[out_ptr]], #0x10\n" + "addp v20.4s, v20.4s, v20.4s\n" + "mul v20.4s, v20.4s, v2.4s\n" + "str s20, [%x[out_ptr]], #0x4\n" + "b 104f\n" + "86:" // Height 6 + "movi v1.8h, #0x0\n" + "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n" + "mov x9, #0x0\n" + "movi v0.4s, #0x0\n" + "mov x28, #0x0\n" + "movi v30.8h, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v27.8h, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v24.8h, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v21.8h, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v18.8h, #0x0\n" + "movi v17.4s, #0x0\n" + "87:" // Height 6: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x19, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 88f\n" + "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n" + "add x19, x19, %x[input_offset], LSL #3\n" + "ldr x26, [x19, #0x0]\n" + "ldr x25, [x19, #0x8]\n" + "ldr x24, [x19, #0x10]\n" + "ldr x23, [x19, #0x18]\n" + "ldr x22, [x19, #0x20]\n" + "ldr x20, [x19, #0x28]\n" + "cbnz x28, 89f\n" + "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 89f\n" + "88:" // Height 6: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, %x[input_offset]\n" + "add x24, x25, %x[input_offset]\n" + "add x23, x24, %x[input_offset]\n" + "add x22, x23, %x[input_offset]\n" + "add x20, x22, %x[input_offset]\n" + "89:" // Height 6: input setup done + "cmp x27, #0x10\n" + "blt 93f\n" + "cmp x27, #0x20\n" + "blt 92f\n" + "90:" // Height 6: Multiply loop: Main loop head + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "ldr q19, [x22, #0x0]\n" + "ldr q16, [x20, #0x0]\n" + "cmp x9, #0x7e\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "add x22, x22, #0x10\n" + "add x20, x20, #0x10\n" + "blt 91f\n" + "sadalp v0.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "sadalp v29.4s, v30.8h\n" + "movi v30.8h, #0x0\n" + "sadalp v26.4s, v27.8h\n" + "movi v27.8h, #0x0\n" + "sadalp v23.4s, v24.8h\n" + "movi v24.8h, #0x0\n" + "sadalp v20.4s, v21.8h\n" + "movi v21.8h, #0x0\n" + "sadalp v17.4s, v18.8h\n" + "movi v18.8h, #0x0\n" + "mov x9, #0x0\n" + "91:" // Height 6: Multiply loop: unique 6: no collapse + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "sadalp v27.8h, v25.16b\n" + "sadalp v24.8h, v22.16b\n" + "sadalp v21.8h, v19.16b\n" + "sadalp v18.8h, v16.16b\n" + "add x9, x9, #0x1\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 90b\n" + "92:" // Height 6: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, 
[x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "ldr q19, [x22, #0x0]\n" + "ldr q16, [x20, #0x0]\n" + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "sadalp v27.8h, v25.16b\n" + "sadalp v24.8h, v22.16b\n" + "sadalp v21.8h, v19.16b\n" + "sadalp v18.8h, v16.16b\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "add x22, x22, #0x10\n" + "add x20, x20, #0x10\n" + "93:" // Height 6: Multiply loop: Main loop skip + "cbz x27, 102f\n" + "tbz x27, #3, 97f\n" + "ldr d31, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "ldr d25, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d16, [x20], #0x8\n" + "tbz x27, #2, 95f\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "ld1 { v25.s }[2], [x24], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" + "ld1 { v19.s }[2], [x22], #0x4\n" + "ld1 { v16.s }[2], [x20], #0x4\n" + "tbz x27, #1, 94f\n" + "ld1 { v31.h }[6], [x26], #0x2\n" + "ld1 { v28.h }[6], [x25], #0x2\n" + "ld1 { v25.h }[6], [x24], #0x2\n" + "ld1 { v22.h }[6], [x23], #0x2\n" + "ld1 { v19.h }[6], [x22], #0x2\n" + "ld1 { v16.h }[6], [x20], #0x2\n" + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[14], [x26]\n" + "ld1 { v28.b }[14], [x25]\n" + "ld1 { v25.b }[14], [x24]\n" + "ld1 { v22.b }[14], [x23]\n" + "ld1 { v19.b }[14], [x22]\n" + "ld1 { v16.b }[14], [x20]\n" + "b 101f\n" + "94:" // Height 6: Multiply loop: Ragged operand read: partial_1_12 + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[12], [x26]\n" + "ld1 { v28.b }[12], [x25]\n" + "ld1 { v25.b }[12], [x24]\n" + "ld1 { v22.b }[12], [x23]\n" + "ld1 { v19.b }[12], [x22]\n" + "ld1 { v16.b }[12], [x20]\n" + "b 101f\n" + "95:" // Height 6: Multiply loop: Ragged operand read: partial_2_8 + "tbz x27, #1, 96f\n" + "ld1 { v31.h }[4], [x26], #0x2\n" + "ld1 { v28.h }[4], [x25], #0x2\n" + "ld1 { v25.h }[4], [x24], #0x2\n" + "ld1 { v22.h }[4], [x23], #0x2\n" + "ld1 { v19.h }[4], [x22], #0x2\n" + "ld1 { v16.h }[4], [x20], #0x2\n" + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[10], [x26]\n" + "ld1 { v28.b }[10], [x25]\n" + "ld1 { v25.b }[10], [x24]\n" + "ld1 { v22.b }[10], [x23]\n" + "ld1 { v19.b }[10], [x22]\n" + "ld1 { v16.b }[10], [x20]\n" + "b 101f\n" + "96:" // Height 6: Multiply loop: Ragged operand read: partial_1_8 + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[8], [x26]\n" + "ld1 { v28.b }[8], [x25]\n" + "ld1 { v25.b }[8], [x24]\n" + "ld1 { v22.b }[8], [x23]\n" + "ld1 { v19.b }[8], [x22]\n" + "ld1 { v16.b }[8], [x20]\n" + "b 101f\n" + "97:" // Height 6: Multiply loop: Ragged operand read: partial_4_0 + "tbz x27, #2, 99f\n" + "ldr s31, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s25, [x24], #0x4\n" + "ldr s22, [x23], #0x4\n" + "ldr s19, [x22], #0x4\n" + "ldr s16, [x20], #0x4\n" + "tbz x27, #1, 98f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v25.h }[2], [x24], #0x2\n" + "ld1 { v22.h }[2], [x23], #0x2\n" + "ld1 { v19.h }[2], [x22], #0x2\n" + "ld1 { v16.h }[2], [x20], #0x2\n" + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v25.b }[6], [x24]\n" + "ld1 { v22.b }[6], [x23]\n" + "ld1 { v19.b }[6], [x22]\n" + "ld1 { v16.b }[6], [x20]\n" + "b 101f\n" + "98:" // Height 6: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "ld1 { v25.b }[4], [x24]\n" + "ld1 { v22.b }[4], [x23]\n" + "ld1 { v19.b }[4], [x22]\n" + "ld1 { v16.b }[4], [x20]\n" + "b 101f\n" + "99:" // Height 6: Multiply loop: Ragged operand read: 
partial_2_0 + "tbz x27, #1, 100f\n" + "ldr h31, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "ldr h25, [x24], #0x2\n" + "ldr h22, [x23], #0x2\n" + "ldr h19, [x22], #0x2\n" + "ldr h16, [x20], #0x2\n" + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v25.b }[2], [x24]\n" + "ld1 { v22.b }[2], [x23]\n" + "ld1 { v19.b }[2], [x22]\n" + "ld1 { v16.b }[2], [x20]\n" + "b 101f\n" + "100:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "ldr b31, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "ldr b25, [x24, #0x0]\n" + "ldr b22, [x23, #0x0]\n" + "ldr b19, [x22, #0x0]\n" + "ldr b16, [x20, #0x0]\n" + "101:" // Height 6: Multiply loop: Ragged operand read: Done + "sadalp v1.8h, v31.16b\n" + "sadalp v30.8h, v28.16b\n" + "sadalp v27.8h, v25.16b\n" + "sadalp v24.8h, v22.16b\n" + "sadalp v21.8h, v19.16b\n" + "sadalp v18.8h, v16.16b\n" + "102:" // Height 6: Multiply loop: No odd multiplies + "add x28, x28, #0x1\n" + "cmp x28, x21\n" + "bne 87b\n" + "sadalp v0.4s, v1.8h\n" + "sadalp v29.4s, v30.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "sadalp v26.4s, v27.8h\n" + "sadalp v23.4s, v24.8h\n" + "addp v29.4s, v26.4s, v23.4s\n" + "sadalp v20.4s, v21.8h\n" + "sadalp v17.4s, v18.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "subs %x[M], %x[M], #0x6\n" + "addp v20.4s, v20.4s, v17.4s\n" + "mul v0.4s, v0.4s, v2.4s\n" + "st1 { v0.4s }, [%x[out_ptr]], #0x10\n" + "addp v20.4s, v20.4s, v20.4s\n" + "mul v20.4s, v20.4s, v2.4s\n" + "str d20, [%x[out_ptr]], #0x8\n" + "beq 104f\n" + "tbz %x[flags], #3, 103f\n" + "add %x[input_offset], %x[input_offset], #0x6\n" + "b 1b\n" + "103:" // Update direct input + "mov x19, #0x6\n" + "madd %x[input_ptr], x19, %x[input_offset], %x[input_ptr]\n" + "b 1b\n" + "104:" // Exit + + : [M] "+r" (M), [input_offset] "+r" (input_offset), [input_ptr] "+r" (input_ptr), [out_ptr] "+r" (out_ptr) + : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [flags] "r" (flags), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [qp] "r" (qp) + : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp new file mode 100644 index 0000000000..f5709d92ac --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp @@ -0,0 +1,1160 @@ +/* + * Copyright (c) 2019-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "quantized.hpp"
+#include "utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+template<>
+void row_sums_indirect<uint8_t>(
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+ size_t M, int32_t *out_ptr, const Requantize32 *qp
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings;
+ const unsigned int *string_lengths;
+ unsigned int input_initial_col;
+ } ka;
+
+ unsigned long flags=0;
+ void *input_ptr;
+ size_t input_offset;
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ input_offset=A_arg.direct.stride;
+ }
+
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+
+ __asm__ __volatile__(
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1r { v2.4s }, [x19]\n"
+ "neg v2.4s, v2.4s\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 86f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 69f\n"
+ "beq 52f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 35f\n"
+ "beq 18f\n"
+ "movi v1.8h, #0x0\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "movi v0.4s, #0x0\n"
+ "mov x9, #0x0\n"
+ "mov x28, #0x0\n"
+ "2:" // Height 1: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x19, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 3f\n"
+ "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x19, x19, %x[input_offset], LSL #3\n"
+ "ldr x26, [x19, #0x0]\n"
+ "cbnz x28, 4f\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "b 4f\n"
+ "3:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "4:" // Height 1: input setup done
+ "cmp x27, #0x10\n"
+ "blt 8f\n"
+ "cmp x27, #0x20\n"
+ "blt 7f\n"
+ "5:" // Height 1: Multiply loop: Main loop head
+ "ldr q31, [x26, #0x0]\n"
+ "cmp x9, #0x7e\n"
+ "add x26, x26, #0x10\n"
+ "blt 6f\n"
+ "uadalp v0.4s, v1.8h\n"
+ "movi v1.8h, #0x0\n"
+ "mov x9, #0x0\n"
+ "6:" // Height 1: Multiply loop: unique 1: no collapse
+ "uadalp v1.8h, v31.16b\n"
+ "add x9, x9, #0x1\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x20\n"
+ "bge 5b\n"
+ "7:" // Height 1: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q31, [x26, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "uadalp v1.8h, v31.16b\n"
+ "8:" // Height 1: Multiply loop: Main loop skip
+ "cbz x27, 17f\n"
+ "tbz x27, #3, 12f\n"
+ "ldr d31, [x26], #0x8\n"
+ "tbz x27, #2, 10f\n"
+ "ld1 { v31.s }[2], [x26], #0x4\n"
+ "tbz x27, #1, 9f\n"
+ "ld1 { v31.h }[6], [x26], #0x2\n"
+ "tbz x27, #0, 16f\n"
+ "ld1 { v31.b }[14], [x26]\n"
+ "b 16f\n"
+ "9:" // Height 1: Multiply loop: Ragged operand read: partial_1_12
+ "tbz x27, #0, 16f\n"
+ "ld1 { v31.b }[12], [x26]\n"
+ "b 16f\n"
+ "10:" // Height 1: Multiply loop: Ragged operand read: partial_2_8
+ "tbz x27, #1, 11f\n"
+ "ld1 { v31.h }[4], [x26], #0x2\n"
+ "tbz x27, #0, 16f\n"
+ "ld1 { v31.b }[10], [x26]\n"
+ "b 16f\n"
+ "11:" // Height 1: Multiply loop:
Ragged operand read: partial_1_8 + "tbz x27, #0, 16f\n" + "ld1 { v31.b }[8], [x26]\n" + "b 16f\n" + "12:" // Height 1: Multiply loop: Ragged operand read: partial_4_0 + "tbz x27, #2, 14f\n" + "ldr s31, [x26], #0x4\n" + "tbz x27, #1, 13f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "tbz x27, #0, 16f\n" + "ld1 { v31.b }[6], [x26]\n" + "b 16f\n" + "13:" // Height 1: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 16f\n" + "ld1 { v31.b }[4], [x26]\n" + "b 16f\n" + "14:" // Height 1: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 15f\n" + "ldr h31, [x26], #0x2\n" + "tbz x27, #0, 16f\n" + "ld1 { v31.b }[2], [x26]\n" + "b 16f\n" + "15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr b31, [x26, #0x0]\n" + "16:" // Height 1: Multiply loop: Ragged operand read: Done + "uadalp v1.8h, v31.16b\n" + "17:" // Height 1: Multiply loop: No odd multiplies + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 2b\n" + "uadalp v0.4s, v1.8h\n" + "addp v0.4s, v0.4s, v0.4s\n" + "addp v0.4s, v0.4s, v0.4s\n" + "mul v0.4s, v0.4s, v2.4s\n" + "str s0, [%x[out_ptr]], #0x4\n" + "b 104f\n" + "18:" // Height 2 + "movi v1.8h, #0x0\n" + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "mov x9, #0x0\n" + "movi v0.4s, #0x0\n" + "mov x28, #0x0\n" + "movi v30.8h, #0x0\n" + "movi v29.4s, #0x0\n" + "19:" // Height 2: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x19, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 20f\n" + "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n" + "add x19, x19, %x[input_offset], LSL #3\n" + "ldr x26, [x19, #0x0]\n" + "ldr x25, [x19, #0x8]\n" + "cbnz x28, 21f\n" + "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x25, x25, x19\n" + "b 21f\n" + "20:" // Height 2: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, %x[input_offset]\n" + "21:" // Height 2: input setup done + "cmp x27, #0x10\n" + "blt 25f\n" + "cmp x27, #0x20\n" + "blt 24f\n" + "22:" // Height 2: Multiply loop: Main loop head + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "cmp x9, #0x7e\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "blt 23f\n" + "uadalp v0.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "uadalp v29.4s, v30.8h\n" + "movi v30.8h, #0x0\n" + "mov x9, #0x0\n" + "23:" // Height 2: Multiply loop: unique 2: no collapse + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "add x9, x9, #0x1\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 22b\n" + "24:" // Height 2: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "25:" // Height 2: Multiply loop: Main loop skip + "cbz x27, 34f\n" + "tbz x27, #3, 29f\n" + "ldr d31, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "tbz x27, #2, 27f\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "tbz x27, #1, 26f\n" + "ld1 { v31.h }[6], [x26], #0x2\n" + "ld1 { v28.h }[6], [x25], #0x2\n" + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[14], [x26]\n" + "ld1 { v28.b }[14], [x25]\n" + "b 33f\n" + "26:" // Height 2: Multiply loop: Ragged operand read: partial_1_12 + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[12], [x26]\n" + "ld1 { v28.b }[12], [x25]\n" + "b 33f\n" + "27:" // Height 2: Multiply loop: Ragged operand read: partial_2_8 + "tbz x27, #1, 28f\n" + "ld1 { v31.h }[4], [x26], #0x2\n" + "ld1 { v28.h }[4], [x25], #0x2\n" + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[10], 
[x26]\n" + "ld1 { v28.b }[10], [x25]\n" + "b 33f\n" + "28:" // Height 2: Multiply loop: Ragged operand read: partial_1_8 + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[8], [x26]\n" + "ld1 { v28.b }[8], [x25]\n" + "b 33f\n" + "29:" // Height 2: Multiply loop: Ragged operand read: partial_4_0 + "tbz x27, #2, 31f\n" + "ldr s31, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "tbz x27, #1, 30f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "b 33f\n" + "30:" // Height 2: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "b 33f\n" + "31:" // Height 2: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 32f\n" + "ldr h31, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "tbz x27, #0, 33f\n" + "ld1 { v31.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "b 33f\n" + "32:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr b31, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "33:" // Height 2: Multiply loop: Ragged operand read: Done + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "34:" // Height 2: Multiply loop: No odd multiplies + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 19b\n" + "uadalp v0.4s, v1.8h\n" + "uadalp v29.4s, v30.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "addp v0.4s, v0.4s, v0.4s\n" + "mul v0.4s, v0.4s, v2.4s\n" + "str d0, [%x[out_ptr]], #0x8\n" + "b 104f\n" + "35:" // Height 3 + "movi v1.8h, #0x0\n" + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "mov x9, #0x0\n" + "movi v0.4s, #0x0\n" + "mov x28, #0x0\n" + "movi v30.8h, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v27.8h, #0x0\n" + "movi v26.4s, #0x0\n" + "36:" // Height 3: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x19, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 37f\n" + "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n" + "add x19, x19, %x[input_offset], LSL #3\n" + "ldr x26, [x19, #0x0]\n" + "ldr x25, [x19, #0x8]\n" + "ldr x24, [x19, #0x10]\n" + "cbnz x28, 38f\n" + "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "b 38f\n" + "37:" // Height 3: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, %x[input_offset]\n" + "add x24, x25, %x[input_offset]\n" + "38:" // Height 3: input setup done + "cmp x27, #0x10\n" + "blt 42f\n" + "cmp x27, #0x20\n" + "blt 41f\n" + "39:" // Height 3: Multiply loop: Main loop head + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "cmp x9, #0x7e\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "blt 40f\n" + "uadalp v0.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "uadalp v29.4s, v30.8h\n" + "movi v30.8h, #0x0\n" + "uadalp v26.4s, v27.8h\n" + "movi v27.8h, #0x0\n" + "mov x9, #0x0\n" + "40:" // Height 3: Multiply loop: unique 3: no collapse + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "uadalp v27.8h, v25.16b\n" + "add x9, x9, #0x1\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 39b\n" + "41:" // Height 3: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "uadalp v27.8h, v25.16b\n" + "add x24, x24, #0x10\n" + "42:" // Height 3: Multiply loop: Main loop skip + "cbz x27, 51f\n" + 
"tbz x27, #3, 46f\n" + "ldr d31, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "ldr d25, [x24], #0x8\n" + "tbz x27, #2, 44f\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "ld1 { v25.s }[2], [x24], #0x4\n" + "tbz x27, #1, 43f\n" + "ld1 { v31.h }[6], [x26], #0x2\n" + "ld1 { v28.h }[6], [x25], #0x2\n" + "ld1 { v25.h }[6], [x24], #0x2\n" + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[14], [x26]\n" + "ld1 { v28.b }[14], [x25]\n" + "ld1 { v25.b }[14], [x24]\n" + "b 50f\n" + "43:" // Height 3: Multiply loop: Ragged operand read: partial_1_12 + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[12], [x26]\n" + "ld1 { v28.b }[12], [x25]\n" + "ld1 { v25.b }[12], [x24]\n" + "b 50f\n" + "44:" // Height 3: Multiply loop: Ragged operand read: partial_2_8 + "tbz x27, #1, 45f\n" + "ld1 { v31.h }[4], [x26], #0x2\n" + "ld1 { v28.h }[4], [x25], #0x2\n" + "ld1 { v25.h }[4], [x24], #0x2\n" + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[10], [x26]\n" + "ld1 { v28.b }[10], [x25]\n" + "ld1 { v25.b }[10], [x24]\n" + "b 50f\n" + "45:" // Height 3: Multiply loop: Ragged operand read: partial_1_8 + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[8], [x26]\n" + "ld1 { v28.b }[8], [x25]\n" + "ld1 { v25.b }[8], [x24]\n" + "b 50f\n" + "46:" // Height 3: Multiply loop: Ragged operand read: partial_4_0 + "tbz x27, #2, 48f\n" + "ldr s31, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s25, [x24], #0x4\n" + "tbz x27, #1, 47f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v25.h }[2], [x24], #0x2\n" + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v25.b }[6], [x24]\n" + "b 50f\n" + "47:" // Height 3: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "ld1 { v25.b }[4], [x24]\n" + "b 50f\n" + "48:" // Height 3: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 49f\n" + "ldr h31, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "ldr h25, [x24], #0x2\n" + "tbz x27, #0, 50f\n" + "ld1 { v31.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v25.b }[2], [x24]\n" + "b 50f\n" + "49:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr b31, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "ldr b25, [x24, #0x0]\n" + "50:" // Height 3: Multiply loop: Ragged operand read: Done + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "uadalp v27.8h, v25.16b\n" + "51:" // Height 3: Multiply loop: No odd multiplies + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 36b\n" + "uadalp v0.4s, v1.8h\n" + "uadalp v29.4s, v30.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "uadalp v26.4s, v27.8h\n" + "addp v0.4s, v0.4s, v0.4s\n" + "addp v26.4s, v26.4s, v26.4s\n" + "mul v0.4s, v0.4s, v2.4s\n" + "str d0, [%x[out_ptr]], #0x8\n" + "addp v26.4s, v26.4s, v26.4s\n" + "mul v26.4s, v26.4s, v2.4s\n" + "str s26, [%x[out_ptr]], #0x4\n" + "b 104f\n" + "52:" // Height 4 + "movi v1.8h, #0x0\n" + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "mov x9, #0x0\n" + "movi v0.4s, #0x0\n" + "mov x28, #0x0\n" + "movi v30.8h, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v27.8h, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v24.8h, #0x0\n" + "movi v23.4s, #0x0\n" + "53:" // Height 4: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x19, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 54f\n" + "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n" + "add x19, x19, %x[input_offset], LSL #3\n" + "ldr x26, [x19, #0x0]\n" + "ldr x25, [x19, #0x8]\n" + "ldr x24, [x19, #0x10]\n" + "ldr x23, [x19, 
#0x18]\n" + "cbnz x28, 55f\n" + "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "b 55f\n" + "54:" // Height 4: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, %x[input_offset]\n" + "add x24, x25, %x[input_offset]\n" + "add x23, x24, %x[input_offset]\n" + "55:" // Height 4: input setup done + "cmp x27, #0x10\n" + "blt 59f\n" + "cmp x27, #0x20\n" + "blt 58f\n" + "56:" // Height 4: Multiply loop: Main loop head + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "cmp x9, #0x7e\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "blt 57f\n" + "uadalp v0.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "uadalp v29.4s, v30.8h\n" + "movi v30.8h, #0x0\n" + "uadalp v26.4s, v27.8h\n" + "movi v27.8h, #0x0\n" + "uadalp v23.4s, v24.8h\n" + "movi v24.8h, #0x0\n" + "mov x9, #0x0\n" + "57:" // Height 4: Multiply loop: unique 4: no collapse + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "uadalp v27.8h, v25.16b\n" + "uadalp v24.8h, v22.16b\n" + "add x9, x9, #0x1\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 56b\n" + "58:" // Height 4: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "add x26, x26, #0x10\n" + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "uadalp v27.8h, v25.16b\n" + "uadalp v24.8h, v22.16b\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "59:" // Height 4: Multiply loop: Main loop skip + "cbz x27, 68f\n" + "tbz x27, #3, 63f\n" + "ldr d31, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "ldr d25, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "tbz x27, #2, 61f\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "ld1 { v25.s }[2], [x24], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" + "tbz x27, #1, 60f\n" + "ld1 { v31.h }[6], [x26], #0x2\n" + "ld1 { v28.h }[6], [x25], #0x2\n" + "ld1 { v25.h }[6], [x24], #0x2\n" + "ld1 { v22.h }[6], [x23], #0x2\n" + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[14], [x26]\n" + "ld1 { v28.b }[14], [x25]\n" + "ld1 { v25.b }[14], [x24]\n" + "ld1 { v22.b }[14], [x23]\n" + "b 67f\n" + "60:" // Height 4: Multiply loop: Ragged operand read: partial_1_12 + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[12], [x26]\n" + "ld1 { v28.b }[12], [x25]\n" + "ld1 { v25.b }[12], [x24]\n" + "ld1 { v22.b }[12], [x23]\n" + "b 67f\n" + "61:" // Height 4: Multiply loop: Ragged operand read: partial_2_8 + "tbz x27, #1, 62f\n" + "ld1 { v31.h }[4], [x26], #0x2\n" + "ld1 { v28.h }[4], [x25], #0x2\n" + "ld1 { v25.h }[4], [x24], #0x2\n" + "ld1 { v22.h }[4], [x23], #0x2\n" + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[10], [x26]\n" + "ld1 { v28.b }[10], [x25]\n" + "ld1 { v25.b }[10], [x24]\n" + "ld1 { v22.b }[10], [x23]\n" + "b 67f\n" + "62:" // Height 4: Multiply loop: Ragged operand read: partial_1_8 + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[8], [x26]\n" + "ld1 { v28.b }[8], [x25]\n" + "ld1 { v25.b }[8], [x24]\n" + "ld1 { v22.b }[8], [x23]\n" + "b 67f\n" + "63:" // Height 4: Multiply loop: Ragged operand read: partial_4_0 + "tbz x27, #2, 65f\n" + "ldr s31, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s25, [x24], #0x4\n" + "ldr s22, [x23], #0x4\n" + "tbz x27, #1, 64f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v25.h }[2], [x24], #0x2\n" + 
"ld1 { v22.h }[2], [x23], #0x2\n" + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v25.b }[6], [x24]\n" + "ld1 { v22.b }[6], [x23]\n" + "b 67f\n" + "64:" // Height 4: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "ld1 { v25.b }[4], [x24]\n" + "ld1 { v22.b }[4], [x23]\n" + "b 67f\n" + "65:" // Height 4: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 66f\n" + "ldr h31, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "ldr h25, [x24], #0x2\n" + "ldr h22, [x23], #0x2\n" + "tbz x27, #0, 67f\n" + "ld1 { v31.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v25.b }[2], [x24]\n" + "ld1 { v22.b }[2], [x23]\n" + "b 67f\n" + "66:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr b31, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "ldr b25, [x24, #0x0]\n" + "ldr b22, [x23, #0x0]\n" + "67:" // Height 4: Multiply loop: Ragged operand read: Done + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "uadalp v27.8h, v25.16b\n" + "uadalp v24.8h, v22.16b\n" + "68:" // Height 4: Multiply loop: No odd multiplies + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 53b\n" + "uadalp v0.4s, v1.8h\n" + "uadalp v29.4s, v30.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "uadalp v26.4s, v27.8h\n" + "uadalp v23.4s, v24.8h\n" + "addp v29.4s, v26.4s, v23.4s\n" + "addp v0.4s, v0.4s, v29.4s\n" + "mul v0.4s, v0.4s, v2.4s\n" + "st1 { v0.4s }, [%x[out_ptr]], #0x10\n" + "b 104f\n" + "69:" // Height 5 + "movi v1.8h, #0x0\n" + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "mov x9, #0x0\n" + "movi v0.4s, #0x0\n" + "mov x28, #0x0\n" + "movi v30.8h, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v27.8h, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v24.8h, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v21.8h, #0x0\n" + "movi v20.4s, #0x0\n" + "70:" // Height 5: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x19, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 71f\n" + "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n" + "add x19, x19, %x[input_offset], LSL #3\n" + "ldr x26, [x19, #0x0]\n" + "ldr x25, [x19, #0x8]\n" + "ldr x24, [x19, #0x10]\n" + "ldr x23, [x19, #0x18]\n" + "ldr x22, [x19, #0x20]\n" + "cbnz x28, 72f\n" + "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "b 72f\n" + "71:" // Height 5: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, %x[input_offset]\n" + "add x24, x25, %x[input_offset]\n" + "add x23, x24, %x[input_offset]\n" + "add x22, x23, %x[input_offset]\n" + "72:" // Height 5: input setup done + "cmp x27, #0x10\n" + "blt 76f\n" + "cmp x27, #0x20\n" + "blt 75f\n" + "73:" // Height 5: Multiply loop: Main loop head + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "ldr q19, [x22, #0x0]\n" + "cmp x9, #0x7e\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "add x22, x22, #0x10\n" + "blt 74f\n" + "uadalp v0.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "uadalp v29.4s, v30.8h\n" + "movi v30.8h, #0x0\n" + "uadalp v26.4s, v27.8h\n" + "movi v27.8h, #0x0\n" + "uadalp v23.4s, v24.8h\n" + "movi v24.8h, #0x0\n" + "uadalp v20.4s, v21.8h\n" + "movi v21.8h, #0x0\n" + "mov x9, #0x0\n" + "74:" // Height 5: Multiply loop: unique 5: no collapse + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "uadalp 
v27.8h, v25.16b\n" + "uadalp v24.8h, v22.16b\n" + "uadalp v21.8h, v19.16b\n" + "add x9, x9, #0x1\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 73b\n" + "75:" // Height 5: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "ldr q19, [x22, #0x0]\n" + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "uadalp v27.8h, v25.16b\n" + "uadalp v24.8h, v22.16b\n" + "uadalp v21.8h, v19.16b\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "add x22, x22, #0x10\n" + "76:" // Height 5: Multiply loop: Main loop skip + "cbz x27, 85f\n" + "tbz x27, #3, 80f\n" + "ldr d31, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "ldr d25, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "tbz x27, #2, 78f\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "ld1 { v25.s }[2], [x24], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" + "ld1 { v19.s }[2], [x22], #0x4\n" + "tbz x27, #1, 77f\n" + "ld1 { v31.h }[6], [x26], #0x2\n" + "ld1 { v28.h }[6], [x25], #0x2\n" + "ld1 { v25.h }[6], [x24], #0x2\n" + "ld1 { v22.h }[6], [x23], #0x2\n" + "ld1 { v19.h }[6], [x22], #0x2\n" + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[14], [x26]\n" + "ld1 { v28.b }[14], [x25]\n" + "ld1 { v25.b }[14], [x24]\n" + "ld1 { v22.b }[14], [x23]\n" + "ld1 { v19.b }[14], [x22]\n" + "b 84f\n" + "77:" // Height 5: Multiply loop: Ragged operand read: partial_1_12 + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[12], [x26]\n" + "ld1 { v28.b }[12], [x25]\n" + "ld1 { v25.b }[12], [x24]\n" + "ld1 { v22.b }[12], [x23]\n" + "ld1 { v19.b }[12], [x22]\n" + "b 84f\n" + "78:" // Height 5: Multiply loop: Ragged operand read: partial_2_8 + "tbz x27, #1, 79f\n" + "ld1 { v31.h }[4], [x26], #0x2\n" + "ld1 { v28.h }[4], [x25], #0x2\n" + "ld1 { v25.h }[4], [x24], #0x2\n" + "ld1 { v22.h }[4], [x23], #0x2\n" + "ld1 { v19.h }[4], [x22], #0x2\n" + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[10], [x26]\n" + "ld1 { v28.b }[10], [x25]\n" + "ld1 { v25.b }[10], [x24]\n" + "ld1 { v22.b }[10], [x23]\n" + "ld1 { v19.b }[10], [x22]\n" + "b 84f\n" + "79:" // Height 5: Multiply loop: Ragged operand read: partial_1_8 + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[8], [x26]\n" + "ld1 { v28.b }[8], [x25]\n" + "ld1 { v25.b }[8], [x24]\n" + "ld1 { v22.b }[8], [x23]\n" + "ld1 { v19.b }[8], [x22]\n" + "b 84f\n" + "80:" // Height 5: Multiply loop: Ragged operand read: partial_4_0 + "tbz x27, #2, 82f\n" + "ldr s31, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s25, [x24], #0x4\n" + "ldr s22, [x23], #0x4\n" + "ldr s19, [x22], #0x4\n" + "tbz x27, #1, 81f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v25.h }[2], [x24], #0x2\n" + "ld1 { v22.h }[2], [x23], #0x2\n" + "ld1 { v19.h }[2], [x22], #0x2\n" + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v25.b }[6], [x24]\n" + "ld1 { v22.b }[6], [x23]\n" + "ld1 { v19.b }[6], [x22]\n" + "b 84f\n" + "81:" // Height 5: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 84f\n" + "ld1 { v31.b }[4], [x26]\n" + "ld1 { v28.b }[4], [x25]\n" + "ld1 { v25.b }[4], [x24]\n" + "ld1 { v22.b }[4], [x23]\n" + "ld1 { v19.b }[4], [x22]\n" + "b 84f\n" + "82:" // Height 5: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 83f\n" + "ldr h31, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "ldr h25, [x24], #0x2\n" + "ldr h22, [x23], #0x2\n" + "ldr h19, [x22], #0x2\n" + 
"tbz x27, #0, 84f\n" + "ld1 { v31.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v25.b }[2], [x24]\n" + "ld1 { v22.b }[2], [x23]\n" + "ld1 { v19.b }[2], [x22]\n" + "b 84f\n" + "83:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "ldr b31, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "ldr b25, [x24, #0x0]\n" + "ldr b22, [x23, #0x0]\n" + "ldr b19, [x22, #0x0]\n" + "84:" // Height 5: Multiply loop: Ragged operand read: Done + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "uadalp v27.8h, v25.16b\n" + "uadalp v24.8h, v22.16b\n" + "uadalp v21.8h, v19.16b\n" + "85:" // Height 5: Multiply loop: No odd multiplies + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 70b\n" + "uadalp v0.4s, v1.8h\n" + "uadalp v29.4s, v30.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "uadalp v26.4s, v27.8h\n" + "uadalp v23.4s, v24.8h\n" + "addp v29.4s, v26.4s, v23.4s\n" + "uadalp v20.4s, v21.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "addp v20.4s, v20.4s, v20.4s\n" + "mul v0.4s, v0.4s, v2.4s\n" + "st1 { v0.4s }, [%x[out_ptr]], #0x10\n" + "addp v20.4s, v20.4s, v20.4s\n" + "mul v20.4s, v20.4s, v2.4s\n" + "str s20, [%x[out_ptr]], #0x4\n" + "b 104f\n" + "86:" // Height 6 + "movi v1.8h, #0x0\n" + "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n" + "mov x9, #0x0\n" + "movi v0.4s, #0x0\n" + "mov x28, #0x0\n" + "movi v30.8h, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v27.8h, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v24.8h, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v21.8h, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v18.8h, #0x0\n" + "movi v17.4s, #0x0\n" + "87:" // Height 6: String loop + "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr w27, [x19, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 88f\n" + "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n" + "add x19, x19, %x[input_offset], LSL #3\n" + "ldr x26, [x19, #0x0]\n" + "ldr x25, [x19, #0x8]\n" + "ldr x24, [x19, #0x10]\n" + "ldr x23, [x19, #0x18]\n" + "ldr x22, [x19, #0x20]\n" + "ldr x20, [x19, #0x28]\n" + "cbnz x28, 89f\n" + "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x19\n" + "add x25, x25, x19\n" + "add x24, x24, x19\n" + "add x23, x23, x19\n" + "add x22, x22, x19\n" + "add x20, x20, x19\n" + "b 89f\n" + "88:" // Height 6: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, %x[input_offset]\n" + "add x24, x25, %x[input_offset]\n" + "add x23, x24, %x[input_offset]\n" + "add x22, x23, %x[input_offset]\n" + "add x20, x22, %x[input_offset]\n" + "89:" // Height 6: input setup done + "cmp x27, #0x10\n" + "blt 93f\n" + "cmp x27, #0x20\n" + "blt 92f\n" + "90:" // Height 6: Multiply loop: Main loop head + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "ldr q19, [x22, #0x0]\n" + "ldr q16, [x20, #0x0]\n" + "cmp x9, #0x7e\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "add x22, x22, #0x10\n" + "add x20, x20, #0x10\n" + "blt 91f\n" + "uadalp v0.4s, v1.8h\n" + "movi v1.8h, #0x0\n" + "uadalp v29.4s, v30.8h\n" + "movi v30.8h, #0x0\n" + "uadalp v26.4s, v27.8h\n" + "movi v27.8h, #0x0\n" + "uadalp v23.4s, v24.8h\n" + "movi v24.8h, #0x0\n" + "uadalp v20.4s, v21.8h\n" + "movi v21.8h, #0x0\n" + "uadalp v17.4s, v18.8h\n" + "movi v18.8h, #0x0\n" + "mov x9, #0x0\n" + "91:" // Height 6: Multiply loop: unique 6: no collapse + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "uadalp v27.8h, v25.16b\n" + "uadalp v24.8h, v22.16b\n" + "uadalp v21.8h, v19.16b\n" + "uadalp v18.8h, v16.16b\n" + "add x9, x9, 
#0x1\n" + "sub x27, x27, #0x10\n" + "cmp x27, #0x20\n" + "bge 90b\n" + "92:" // Height 6: Multiply loop: Single iteration only + "sub x27, x27, #0x10\n" + "ldr q31, [x26, #0x0]\n" + "ldr q28, [x25, #0x0]\n" + "ldr q25, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "ldr q19, [x22, #0x0]\n" + "ldr q16, [x20, #0x0]\n" + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "uadalp v27.8h, v25.16b\n" + "uadalp v24.8h, v22.16b\n" + "uadalp v21.8h, v19.16b\n" + "uadalp v18.8h, v16.16b\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "add x22, x22, #0x10\n" + "add x20, x20, #0x10\n" + "93:" // Height 6: Multiply loop: Main loop skip + "cbz x27, 102f\n" + "tbz x27, #3, 97f\n" + "ldr d31, [x26], #0x8\n" + "ldr d28, [x25], #0x8\n" + "ldr d25, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "ldr d19, [x22], #0x8\n" + "ldr d16, [x20], #0x8\n" + "tbz x27, #2, 95f\n" + "ld1 { v31.s }[2], [x26], #0x4\n" + "ld1 { v28.s }[2], [x25], #0x4\n" + "ld1 { v25.s }[2], [x24], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" + "ld1 { v19.s }[2], [x22], #0x4\n" + "ld1 { v16.s }[2], [x20], #0x4\n" + "tbz x27, #1, 94f\n" + "ld1 { v31.h }[6], [x26], #0x2\n" + "ld1 { v28.h }[6], [x25], #0x2\n" + "ld1 { v25.h }[6], [x24], #0x2\n" + "ld1 { v22.h }[6], [x23], #0x2\n" + "ld1 { v19.h }[6], [x22], #0x2\n" + "ld1 { v16.h }[6], [x20], #0x2\n" + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[14], [x26]\n" + "ld1 { v28.b }[14], [x25]\n" + "ld1 { v25.b }[14], [x24]\n" + "ld1 { v22.b }[14], [x23]\n" + "ld1 { v19.b }[14], [x22]\n" + "ld1 { v16.b }[14], [x20]\n" + "b 101f\n" + "94:" // Height 6: Multiply loop: Ragged operand read: partial_1_12 + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[12], [x26]\n" + "ld1 { v28.b }[12], [x25]\n" + "ld1 { v25.b }[12], [x24]\n" + "ld1 { v22.b }[12], [x23]\n" + "ld1 { v19.b }[12], [x22]\n" + "ld1 { v16.b }[12], [x20]\n" + "b 101f\n" + "95:" // Height 6: Multiply loop: Ragged operand read: partial_2_8 + "tbz x27, #1, 96f\n" + "ld1 { v31.h }[4], [x26], #0x2\n" + "ld1 { v28.h }[4], [x25], #0x2\n" + "ld1 { v25.h }[4], [x24], #0x2\n" + "ld1 { v22.h }[4], [x23], #0x2\n" + "ld1 { v19.h }[4], [x22], #0x2\n" + "ld1 { v16.h }[4], [x20], #0x2\n" + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[10], [x26]\n" + "ld1 { v28.b }[10], [x25]\n" + "ld1 { v25.b }[10], [x24]\n" + "ld1 { v22.b }[10], [x23]\n" + "ld1 { v19.b }[10], [x22]\n" + "ld1 { v16.b }[10], [x20]\n" + "b 101f\n" + "96:" // Height 6: Multiply loop: Ragged operand read: partial_1_8 + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[8], [x26]\n" + "ld1 { v28.b }[8], [x25]\n" + "ld1 { v25.b }[8], [x24]\n" + "ld1 { v22.b }[8], [x23]\n" + "ld1 { v19.b }[8], [x22]\n" + "ld1 { v16.b }[8], [x20]\n" + "b 101f\n" + "97:" // Height 6: Multiply loop: Ragged operand read: partial_4_0 + "tbz x27, #2, 99f\n" + "ldr s31, [x26], #0x4\n" + "ldr s28, [x25], #0x4\n" + "ldr s25, [x24], #0x4\n" + "ldr s22, [x23], #0x4\n" + "ldr s19, [x22], #0x4\n" + "ldr s16, [x20], #0x4\n" + "tbz x27, #1, 98f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v25.h }[2], [x24], #0x2\n" + "ld1 { v22.h }[2], [x23], #0x2\n" + "ld1 { v19.h }[2], [x22], #0x2\n" + "ld1 { v16.h }[2], [x20], #0x2\n" + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[6], [x26]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v25.b }[6], [x24]\n" + "ld1 { v22.b }[6], [x23]\n" + "ld1 { v19.b }[6], [x22]\n" + "ld1 { v16.b }[6], [x20]\n" + "b 101f\n" + "98:" // Height 6: Multiply loop: Ragged operand read: partial_1_4 + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[4], [x26]\n" + "ld1 { 
v28.b }[4], [x25]\n" + "ld1 { v25.b }[4], [x24]\n" + "ld1 { v22.b }[4], [x23]\n" + "ld1 { v19.b }[4], [x22]\n" + "ld1 { v16.b }[4], [x20]\n" + "b 101f\n" + "99:" // Height 6: Multiply loop: Ragged operand read: partial_2_0 + "tbz x27, #1, 100f\n" + "ldr h31, [x26], #0x2\n" + "ldr h28, [x25], #0x2\n" + "ldr h25, [x24], #0x2\n" + "ldr h22, [x23], #0x2\n" + "ldr h19, [x22], #0x2\n" + "ldr h16, [x20], #0x2\n" + "tbz x27, #0, 101f\n" + "ld1 { v31.b }[2], [x26]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v25.b }[2], [x24]\n" + "ld1 { v22.b }[2], [x23]\n" + "ld1 { v19.b }[2], [x22]\n" + "ld1 { v16.b }[2], [x20]\n" + "b 101f\n" + "100:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "ldr b31, [x26, #0x0]\n" + "ldr b28, [x25, #0x0]\n" + "ldr b25, [x24, #0x0]\n" + "ldr b22, [x23, #0x0]\n" + "ldr b19, [x22, #0x0]\n" + "ldr b16, [x20, #0x0]\n" + "101:" // Height 6: Multiply loop: Ragged operand read: Done + "uadalp v1.8h, v31.16b\n" + "uadalp v30.8h, v28.16b\n" + "uadalp v27.8h, v25.16b\n" + "uadalp v24.8h, v22.16b\n" + "uadalp v21.8h, v19.16b\n" + "uadalp v18.8h, v16.16b\n" + "102:" // Height 6: Multiply loop: No odd multiplies + "add x28, x28, #0x1\n" + "cmp x28, x21\n" + "bne 87b\n" + "uadalp v0.4s, v1.8h\n" + "uadalp v29.4s, v30.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "uadalp v26.4s, v27.8h\n" + "uadalp v23.4s, v24.8h\n" + "addp v29.4s, v26.4s, v23.4s\n" + "uadalp v20.4s, v21.8h\n" + "uadalp v17.4s, v18.8h\n" + "addp v0.4s, v0.4s, v29.4s\n" + "subs %x[M], %x[M], #0x6\n" + "addp v20.4s, v20.4s, v17.4s\n" + "mul v0.4s, v0.4s, v2.4s\n" + "st1 { v0.4s }, [%x[out_ptr]], #0x10\n" + "addp v20.4s, v20.4s, v20.4s\n" + "mul v20.4s, v20.4s, v2.4s\n" + "str d20, [%x[out_ptr]], #0x8\n" + "beq 104f\n" + "tbz %x[flags], #3, 103f\n" + "add %x[input_offset], %x[input_offset], #0x6\n" + "b 1b\n" + "103:" // Update direct input + "mov x19, #0x6\n" + "madd %x[input_ptr], x19, %x[input_offset], %x[input_ptr]\n" + "b 1b\n" + "104:" // Exit + + : [M] "+r" (M), [input_offset] "+r" (input_offset), [input_ptr] "+r" (input_ptr), [out_ptr] "+r" (out_ptr) + : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [flags] "r" (flags), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [qp] "r" (qp) + : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace arm_gemm + +#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp index 1d3aee7911..4669be9993 100644 --- a/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp +++ b/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp @@ -23,8 +23,10 @@ */ #pragma once +#include "convolver.hpp" #include "mergeresults.hpp" #include "transform.hpp" +#include "interleave_indirect.hpp" namespace arm_gemm { @@ -39,14 +41,26 @@ namespace arm_gemm { * The optional 'block' parameter is for kernels using dot-product type * instructions like UDOT and SDOT. 
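#include <cstdint>

// The PrepareA() rewrite in the hunk below swaps the old Transform<>() call for the
// new Interleave<>() entry point (interleave_indirect.cpp), threading an
// integrate_sums flag and a row_sum_multiplier so that quantized GEMMs can fold the
// per-row sums into the packing pass -- the same quantity the rowsum_indirect_{s8,u8}
// kernels above compute for the unpacked, indirect case. A minimal scalar sketch of
// that contract; interleave_with_row_sums is a hypothetical name, not the library API:
static void interleave_with_row_sums(uint8_t *out_data, int32_t *out_sums,
                                     const uint8_t *in, int ldin, int height, int k,
                                     int32_t row_sum_multiplier) {
    for (int x = 0; x < k; x++) {          // interleave: element x of every row adjacent
        for (int y = 0; y < height; y++) {
            *out_data++ = in[y * ldin + x];
        }
    }
    for (int y = 0; y < height; y++) {     // integrated per-row sums
        int32_t sum = 0;
        for (int x = 0; x < k; x++) {
            sum += in[y * ldin + x];
        }
        // Pre-multiplied (typically by -b_offset) so the output stage can just add.
        *out_sums++ = sum * row_sum_multiplier;
    }
}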
*/ -template +template class StdTransformsFixed { public: template void PrepareA(TOperand *out, const TIn *in, const int stride, const int y0, - const int ymax, const int k0, const int kmax) const { - Transform(out, in, stride, y0, ymax, k0, kmax); + const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) const { + Interleave(out, in, stride, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier); + } + + template + void PrepareA_indirect(TOperand *out, const TIn * const * const *ptr, size_t stringlen, size_t rounded_stringlen, const int y0, + const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) { + IndirectInterleave(out, ptr, stringlen, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier); + } + + template + void PrepareA_convolution(TOperand *out, const TIn *ptr, size_t stride, const convolver &conv, size_t rounded_stringlen, + const int y0, const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) { + ConvolutionInterleave(out, ptr, stride, conv, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier); } template diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp index 13c4c477c6..3256d919ea 100644 --- a/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp +++ b/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp @@ -23,6 +23,7 @@ */ #pragma once +#include "convolver.hpp" #include "mergeresults.hpp" #include "transform.hpp" @@ -38,20 +39,32 @@ namespace arm_gemm { * The optional 'block' parameter is for kernels using dot-product type * instructions like UDOT and SDOT. */ -template +template class StdTransformsSVE { public: template void PrepareA(TOperand *out, const TIn *in, const int stride, const int y0, - const int ymax, const int k0, const int kmax) { - Transform(out, in, stride, y0, ymax, k0, kmax); + const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) { + Interleave(out, in, stride, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier); + } + + template + void PrepareA_indirect(TOperand *out, const TIn * const * const *ptr, size_t stringlen, size_t rounded_stringlen, const int y0, + const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) { + IndirectInterleave(out, ptr, stringlen, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier); + } + + template + void PrepareA_convolution(TOperand *out, const TIn *ptr, size_t stride, const convolver &conv, size_t rounded_stringlen, + const int y0, const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) { + ConvolutionInterleave(out, ptr, stride, conv, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier); } template void PrepareB(TOperand *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax) { - Transform(out, in, stride, x0, xmax, k0, kmax); + Transform(out, in, stride, x0, xmax, k0, kmax); } template diff --git a/src/core/NEON/kernels/arm_gemm/transform.hpp b/src/core/NEON/kernels/arm_gemm/transform.hpp index c6ea079882..5efeee5d35 100644 --- a/src/core/NEON/kernels/arm_gemm/transform.hpp +++ b/src/core/NEON/kernels/arm_gemm/transform.hpp @@ -38,13 +38,13 @@ namespace arm_gemm { * Need to cope with the work requested in either dimension not actually * being a multiple of the block sizes. 
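#include <cstddef>

// transform.hpp (next hunk) replaces the boolean 'sve' template parameter with a
// VLType enumeration, so the interleave factor scales with the hardware vector
// length only for SVE specialisations. A compile-check sketch of that predicate;
// the enumerator names and the get_vector_length() stand-in are assumptions here:
enum class VLType { None, SVE };

template <typename T>
static unsigned int get_vector_length() {
    return static_cast<unsigned int>(16 / sizeof(T)); // placeholder; the real helper reads the SVE VL
}

template <typename TOut>
static unsigned int effective_int_by(unsigned int tIntBy, unsigned int BlockBy, VLType vlt) {
    // Mirrors the new test: fixed factor for NEON, VL-scaled for SVE.
    return tIntBy * (vlt == VLType::SVE ? get_vector_length<TOut>() / BlockBy : 1u);
}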
*/ -template +template struct TransformImpl { template static void Transform(TOut* out, const TIn* const in, const int stride, const int y0, const int ymax, const int x0, const int xmax) { // For SVE cases we multiply the interleave factor by the vector length. - const unsigned int IntBy = tIntBy * (sve ? get_vector_length() / BlockBy : 1); + const unsigned int IntBy = tIntBy * (vlt == VLType::SVE ? get_vector_length() / BlockBy : 1); const int n_whole_y_blocks = (ymax - y0) / IntBy; const int y_remainders = (ymax - y0) % IntBy; @@ -105,13 +105,13 @@ struct TransformImpl { }; /*****************************************************************************/ -template +template void Transform( TOut* out, const TIn* const in, const int stride, const int k0, const int kmax, const int x0, const int xmax ) { // Redirect to a specialised implementation predicated on argument size. - TransformImpl::Transform( + TransformImpl::Transform( out, in, stride, k0, kmax, x0, xmax ); } diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp deleted file mode 100644 index 2df5d1bd28..0000000000 --- a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright (c) 2017-2018 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __arm__ - -#include - -#include "../asmlib.hpp" - -template<> -template -inline void TransformImpl<6, 1, false, 4, 4, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) { - uint32_t *outptr = reinterpret_cast(out); - const uint32_t *inptr = reinterpret_cast(in); - bool first = true; - - uint32_t zerobuff[16] = { 0 }; // 8 for asm loop plus up to 7 for overflow loop - - for (int y=y0; y7) || first;x-=8) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - /* 'first' forces this to always run at least once, needed if the total size is <=7. 
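#include <cstdint>

// The ragged-row idiom used by this deleted transform (and its a64 siblings below):
// when fewer than 'height' rows remain, surplus row pointers are repointed at a
// zeroed stack buffer, so the asm loop can always read a full set of rows and the
// padding falls out as zeroes. The fall-through switch that follows is equivalent
// to this loop (the first row is always valid in the caller, so it is never remapped):
static void pad_ragged_rows(const uint32_t *inptr[], int height, int y, int ymax,
                            const uint32_t *zerobuff) {
    for (int r = 1; r < height; r++) {
        if (y + r >= ymax) {
            inptr[r] = zerobuff; // surplus row: read zeroes instead of past the end
        }
    }
}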
*/ - if ((y + 5) >= ymax) { - switch ((y + 5) - ymax) { - case 4: - inptr1 = zerobuff; - // fall through - case 3: - inptr2 = zerobuff; - // fall through - case 2: - inptr3 = zerobuff; - // fall through - case 1: - inptr4 = zerobuff; - // fall through - case 0: - inptr5 = zerobuff; - break; - - default: - UNREACHABLE("Impossible."); - } - } - - if (first) { - if (x<=7) { - break; - } - - first = false; - } - - __asm __volatile ( - // Load up 8 elements (2 vectors) from each of 8 sources. - "VLD1.32 {d0-d3}, [%[inptr0]]!\n" // q0=A0A1A2A3 - "VLD1.32 {d4-d7}, [%[inptr1]]!\n" // q2=B0B1B2B3 - "VLD1.32 {d8-d11}, [%[inptr2]]!\n" // q4=C0C1C2C3 - "VZIP.32 q0, q4\n" // q0=A0C0A1C1, q4 = A2C2A3C3 - "VLD1.32 {d12-d15}, [%[inptr3]]!\n" // q6=D0D1D2D3 - "VZIP.32 q2, q6\n" // q2=B0D0B1D1, q6 = B2D2B3D3 - "VLD1.32 {d16-d19}, [%[inptr4]]!\n" - "VLD1.32 {d20-d23}, [%[inptr5]]!\n" - "VZIP.32 q8, q10\n" // q8=E0F0E1F1, q10 = E2F2E3F3 - ASM_PREFETCH("[%[inptr0], #128]") - "VZIP.32 q0, q2\n" // q0 = A0B0C0D0, q2 = A1B1C1D1 - - // Store first elements - "VST1.32 {d0-d1}, [%[outptr]]!\n" - "VST1.32 {d16}, [%[outptr]]!\n" - - "VZIP.32 q4, q6\n" // q4 = A2B2C2D2, q6 = A3B3C3D3 - - // Store second elements - "VST1.32 {d4-d5}, [%[outptr]]!\n" - "VZIP.32 q1, q5\n" - ASM_PREFETCH("[%[inptr1], #128]") - "VST1.32 {d17}, [%[outptr]]!\n" - "VZIP.32 q3, q7\n" - - // Store third elements - "VZIP.32 q9, q11\n" - "VST1.32 {d8-d9}, [%[outptr]]!\n" - "VZIP.32 q1, q3\n" - ASM_PREFETCH("[%[inptr2], #128]") - "VST1.32 {d20}, [%[outptr]]!\n" - - // Store fourth elements - "VZIP.32 q5, q7\n" - "VST1.32 {d12-d13}, [%[outptr]]!\n" - ASM_PREFETCH("[%[inptr3], #128]") - "VST1.32 {d21}, [%[outptr]]!\n" - - // Fifth - "VST1.32 {d2-d3}, [%[outptr]]!\n" - ASM_PREFETCH("[%[inptr4], #128]") - "VST1.32 {d18}, [%[outptr]]!\n" - - // Sixth - "VST1.32 {d6-d7}, [%[outptr]]!\n" - ASM_PREFETCH("[%[inptr5], #128]") - "VST1.32 {d19}, [%[outptr]]!\n" - - // Seventh - "VST1.32 {d10-d11}, [%[outptr]]!\n" - "VST1.32 {d22}, [%[outptr]]!\n" - - // Eighth - "VST1.32 {d14-d15}, [%[outptr]]!\n" - "VST1.32 {d23}, [%[outptr]]!\n" - - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [outptr] "+r" (outptr) - : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "memory" - ); - } - - for (;x>0;x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - } - } -} - -#endif // __arm__ diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp index 8f0b8ae63f..3ce1d328a7 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp @@ -30,22 +30,22 @@ // Generic unblocked transposed 8x32-bit sized specialisation template <> template -inline void TransformImpl<8, 1, true, 4, 4, false>::Transform( +inline void TransformImpl<8, 1, true, 4, 4, VLType::None>::Transform( T* out, const T* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { // Redirect to a 16x uint16_t specialisation - TransformImpl<16, 1, true, 2, 2, false>::Transform( + TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform( reinterpret_cast(out), reinterpret_cast(in), stride*2, x0*2, xmax*2, k0, 
kmax ); } -// Generic 12x16-bit sized specialisation +// Generic 16x16-bit sized specialisation template <> template -inline void TransformImpl<16, 1, true, 2, 2, false>::Transform( +inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform( T* out, const T* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { @@ -117,7 +117,7 @@ inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(con template <> template <> -inline void TransformImpl<16, 1, true, 2, 2, false>::Transform( +inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform( uint16_t* out, const uint16_t* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp deleted file mode 100644 index 9b6f4de543..0000000000 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
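#include <cstdint>

// On the 8x32-bit -> 16x16-bit redirect above: a transposed interleave copies, for
// each k-row, a contiguous run of IntBy elements, so the bytes moved per run are
// identical whether viewed as 8 uint32_t or 16 uint16_t. Reinterpreting the pointers
// and doubling stride, x0 and xmax (now counted in 16-bit units) therefore yields a
// byte-identical result. Scalar sketch of the traversal (ragged edges simplified;
// the real specialisations pad remainders):
static void transpose_interleave(uint16_t *out, const uint16_t *in, int stride,
                                 int IntBy, int x0, int xmax, int k0, int kmax) {
    for (int x = x0; x < xmax; x += IntBy) {   // one block of IntBy columns at a time
        for (int k = k0; k < kmax; k++) {      // each k-row contributes one run
            for (int i = 0; i < IntBy && x + i < xmax; i++) {
                *out++ = in[k * stride + x + i]; // contiguous run: element-size agnostic
            }
        }
    }
}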
- */ -#pragma once - -#ifdef __aarch64__ - -#include - -#include "../asmlib.hpp" -#include "../utils.hpp" - -template<> -template -void TransformImpl<4, 16, false, 1, 1, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) { - uint8_t *outptr = (uint8_t *)out; - const uint8_t *inptr = (uint8_t *)in; - - uint8_t zerobuff[16] = { 0 }; - - for (int y=y0; y(y) * ldin + k0; - const uint8_t *inptr1 = inptr0 + ldin; - const uint8_t *inptr2 = inptr1 + ldin; - const uint8_t *inptr3 = inptr2 + ldin; - - prefetch_2x(inptr0); - prefetch_2x(inptr1); - prefetch_2x(inptr2); - prefetch_2x(inptr3); - - int x=(kmax-k0); - for (;x>15;x-=16) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - if ((y + 3) >= ymax) { - switch ((y + 3) - ymax) { - case 2: - inptr1 = zerobuff; - // fall through - case 1: - inptr2 = zerobuff; - // fall through - case 0: - inptr3 = zerobuff; - break; - - default: - UNREACHABLE("Impossible."); - } - } - - __asm __volatile ( - "LDR q0, [%[inptr0]], #16\n" - ASM_PREFETCH("[%[inptr0], #176]") - "LDR q1, [%[inptr1]], #16\n" - ASM_PREFETCH("[%[inptr1], #176]") - "STP q0, q1, [%[outptr]], #32\n" - "LDR q0, [%[inptr2]], #16\n" - ASM_PREFETCH("[%[inptr2], #176]") - "LDR q1, [%[inptr3]], #16\n" - ASM_PREFETCH("[%[inptr3], #176]") - "STP q0, q1, [%[outptr]], #32\n" - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [outptr] "+r" (outptr) - : - : "v0", "v1" - ); - } - - if (x>0) { - /* Need to duplicate this here, in case we didn't run the main loop. */ - if ((y + 3) >= ymax) { - switch ((y + 3) - ymax) { - case 2: - inptr1 = zerobuff; - // fall through - case 1: - inptr2 = zerobuff; - // fall through - case 0: - inptr3 = zerobuff; - break; - - default: - UNREACHABLE("Impossible."); - } - } - - /* We have to write out 16 values, copy as many legal values as there are and pad with 0 */ - auto f = [&outptr, x](const uint8_t *&p) { - for (int i=0; i<16; i++) { - if (i < x) { - *outptr++ = *p++; - } else { - *outptr++ = 0; - } - } - }; - - f(inptr0); - f(inptr1); - f(inptr2); - f(inptr3); - } - } -} - -#endif // __aarch64__ \ No newline at end of file diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp deleted file mode 100644 index 3d912c4675..0000000000 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#ifdef __aarch64__ - -#include - -#include "../asmlib.hpp" - -template<> -template -void TransformImpl<8, 1, false, 2, 2, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) { - uint16_t *outptr = (uint16_t *)out; - const uint16_t *inptr = (const uint16_t *)in; - bool first=true; - - uint16_t zerobuff[16] = { 0 }; // 8 for asm loop plus up to 7 for overflow loop - - for (int y=y0; y7) || first;x-=8) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - /* 'first' forces this to always run at least once, needed if the total size is <=7. */ - if ((y + 7) >= ymax) { - switch ((y + 7) - ymax) { - case 6: - inptr1 = zerobuff; - // fall through - case 5: - inptr2 = zerobuff; - // fall through - case 4: - inptr3 = zerobuff; - // fall through - case 3: - inptr4 = zerobuff; - // fall through - case 2: - inptr5 = zerobuff; - // fall through - case 1: - inptr6 = zerobuff; - // fall through - case 0: - inptr7 = zerobuff; - break; - - default: - UNREACHABLE("Impossible."); - } - } - - if (first) { - if (x <= 7) { - break; - } - - first = false; - } - - int skippf = (x & 31); - __asm __volatile ( - // Load up 8 elements (1 vector) from each of 8 sources. - "CBNZ %w[skippf], 1f\n" - ASM_PREFETCH("[%[inptr0], #128]") - ASM_PREFETCH("[%[inptr1], #128]") - ASM_PREFETCH("[%[inptr2], #128]") - ASM_PREFETCH("[%[inptr3], #128]") - "1:\n" - - "LDR q0, [%[inptr0]], #16\n" // q0=A0A1A2A3A4A5A6A7 - "LDR q4, [%[inptr4]], #16\n" // q8=E0E1E2E3E4E5E6E7 - "LDR q2, [%[inptr2]], #16\n" // q4=C0C1C2C3... - "LDR q6, [%[inptr6]], #16\n" - "ZIP1 v8.8h, v0.8h, v4.8h\n" // q8=A0E0A1E1A2E2A3E3 - "ZIP2 v16.8h, v0.8h, v4.8h\n" // q16=A4E4A5E5A6E6A7E7 - "ZIP1 v9.8h, v2.8h, v6.8h\n" // q9=C0G0C1G1C2G2C3G3 - "ZIP2 v17.8h, v2.8h, v6.8h\n" // q17=C4G4C5G5C6G6C7G7 - "LDR q1, [%[inptr1]], #16\n" // q1=B0B1B2B3B4B5B6B7 - "LDR q5, [%[inptr5]], #16\n" - "LDR q3, [%[inptr3]], #16\n" // q3=D0D1D2D3.... 
- "LDR q7, [%[inptr7]], #16\n" - "ZIP1 v10.8h, v1.8h, v5.8h\n" // q18=B0F0B1F1B2F2B3F3 - "ZIP2 v18.8h, v1.8h, v5.8h\n" // q18=B4F4B5F5B6F6B7F7 - "ZIP1 v11.8h, v3.8h, v7.8h\n" // q19=D0H0D1H1D2H2D3H3 - "ZIP2 v19.8h, v3.8h, v7.8h\n" // q19=D4H4D5H5D6H6D7H7 - - "ZIP1 v12.8h, v8.8h, v9.8h\n" // q20=A0C0E0G0A1C1E1G1 - "ZIP2 v20.8h, v8.8h, v9.8h\n" - "ZIP1 v13.8h, v10.8h, v11.8h\n" // q21=B0D0F0H0B1I1F1H1 - "ZIP2 v21.8h, v10.8h, v11.8h\n" - - "CBNZ %w[skippf], 2f\n" - ASM_PREFETCH("[%[inptr4], #112]") - ASM_PREFETCH("[%[inptr5], #112]") - ASM_PREFETCH("[%[inptr6], #112]") - ASM_PREFETCH("[%[inptr7], #112]") - "2:\n" - - "ZIP1 v22.8h, v16.8h, v17.8h\n" - "ZIP2 v30.8h, v16.8h, v17.8h\n" - "ZIP1 v23.8h, v18.8h, v19.8h\n" - "ZIP2 v31.8h, v18.8h, v19.8h\n" - - "ZIP1 v14.8h, v12.8h, v13.8h\n" // q22=A0B0C0D0E0F0G0H0 - "ZIP2 v15.8h, v12.8h, v13.8h\n" // q23=A1B1C1D1E1F1G1H1 - "STP q14, q15, [%[outptr]], #32\n" // Write back first two elements - - "ZIP1 v0.8h, v20.8h, v21.8h\n" - "ZIP2 v1.8h, v20.8h, v21.8h\n" - "STP q0, q1, [%[outptr]], #32\n" // Write back next two elements - - "ZIP1 v2.8h, v22.8h, v23.8h\n" - "ZIP2 v3.8h, v22.8h, v23.8h\n" - "STP q2, q3, [%[outptr]], #32\n" // Write back next two elements - - "ZIP1 v4.8h, v30.8h, v31.8h\n" - "ZIP2 v5.8h, v30.8h, v31.8h\n" - "STP q4, q5, [%[outptr]], #32\n" // Write back last two elements - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr) - : [skippf] "r" (skippf) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", - "v25", "v26", "v27", "v28", "v29", "v30", "v31", "memory" - ); - } - - for (;x>0;x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - } - } -} - -#endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp deleted file mode 100644 index 701d688af2..0000000000 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE) - -#include - -#include "../asmlib.hpp" - -template<> -template -inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) { - uint32_t *outptr = (uint32_t *)out; - const uint32_t *inptr = (uint32_t *)in; - bool first = true; - - uint32_t zerobuff[16] = { 0 }; // 8 for asm loop plus up to 7 for overflow loop - - for (int y=y0; y7) || first;x-=8) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - /* 'first' forces this to always run at least once, needed if the total size is <=7. */ - if ((y + 7) >= ymax) { - switch ((y + 7) - ymax) { - case 6: - inptr1 = zerobuff; - // fall through - case 5: - inptr2 = zerobuff; - // fall through - case 4: - inptr3 = zerobuff; - // fall through - case 3: - inptr4 = zerobuff; - // fall through - case 2: - inptr5 = zerobuff; - // fall through - case 1: - inptr6 = zerobuff; - // fall through - case 0: - inptr7 = zerobuff; - break; - - default: - UNREACHABLE("Impossible."); - } - } - - if (first) { - if (x<=7) { - break; - } - - first = false; - } - - __asm __volatile ( - // Load up 8 elements (2 vectors) from each of 8 sources. - "LDP q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3 - "LDP q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3 - "LDP q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3 - "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1 - ASM_PREFETCH("[%[inptr0], #128]") - "LDP q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3 - "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1 - "LDP q8, q9, [%[inptr4]], #32\n" - "LDP q10, q11, [%[inptr5]], #32\n" - "LDP q12, q13, [%[inptr6]], #32\n" - "ZIP1 v18.4s, v8.4s, v12.4s\n" - ASM_PREFETCH("[%[inptr1], #128]") - "LDP q14, q15, [%[inptr7]], #32\n" - "ZIP1 v19.4s, v10.4s, v14.4s\n" - - "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0 - ASM_PREFETCH("[%[inptr2], #128]") - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP2 v16.4s, v0.4s, v4.4s\n" - ASM_PREFETCH("[%[inptr3], #128]") - "ZIP2 v17.4s, v2.4s, v6.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source - - "ZIP2 v18.4s, v8.4s, v12.4s\n" - "ZIP2 v19.4s, v10.4s, v14.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - ASM_PREFETCH("[%[inptr4], #128]") - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP1 v16.4s, v1.4s, v5.4s\n" - ASM_PREFETCH("[%[inptr5], #128]") - "ZIP1 v17.4s, v3.4s, v7.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Third element - - "ZIP1 v18.4s, v9.4s, v13.4s\n" - "ZIP1 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Fourth element - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - ASM_PREFETCH("[%[inptr6], #128]") - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP2 v16.4s, v1.4s, v5.4s\n" - "ZIP2 v17.4s, v3.4s, v7.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Fifth element - - "ZIP2 v18.4s, v9.4s, v13.4s\n" - ASM_PREFETCH("[%[inptr7], #128]") - "ZIP2 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Sixth element - - "ZIP1 
v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Seventh element - - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Eighth element - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "memory" - ); - } - - for (;x>0;x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - } - } -} - -#endif // __aarch64__ && !__ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp deleted file mode 100644 index 2546cc571a..0000000000 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp +++ /dev/null @@ -1,228 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE) - -#include - -#include "../asmlib.hpp" - -template<> -template -inline void TransformImpl<8, 4, false, 1, 1, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) { - uint8_t *outptr = reinterpret_cast(out); - const uint8_t *inptr = reinterpret_cast(in); - bool first = true; - - /* Helper functions to copy blocks about used for odd case. 
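#include <cstdint>

// Why this deleted transform packs in blocks of 4: UDOT/SDOT consume four consecutive
// 8-bit values per lane (the 'block' parameter noted in the std_transforms comments),
// so the A panel is laid out as 4-byte granules -- 4 bytes of row 0, 4 bytes of row 1,
// ..., 4 bytes of row 7, then the next granule of each row. That is also why the
// helper class just below moves whole 4-byte units (copy_4_inc) and zero-pads a final
// partial granule (copy_pad). Layout sketch for one 8-row panel:
static void interleave8_block4(uint8_t *out, const uint8_t *in[8], int k) {
    for (int x = 0; x < k; x += 4) {        // one granule per row per pass
        for (int r = 0; r < 8; r++) {
            for (int i = 0; i < 4; i++) {
                *out++ = (x + i < k) ? in[r][x + i] : 0; // zero-pad the ragged tail
            }
        }
    }
}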
*/ - class t { - public: - static inline void copy_4_inc(uint8_t *&out, const uint8_t *&in) { - uint32_t *out_word = reinterpret_cast(out); - const uint32_t *in_word = reinterpret_cast(in); - - *out_word++ = *in_word++; - - out = reinterpret_cast(out_word); - in = reinterpret_cast(in_word); - } - - static inline void copy_pad(uint8_t *&out, const uint8_t *&in, size_t count) { - for (unsigned int i=0; i<4; i++) { - if (i < count) { - *out++ = *in++; - } else { - *out++ = 0; - } - } - } - }; - - uint8_t zerobuff[64] = { 0 }; // 32 for asm loop plus up to 31 for overflow loop - - for (int y=y0; y31) || first;x-=32) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - /* 'first' forces this to always run at least once, needed if the total size is <=32. */ - if ((y + 7) >= ymax) { - switch ((y + 7) - ymax) { - case 6: - inptr1 = zerobuff; - // fall through - case 5: - inptr2 = zerobuff; - // fall through - case 4: - inptr3 = zerobuff; - // fall through - case 3: - inptr4 = zerobuff; - // fall through - case 2: - inptr5 = zerobuff; - // fall through - case 1: - inptr6 = zerobuff; - // fall through - case 0: - inptr7 = zerobuff; - break; - - default: - UNREACHABLE("Impossible."); - } - } - - if (first) { - if (x<=31) { - break; - } - - first = false; - } - - __asm __volatile ( - // Load up 8 elements (2 vectors) from each of 8 sources. - "LDP q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3 - "LDP q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3 - "LDP q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3 - "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1 - ASM_PREFETCH("[%[inptr0], #128]") - "LDP q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3 - "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1 - "LDP q8, q9, [%[inptr4]], #32\n" - "LDP q10, q11, [%[inptr5]], #32\n" - "LDP q12, q13, [%[inptr6]], #32\n" - "ZIP1 v18.4s, v8.4s, v12.4s\n" - ASM_PREFETCH("[%[inptr1], #128]") - "LDP q14, q15, [%[inptr7]], #32\n" - "ZIP1 v19.4s, v10.4s, v14.4s\n" - - "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0 - ASM_PREFETCH("[%[inptr2], #128]") - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP2 v16.4s, v0.4s, v4.4s\n" - ASM_PREFETCH("[%[inptr3], #128]") - "ZIP2 v17.4s, v2.4s, v6.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source - - "ZIP2 v18.4s, v8.4s, v12.4s\n" - "ZIP2 v19.4s, v10.4s, v14.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - ASM_PREFETCH("[%[inptr4], #128]") - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP1 v16.4s, v1.4s, v5.4s\n" - ASM_PREFETCH("[%[inptr5], #128]") - "ZIP1 v17.4s, v3.4s, v7.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Third element - - "ZIP1 v18.4s, v9.4s, v13.4s\n" - "ZIP1 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Fourth element - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - ASM_PREFETCH("[%[inptr6], #128]") - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP2 v16.4s, v1.4s, v5.4s\n" - "ZIP2 v17.4s, v3.4s, v7.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Fifth element - - "ZIP2 v18.4s, v9.4s, v13.4s\n" - ASM_PREFETCH("[%[inptr7], #128]") - "ZIP2 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Sixth element - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Seventh element - - "ZIP2 v22.4s, v16.4s, 
v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Eighth element - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "memory" - ); - } - - // Copy any leftover blocks of 4 a complete block at a time. - for (;x>4;x-=4) { - t::copy_4_inc(outptr, inptr0); - t::copy_4_inc(outptr, inptr1); - t::copy_4_inc(outptr, inptr2); - t::copy_4_inc(outptr, inptr3); - t::copy_4_inc(outptr, inptr4); - t::copy_4_inc(outptr, inptr5); - t::copy_4_inc(outptr, inptr6); - t::copy_4_inc(outptr, inptr7); - } - - // Final block with padding, if any. - if (x > 0) { - t::copy_pad(outptr, inptr0, x); - t::copy_pad(outptr, inptr1, x); - t::copy_pad(outptr, inptr2, x); - t::copy_pad(outptr, inptr3, x); - t::copy_pad(outptr, inptr4, x); - t::copy_pad(outptr, inptr5, x); - t::copy_pad(outptr, inptr6, x); - t::copy_pad(outptr, inptr7, x); - } - } -} - -#endif // __aarch64__ && !__ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp deleted file mode 100644 index a342d6c3d1..0000000000 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) - -#include - -#include "../asmlib.hpp" - -template<> -template<> -inline void TransformImpl<8, 1, false, 4, 2, false>::Transform(float *out, const __fp16 *in, int ldin, int y0, int ymax, int k0, int kmax) { - float *outptr = out; - const __fp16 *inptr = in; - bool first = true; - - __fp16 zerobuff[16] = { 0 }; // 8 for asm loop plus up to 7 for overflow loop - - for (int y=y0; y7) || first;x-=8) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - /* 'first' forces this to always run at least once, needed if the total size is <=7. 
*/ - if ((y + 7) >= ymax) { - switch ((y + 7) - ymax) { - case 6: - inptr1 = zerobuff; - // fall through - case 5: - inptr2 = zerobuff; - // fall through - case 4: - inptr3 = zerobuff; - // fall through - case 3: - inptr4 = zerobuff; - // fall through - case 2: - inptr5 = zerobuff; - // fall through - case 1: - inptr6 = zerobuff; - // fall through - case 0: - inptr7 = zerobuff; - break; - - default: - UNREACHABLE("Impossible."); - } - } - - if (first) { - if (x<=7) { - break; - } - - first = false; - } - - __asm __volatile ( - // Load up 8 elements (2 vectors) from each of 8 sources. - "LDR q0, [%[inptr0]], #16\n" - "LDR q2, [%[inptr1]], #16\n" - "FCVTL2 v1.4s, v0.8h\n" - "FCVTL v0.4s, v0.4h\n" - "LDR q4, [%[inptr2]], #16\n" // q4=C0C1C2C3 - "FCVTL2 v3.4s, v2.8h\n" - "FCVTL v2.4s, v2.4h\n" - "FCVTL2 v5.4s, v4.8h\n" - "FCVTL v4.4s, v4.4h\n" - "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1 - ASM_PREFETCH("[%[inptr0], #128]") - "LDR q6, [%[inptr3]], #16\n" // q6=D0D1D2D3 - "FCVTL2 v7.4s, v6.8h\n" - "FCVTL v6.4s, v6.4h\n" - "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1 - "LDR q8, [%[inptr4]], #16\n" - "LDR q10, [%[inptr5]], #16\n" - "FCVTL2 v9.4s, v8.8h\n" - "FCVTL v8.4s, v8.4h\n" - ASM_PREFETCH("[%[inptr1], #128]") - "LDR q12, [%[inptr6]], #16\n" - "FCVTL2 v11.4s, v10.8h\n" - "FCVTL v10.4s, v10.4h\n" - "FCVTL2 v13.4s, v12.8h\n" - "FCVTL v12.4s, v12.4h\n" - "ZIP1 v18.4s, v8.4s, v12.4s\n" - "LDR q14, [%[inptr7]], #16\n" - "FCVTL2 v15.4s, v14.8h\n" - "FCVTL v14.4s, v14.4h\n" - "ZIP1 v19.4s, v10.4s, v14.4s\n" - - ASM_PREFETCH("[%[inptr2], #128]") - "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0 - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - ASM_PREFETCH("[%[inptr3], #128]") - - "ZIP2 v16.4s, v0.4s, v4.4s\n" - "ZIP2 v17.4s, v2.4s, v6.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source - - "ZIP2 v18.4s, v8.4s, v12.4s\n" - ASM_PREFETCH("[%[inptr4], #128]") - "ZIP2 v19.4s, v10.4s, v14.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - ASM_PREFETCH("[%[inptr5], #128]") - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP1 v16.4s, v1.4s, v5.4s\n" - "ZIP1 v17.4s, v3.4s, v7.4s\n" - ASM_PREFETCH("[%[inptr6], #128]") - "STP q20, q21, [%[outptr]], #32\n" // Third element - - "ZIP1 v18.4s, v9.4s, v13.4s\n" - "ZIP1 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Fourth element - ASM_PREFETCH("[%[inptr7], #128]") - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP2 v16.4s, v1.4s, v5.4s\n" - "ZIP2 v17.4s, v3.4s, v7.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Fifth element - - "ZIP2 v18.4s, v9.4s, v13.4s\n" - "ZIP2 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Sixth element - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Seventh element - - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Eighth element - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "v14", "v15", "v16", 
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "memory" - ); - } - - for (;x>0;x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - } - } -} - -#endif // __aarch64__ && __ARM_FP16_ARGS diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_s8_to_s16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_s8_to_s16.hpp deleted file mode 100644 index 37344a82a9..0000000000 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_s8_to_s16.hpp +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) - -#include -#include - -#include "../asmlib.hpp" - -template<> -template<> -inline void TransformImpl<8, 1, false, 2, 1, false>::Transform(int16_t *out, const int8_t *in, int ldin, int y0, int ymax, int k0, int kmax) { - int16_t *outptr = out; - const int8_t *inptr = in; - bool first = true; - - int8_t zerobuff[32] = { 0 }; // 16 for asm loop plus up to 15 for overflow loop - - for (int y=y0; y15) || first;x-=16) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - /* 'first' forces this to always run at least once, needed if the total size is <=7. */ - if ((y + 7) >= ymax) { - switch ((y + 7) - ymax) { - case 6: - inptr1 = zerobuff; - // fall through - case 5: - inptr2 = zerobuff; - // fall through - case 4: - inptr3 = zerobuff; - // fall through - case 3: - inptr4 = zerobuff; - // fall through - case 2: - inptr5 = zerobuff; - // fall through - case 1: - inptr6 = zerobuff; - // fall through - case 0: - inptr7 = zerobuff; - break; - - default: - UNREACHABLE("Impossible."); - } - } - - if (first) { - if (x<=15) { - break; - } - - first = false; - } - - __asm __volatile ( - // Load up 16 elements (1 source vector, 2 destination vectors) from each of 8 sources. 
- "LDR q0, [%[inptr0]], #16\n" - "LDR q2, [%[inptr1]], #16\n" - "SSHLL2 v1.8h, v0.16b, #0\n" - "SSHLL v0.8h, v0.8b, #0\n" - "LDR q4, [%[inptr2]], #16\n" // q4=C0C1C2C3 - "SSHLL2 v3.8h, v2.16b, #0\n" - "SSHLL v2.8h, v2.8b, #0\n" - "SSHLL2 v5.8h, v4.16b, #0\n" - "SSHLL v4.8h, v4.8b, #0\n" - "ZIP1 v16.8h, v0.8h, v4.8h\n" // q16=A0C0A1C1 - ASM_PREFETCH("[%[inptr0], #128]") - "LDR q6, [%[inptr3]], #16\n" // q6=D0D1D2D3 - "SSHLL2 v7.8h, v6.16b, #0\n" - "SSHLL v6.8h, v6.8b, #0\n" - "ZIP1 v17.8h, v2.8h, v6.8h\n" // q17=B0D0B1D1 - "LDR q8, [%[inptr4]], #16\n" - "LDR q10, [%[inptr5]], #16\n" - "SSHLL2 v9.8h, v8.16b, #0\n" - "SSHLL v8.8h, v8.8b, #0\n" - ASM_PREFETCH("[%[inptr1], #128]") - "LDR q12, [%[inptr6]], #16\n" - "SSHLL2 v11.8h, v10.16b, #0\n" - "SSHLL v10.8h, v10.8b, #0\n" - "SSHLL2 v13.8h, v12.16b, #0\n" - "SSHLL v12.8h, v12.8b, #0\n" - "ZIP1 v18.8h, v8.8h, v12.8h\n" - "LDR q14, [%[inptr7]], #16\n" - "SSHLL2 v15.8h, v14.16b, #0\n" - "SSHLL v14.8h, v14.8b, #0\n" - "ZIP1 v19.8h, v10.8h, v14.8h\n" - - ASM_PREFETCH("[%[inptr2], #128]") - "ZIP1 v20.8h, v16.8h, v17.8h\n" // q20=A0B0C0D0A1B1C1D1 - "ZIP1 v21.8h, v18.8h, v19.8h\n" // q21=E0F0G0H0E1F1G1H1 - "ZIP2 v22.8h, v16.8h, v17.8h\n" // q22=A2B2C2D2A3B3C3D3 - "ZIP2 v23.8h, v18.8h, v19.8h\n" // q23=E2F2G2H1E3F3G3H3 - ASM_PREFETCH("[%[inptr3], #128]") - - "ZIP2 v16.8h, v0.8h, v4.8h\n" - "ZIP2 v17.8h, v2.8h, v6.8h\n" - "TRN1 v24.2d, v20.2d, v21.2d\n" - "TRN2 v25.2d, v20.2d, v21.2d\n" - - "ZIP2 v18.8h, v8.8h, v12.8h\n" - ASM_PREFETCH("[%[inptr4], #128]") - "ZIP2 v19.8h, v10.8h, v14.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Write back the first element of each source - "TRN1 v24.2d, v22.2d, v23.2d\n" - "TRN2 v25.2d, v22.2d, v23.2d\n" - - "ZIP1 v20.8h, v16.8h, v17.8h\n" - "ZIP1 v21.8h, v18.8h, v19.8h\n" - ASM_PREFETCH("[%[inptr5], #128]") - "ZIP2 v22.8h, v16.8h, v17.8h\n" - "ZIP2 v23.8h, v18.8h, v19.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Write back the second element of each source - - "ZIP1 v16.8h, v1.8h, v5.8h\n" - "ZIP1 v17.8h, v3.8h, v7.8h\n" - ASM_PREFETCH("[%[inptr6], #128]") - "TRN1 v24.2d, v20.2d, v21.2d\n" - "TRN2 v25.2d, v20.2d, v21.2d\n" - - "ZIP1 v18.8h, v9.8h, v13.8h\n" - "ZIP1 v19.8h, v11.8h, v15.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Third element - "TRN1 v24.2d, v22.2d, v23.2d\n" - "TRN2 v25.2d, v22.2d, v23.2d\n" - ASM_PREFETCH("[%[inptr7], #128]") - - "ZIP1 v20.8h, v16.8h, v17.8h\n" - "ZIP1 v21.8h, v18.8h, v19.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Fourth element - "ZIP2 v22.8h, v16.8h, v17.8h\n" - "ZIP2 v23.8h, v18.8h, v19.8h\n" - - "ZIP2 v16.8h, v1.8h, v5.8h\n" - "ZIP2 v17.8h, v3.8h, v7.8h\n" - "TRN1 v24.2d, v20.2d, v21.2d\n" - "TRN2 v25.2d, v20.2d, v21.2d\n" - - "ZIP2 v18.8h, v9.8h, v13.8h\n" - "ZIP2 v19.8h, v11.8h, v15.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Fifth element - "TRN1 v24.2d, v22.2d, v23.2d\n" - "TRN2 v25.2d, v22.2d, v23.2d\n" - - "ZIP1 v20.8h, v16.8h, v17.8h\n" - "ZIP1 v21.8h, v18.8h, v19.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Sixth element - "TRN1 v24.2d, v20.2d, v21.2d\n" - "TRN2 v25.2d, v20.2d, v21.2d\n" - - "ZIP2 v22.8h, v16.8h, v17.8h\n" - "ZIP2 v23.8h, v18.8h, v19.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Seventh element - "TRN1 v24.2d, v22.2d, v23.2d\n" - "TRN2 v25.2d, v22.2d, v23.2d\n" - "STP q24, q25, [%[outptr]], #32\n" // Eighth element - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr) - : - : "v0", 
"v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "memory" - ); - } - - for (;x>0;x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - } - } -} - -#endif // __aarch64__ && __ARM_FP16_ARGS diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_u8_to_u16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_u8_to_u16.hpp deleted file mode 100644 index a3a269c9cd..0000000000 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_u8_to_u16.hpp +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) - -#include -#include - -#include "../asmlib.hpp" - -template<> -template<> -inline void TransformImpl<8, 1, false, 2, 1, false>::Transform(uint16_t *out, const uint8_t *in, int ldin, int y0, int ymax, int k0, int kmax) { - uint16_t *outptr = out; - const uint8_t *inptr = in; - bool first = true; - - uint8_t zerobuff[32] = { 0 }; // 16 for asm loop plus up to 15 for overflow loop - - for (int y=y0; y15) || first;x-=16) { - /* Cope with ragged cases by copying from a buffer of zeroes instead */ - /* 'first' forces this to always run at least once, needed if the total size is <=7. */ - if ((y + 7) >= ymax) { - switch ((y + 7) - ymax) { - case 6: - inptr1 = zerobuff; - // fall through - case 5: - inptr2 = zerobuff; - // fall through - case 4: - inptr3 = zerobuff; - // fall through - case 3: - inptr4 = zerobuff; - // fall through - case 2: - inptr5 = zerobuff; - // fall through - case 1: - inptr6 = zerobuff; - // fall through - case 0: - inptr7 = zerobuff; - break; - - default: - UNREACHABLE("Impossible."); - } - } - - if (first) { - if (x<=15) { - break; - } - - first = false; - } - - __asm __volatile ( - // Load up 16 elements (1 source vector, 2 destination vectors) from each of 8 sources. 
- "LDR q0, [%[inptr0]], #16\n" - "LDR q2, [%[inptr1]], #16\n" - "USHLL2 v1.8h, v0.16b, #0\n" - "USHLL v0.8h, v0.8b, #0\n" - "LDR q4, [%[inptr2]], #16\n" // q4=C0C1C2C3 - "USHLL2 v3.8h, v2.16b, #0\n" - "USHLL v2.8h, v2.8b, #0\n" - "USHLL2 v5.8h, v4.16b, #0\n" - "USHLL v4.8h, v4.8b, #0\n" - "ZIP1 v16.8h, v0.8h, v4.8h\n" // q16=A0C0A1C1 - ASM_PREFETCH("[%[inptr0], #128]") - "LDR q6, [%[inptr3]], #16\n" // q6=D0D1D2D3 - "USHLL2 v7.8h, v6.16b, #0\n" - "USHLL v6.8h, v6.8b, #0\n" - "ZIP1 v17.8h, v2.8h, v6.8h\n" // q17=B0D0B1D1 - "LDR q8, [%[inptr4]], #16\n" - "LDR q10, [%[inptr5]], #16\n" - "USHLL2 v9.8h, v8.16b, #0\n" - "USHLL v8.8h, v8.8b, #0\n" - ASM_PREFETCH("[%[inptr1], #128]") - "LDR q12, [%[inptr6]], #16\n" - "USHLL2 v11.8h, v10.16b, #0\n" - "USHLL v10.8h, v10.8b, #0\n" - "USHLL2 v13.8h, v12.16b, #0\n" - "USHLL v12.8h, v12.8b, #0\n" - "ZIP1 v18.8h, v8.8h, v12.8h\n" - "LDR q14, [%[inptr7]], #16\n" - "USHLL2 v15.8h, v14.16b, #0\n" - "USHLL v14.8h, v14.8b, #0\n" - "ZIP1 v19.8h, v10.8h, v14.8h\n" - - ASM_PREFETCH("[%[inptr2], #128]") - "ZIP1 v20.8h, v16.8h, v17.8h\n" // q20=A0B0C0D0A1B1C1D1 - "ZIP1 v21.8h, v18.8h, v19.8h\n" // q21=E0F0G0H0E1F1G1H1 - "ZIP2 v22.8h, v16.8h, v17.8h\n" // q22=A2B2C2D2A3B3C3D3 - "ZIP2 v23.8h, v18.8h, v19.8h\n" // q23=E2F2G2H1E3F3G3H3 - ASM_PREFETCH("[%[inptr3], #128]") - - "ZIP2 v16.8h, v0.8h, v4.8h\n" - "ZIP2 v17.8h, v2.8h, v6.8h\n" - "TRN1 v24.2d, v20.2d, v21.2d\n" - "TRN2 v25.2d, v20.2d, v21.2d\n" - - "ZIP2 v18.8h, v8.8h, v12.8h\n" - ASM_PREFETCH("[%[inptr4], #128]") - "ZIP2 v19.8h, v10.8h, v14.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Write back the first element of each source - "TRN1 v24.2d, v22.2d, v23.2d\n" - "TRN2 v25.2d, v22.2d, v23.2d\n" - - "ZIP1 v20.8h, v16.8h, v17.8h\n" - "ZIP1 v21.8h, v18.8h, v19.8h\n" - ASM_PREFETCH("[%[inptr5], #128]") - "ZIP2 v22.8h, v16.8h, v17.8h\n" - "ZIP2 v23.8h, v18.8h, v19.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Write back the second element of each source - - "ZIP1 v16.8h, v1.8h, v5.8h\n" - "ZIP1 v17.8h, v3.8h, v7.8h\n" - ASM_PREFETCH("[%[inptr6], #128]") - "TRN1 v24.2d, v20.2d, v21.2d\n" - "TRN2 v25.2d, v20.2d, v21.2d\n" - - "ZIP1 v18.8h, v9.8h, v13.8h\n" - "ZIP1 v19.8h, v11.8h, v15.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Third element - "TRN1 v24.2d, v22.2d, v23.2d\n" - "TRN2 v25.2d, v22.2d, v23.2d\n" - ASM_PREFETCH("[%[inptr7], #128]") - - "ZIP1 v20.8h, v16.8h, v17.8h\n" - "ZIP1 v21.8h, v18.8h, v19.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Fourth element - "ZIP2 v22.8h, v16.8h, v17.8h\n" - "ZIP2 v23.8h, v18.8h, v19.8h\n" - - "ZIP2 v16.8h, v1.8h, v5.8h\n" - "ZIP2 v17.8h, v3.8h, v7.8h\n" - "TRN1 v24.2d, v20.2d, v21.2d\n" - "TRN2 v25.2d, v20.2d, v21.2d\n" - - "ZIP2 v18.8h, v9.8h, v13.8h\n" - "ZIP2 v19.8h, v11.8h, v15.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Fifth element - "TRN1 v24.2d, v22.2d, v23.2d\n" - "TRN2 v25.2d, v22.2d, v23.2d\n" - - "ZIP1 v20.8h, v16.8h, v17.8h\n" - "ZIP1 v21.8h, v18.8h, v19.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Sixth element - "TRN1 v24.2d, v20.2d, v21.2d\n" - "TRN2 v25.2d, v20.2d, v21.2d\n" - - "ZIP2 v22.8h, v16.8h, v17.8h\n" - "ZIP2 v23.8h, v18.8h, v19.8h\n" - "STP q24, q25, [%[outptr]], #32\n" // Seventh element - "TRN1 v24.2d, v22.2d, v23.2d\n" - "TRN2 v25.2d, v22.2d, v23.2d\n" - "STP q24, q25, [%[outptr]], #32\n" // Eighth element - : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), - [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr) - : - : "v0", 
"v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "memory" - ); - } - - for (;x>0;x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - } - } -} - -#endif // __aarch64__ && __ARM_FP16_ARGS diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp index 5ab5774751..f6233ef503 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp @@ -30,12 +30,12 @@ // Generic unblocked transposed 6x32-bit sized specialisation template <> template -inline void TransformImpl<6, 1, true, 4, 4, false>::Transform( +inline void TransformImpl<6, 1, true, 4, 4, VLType::None>::Transform( T* out, const T* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { // Redirect to a 12 x uint16_t specialisation - TransformImpl<12, 1, true, 2, 2, false>::Transform( + TransformImpl<12, 1, true, 2, 2, VLType::None>::Transform( reinterpret_cast(out), reinterpret_cast(in), stride*2, x0*2, xmax*2, k0, kmax @@ -45,7 +45,7 @@ inline void TransformImpl<6, 1, true, 4, 4, false>::Transform( // Generic 12x16-bit sized specialisation template <> template -inline void TransformImpl<12, 1, true, 2, 2, false>::Transform( +inline void TransformImpl<12, 1, true, 2, 2, VLType::None>::Transform( T* out, const T* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { @@ -135,7 +135,7 @@ inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x4(con template <> template <> -inline void TransformImpl<12, 1, true, 2, 2, false>::Transform( +inline void TransformImpl<12, 1, true, 2, 2, VLType::None>::Transform( uint16_t* out, const uint16_t* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp index d7de9ff934..c0f3e17d31 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp @@ -110,7 +110,7 @@ inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x4(const __ template <> template <> -inline void TransformImpl<12, 1, true, 4, 2, false>::Transform( +inline void TransformImpl<12, 1, true, 4, 2, VLType::None>::Transform( float* out, const __fp16* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp index a137f9360a..bcbe2b84d8 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp @@ -30,12 +30,12 @@ // Generic unblocked transposed 12x32-bit sized specialisation template <> template -inline void TransformImpl<12, 1, 
true, 4, 4, false>::Transform( +inline void TransformImpl<12, 1, true, 4, 4, VLType::None>::Transform( T* out, const T* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { // Redirect to a 24 x uint16_t specialisation - TransformImpl<24, 1, true, 2, 2, false>::Transform( + TransformImpl<24, 1, true, 2, 2, VLType::None>::Transform( reinterpret_cast(out), reinterpret_cast(in), stride*2, x0*2, xmax*2, k0, kmax @@ -45,7 +45,7 @@ inline void TransformImpl<12, 1, true, 4, 4, false>::Transform( // Generic 24x16-bit sized specialisation template <> template -inline void TransformImpl<24, 1, true, 2, 2, false>::Transform( +inline void TransformImpl<24, 1, true, 2, 2, VLType::None>::Transform( T* out, const T* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { @@ -120,7 +120,7 @@ inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x4(con template <> template <> -inline void TransformImpl<24, 1, true, 2, 2, false>::Transform( +inline void TransformImpl<24, 1, true, 2, 2, VLType::None>::Transform( uint16_t* out, const uint16_t* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp index 974be481e7..df68740bb4 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp @@ -30,12 +30,12 @@ // Generic unblocked transposed 8x32-bit sized specialisation template <> template -inline void TransformImpl<8, 1, true, 4, 4, false>::Transform( +inline void TransformImpl<8, 1, true, 4, 4, VLType::None>::Transform( T* out, const T* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { // Redirect to a 16 x uint16_t specialisation - TransformImpl<16, 1, true, 2, 2, false>::Transform( + TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform( reinterpret_cast(out), reinterpret_cast(in), stride*2, x0*2, xmax*2, k0, kmax @@ -45,7 +45,7 @@ inline void TransformImpl<8, 1, true, 4, 4, false>::Transform( // Generic 16x16-bit sized specialisation template <> template -inline void TransformImpl<16, 1, true, 2, 2, false>::Transform( +inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform( T* out, const T* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { @@ -137,7 +137,7 @@ inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(con template <> template <> -inline void TransformImpl<16, 1, true, 2, 2, false>::Transform( +inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform( uint16_t* out, const uint16_t* const in, const int stride, const int x0, const int xmax, const int k0, const int kmax ) { diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp index b825e1c358..e092c729ba 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp @@ -21,22 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "a32_interleave_6way_32bit.hpp" #include "a32_transpose_interleave_8way_32bit.hpp" -#include "a64_block16_interleave4_8bit.hpp" -#include "a64_interleave_8way_16bit.hpp" -#include "a64_interleave_8way_32bit.hpp" -#include "a64_interleave_8way_block4_8bit.hpp" -#include "a64_interleave_8way_half_to_float.hpp" -#include "a64_interleave_8way_s8_to_s16.hpp" -#include "a64_interleave_8way_u8_to_u16.hpp" #include "a64_transpose_interleave_12way_16bit.hpp" #include "a64_transpose_interleave_12way_half_to_float.hpp" #include "a64_transpose_interleave_24way_16bit.hpp" #include "a64_transpose_interleave_8way_32bit.hpp" -#include "sve_interleave_8way_32bit.hpp" -#include "sve_interleave_8way_block2_16bit.hpp" -#include "sve_interleave_8way_block2_32bit.hpp" -#include "sve_interleave_8way_block4_16bit.hpp" -#include "sve_interleave_8way_block4_8bit.hpp" -#include "sve_interleave_8way_block8_8bit.hpp" diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp deleted file mode 100644 index 348d78e3f5..0000000000 --- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp +++ /dev/null @@ -1,596 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#pragma once - -#ifdef __ARM_FEATURE_SVE - -template<> -template -inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) -{ - uint32_t *master_outptr = reinterpret_cast(out); - const uint32_t *inptr = reinterpret_cast(in); - - for (int y=y0; y -template -inline void TransformImpl<8, 2, false, 2, 2, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) -{ - uint16_t *master_outptr = reinterpret_cast(out); - const uint16_t *inptr = reinterpret_cast(in); - - for (int y=y0; y -template -inline void TransformImpl<8, 2, false, 4, 4, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) -{ - uint32_t *master_outptr = reinterpret_cast(out); - const uint32_t *inptr = reinterpret_cast(in); - - for (int y=y0; y -template -inline void TransformImpl<8, 4, false, 2, 2, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) -{ - uint16_t *master_outptr = reinterpret_cast(out); - const uint16_t *inptr = reinterpret_cast(in); - - for (int y=y0; y -template -inline void TransformImpl<8, 4, false, 1, 1, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) -{ - uint8_t *master_outptr = reinterpret_cast(out); - const uint8_t *inptr = reinterpret_cast(in); - - for (int y=y0; y -template -inline void TransformImpl<8, 8, false, 1, 1, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) -{ - uint8_t *master_outptr = reinterpret_cast(out); - const uint8_t *inptr = reinterpret_cast(in); - - for (int y=y0; y struct TransposeInterleaveCommon { // Override the moveblock_1xY methods to improve performance diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp index 6e47a97c78..6d483a3b9d 100644 --- a/src/core/NEON/kernels/arm_gemm/utils.hpp +++ b/src/core/NEON/kernels/arm_gemm/utils.hpp @@ -24,6 +24,8 @@ #pragma once +#include "arm_gemm.hpp" + #include // Macro for unreachable code (e.g. impossible default cases on switch) @@ -32,6 +34,8 @@ // Paranoid option for the above with assert // #define UNREACHABLE(why) assert(0 && why) +namespace arm_gemm { + template inline T iceildiv(const T a, const T b) { return (a + b - 1) / b; @@ -48,7 +52,94 @@ inline T roundup(const T a, const T b) { } } -namespace arm_gemm { +enum class VLType { + None, + SVE, +}; + +template +struct IndirectOutputArg { + struct { + T *base; + size_t stride; + } direct = {}; + struct { + T * const *ptr; + size_t offset; + } indirect = {}; + bool is_indirect; + + // Direct + IndirectOutputArg(T *base, size_t stride) : is_indirect(false) { + direct.base = base; + direct.stride = stride; + } + + // Indirect + IndirectOutputArg(T * const * ptr, size_t offset) : is_indirect(true) { + indirect.ptr = ptr; + indirect.offset = offset; + } + + IndirectOutputArg() : is_indirect(false) { + direct.base = nullptr; + direct.stride = 0; + } +}; + +// Check that the provided Requantize32 doesn't have a left shift. +inline bool quant_no_left_shift(const Requantize32 &qp) { + if (qp.per_channel_requant) { + return (qp.per_channel_left_shifts == nullptr); + } else { + return (qp.per_layer_left_shift == 0); + } +} + +// Check that the provided Requantize32 is compatible with the "symmetric" hybrid kernels. These don't include row +// sums, so the 'b_offset' has to be zero. 
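+// (With per-layer quantization that amounts to a Requantize32 built with b_offset == 0 and
+// requant_shift <= 0: the per-tensor constructor in arm_gemm.hpp maps a negative requant_shift
+// to per_layer_right_shift and leaves per_layer_left_shift at zero, so quant_no_left_shift() holds.)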
+inline bool quant_hybrid_symmetric(const Requantize32 &qp) { + return quant_no_left_shift(qp) && qp.b_offset == 0; +} + +// Check that the provided Requantize32 is compatible with the "asymmetric" hybrid kernels. These don't support per +// channel quantization. Technically b_offset==0 cases would work, but it is a waste to sum and then multiply by 0... +inline bool quant_hybrid_asymmetric(const Requantize32 &qp) { + return quant_no_left_shift(qp) /* && qp.b_offset != 0 */ && qp.per_channel_requant==false; +} + +template +struct IndirectInputArg { + struct { + const T *base; + size_t stride; + } direct = {}; + struct { + const T * const * const * ptr; + unsigned int start_row; + unsigned int start_col; + } indirect = {}; + bool is_indirect; + + // Direct + IndirectInputArg(const T *base, size_t stride) : is_indirect(false) { + direct.base = base; + direct.stride = stride; + } + + // Indirect + IndirectInputArg(const T * const * const *ptr, unsigned int start_row, unsigned int start_col) : is_indirect(true) { + indirect.ptr = ptr; + indirect.start_row = start_row; + indirect.start_col = start_col; + } + + IndirectInputArg() : is_indirect(false) { + direct.base = nullptr; + direct.stride = 0; + } +}; + namespace utils { namespace { diff --git a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp b/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp deleted file mode 100644 index b071be3749..0000000000 --- a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2018-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/WindowIterator.h" - -using namespace arm_compute; - -INEGEMMWrapperKernel::INEGEMMWrapperKernel() - : _a(nullptr), _b(nullptr), _c(nullptr), _params(), _gemm_info(), _window3d(), _window_shape() -{ -} - -INEGEMMWrapperKernel::Params INEGEMMWrapperKernel::extract_parameters(const ITensor *a, const ITensor *b, const ITensor *c, const GEMMInfo &gemm_info) -{ - Params p; - - ARM_COMPUTE_ERROR_ON_NULLPTR(a); - ARM_COMPUTE_ERROR_ON_NULLPTR(b); - ARM_COMPUTE_ERROR_ON_NULLPTR(c); - - // Initalize params - p.M = c->info()->tensor_shape().y(); - p.N = c->info()->tensor_shape().x(); - p.K = a->info()->tensor_shape().x(); - p.multis = b->info()->tensor_shape().z(); - p.batches = c->info()->tensor_shape().total_size_upper(2) / p.multis; //COMPMID-1423: Agree on and document the layout of gemm inputs/outputs - - // Update M in case of GEMM3D for output - if(gemm_info.depth_output_gemm3d() != 0) - { - p.M = c->info()->tensor_shape().y() * c->info()->tensor_shape().z(); - p.batches = c->info()->tensor_shape().total_size_upper(3) / p.multis; - } - - return p; -} - -void INEGEMMWrapperKernel::configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, const GEMMInfo &gemm_info) -{ - _gemm_info = gemm_info; - _params = extract_parameters(a, b, c, gemm_info); - _a = a; - _b = b; - _c = c; - - _window3d = configure_internal(alpha, beta); - _window_shape = _window3d.shape(); - - // Convert the 3D window into a 1D window in order to allow the scheduler to arbitrary split it. - Window collapsed; - collapsed.set(0, Window::Dimension(0, _window3d.num_iterations_total())); - - INEKernel::configure(collapsed); -} - -void INEGEMMWrapperKernel::run(const Window &window, const ThreadInfo &info) -{ - const Coordinates start_offset = index2coords(_window_shape, window.x().start()); - const Coordinates end_offset = index2coords(_window_shape, window.x().end() - 1); - - run_internal(_window3d, start_offset, end_offset, info); -} diff --git a/arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h b/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h similarity index 89% rename from arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h rename to src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h index 7c10f85824..a956898403 100644 --- a/arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h +++ b/src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,14 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H -#define ARM_COMPUTE_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H +#ifndef SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H +#define SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" +#include "src/core/NEON/INEKernel.h" -#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp" +#include "src/core/NEON/kernels/convolution/depthwise/depthwise.hpp" namespace arm_compute { @@ -85,4 +85,4 @@ class NEDepthwiseConvolutionAssemblyKernelWrapper final : public INEKernel depthwise::IDepthwiseConvolution *_kernel; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H */ +#endif /* SRC_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H */ diff --git a/src/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h b/src/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h index 4af82f89a8..7fcf2b1e4d 100644 --- a/src/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h +++ b/src/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H #define ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H -#include "arm_compute/core/NEON/INEKernel.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_gemm_compute_iface.hpp" +#include "src/core/NEON/INEKernel.h" #include "gemm_common.hpp" diff --git a/src/core/NEON/kernels/assembly/arm_gemm.hpp b/src/core/NEON/kernels/assembly/arm_gemm.hpp index 58db511547..3088b080d6 100644 --- a/src/core/NEON/kernels/assembly/arm_gemm.hpp +++ b/src/core/NEON/kernels/assembly/arm_gemm.hpp @@ -43,7 +43,9 @@ enum class GemmMethod GEMM_INTERLEAVED_2D, QUANTIZE_WRAPPER, QUANTIZE_WRAPPER_2D, - GEMM_HYBRID_QUANTIZED + GEMM_HYBRID_QUANTIZED, + INDIRECT_GEMM, + CONVOLUTION_GEMM }; struct KernelDescription @@ -104,17 +106,19 @@ struct GemmArgs unsigned int _Msize; unsigned int _Nsize; unsigned int _Ksize; + unsigned int _Ksections; unsigned int _nbatches; unsigned int _nmulti; + bool _indirect_input; Activation _act; int _maxthreads; const GemmConfig *_cfg; - GemmArgs(const CPUInfo *ci, const unsigned int M, const unsigned int N, - const unsigned int K, const unsigned int nbatches, - const unsigned int nmulti, Activation act, const int maxthreads, + GemmArgs(const CPUInfo *ci, unsigned int M, unsigned int N, + unsigned int K, unsigned int Ksections, unsigned int nbatches, + unsigned int nmulti, bool indirect_input, Activation act, const int maxthreads, const GemmConfig *cfg = nullptr) - : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti), _act(act), _maxthreads(maxthreads), _cfg(cfg) + : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _Ksections(Ksections), _nbatches(nbatches), _nmulti(nmulti), _indirect_input(indirect_input), _act(act), _maxthreads(maxthreads), _cfg(cfg) { } }; @@ -122,38 +126,41 @@ struct GemmArgs struct Requantize32 { public: - const int32_t *bias = nullptr; - size_t bias_multi_stride = 0; - int32_t a_offset = 0; - int32_t b_offset = 0; - int32_t c_offset = 0; - bool per_channel_requant = false; - int32_t per_layer_shift = 0; - int32_t per_layer_mul = 0; - const int32_t *per_channel_shifts = nullptr; - const int32_t *per_channel_muls = nullptr; - int32_t minval = 0; - int32_t maxval = 0; + const int32_t *bias = 
nullptr; + size_t bias_multi_stride = 0; + int32_t a_offset = 0; + int32_t b_offset = 0; + int32_t c_offset = 0; + bool per_channel_requant = false; + int32_t per_layer_left_shift = 0; + int32_t per_layer_right_shift = 0; + int32_t per_layer_mul = 0; + const int32_t *per_channel_left_shifts = nullptr; + const int32_t *per_channel_right_shifts = nullptr; + const int32_t *per_channel_muls = nullptr; + int32_t minval = 0; + int32_t maxval = 0; Requantize32() = default; // Constructor for per-tensor quantization Requantize32(const int32_t *bias, size_t bias_multi_stride, int32_t a_offset, int32_t b_offset, int32_t c_offset, - int32_t requant_shift, int32_t requant_mul, - int32_t minv, int32_t maxv) - : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_shift(requant_shift), per_layer_mul(requant_mul), - minval(minv), maxval(maxv) + int32_t requant_shift, int32_t requant_mul, int32_t minv, int32_t maxv) + : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_left_shift(std::max(requant_shift, 0)), + per_layer_right_shift(std::min(requant_shift, 0)), per_layer_mul(requant_mul), minval(minv), maxval(maxv) { } // Constructor for per-channel quantization Requantize32(const int32_t *bias, size_t bias_multi_stride, int32_t a_offset, int32_t b_offset, int32_t c_offset, - const int32_t *requant_shifts, const int32_t *requant_muls, + const int32_t *requant_left_shifts, + const int32_t *requant_right_shifts, + const int32_t *requant_muls, int32_t minv, int32_t maxv) - : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(true), per_channel_shifts(requant_shifts), - per_channel_muls(requant_muls), minval(minv), maxval(maxv) + : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(true), per_channel_left_shifts(requant_left_shifts), + per_channel_right_shifts(requant_right_shifts), per_channel_muls(requant_muls), minval(minv), maxval(maxv) { } }; diff --git a/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp b/src/core/NEON/kernels/assembly/arm_gemm_local.hpp similarity index 97% rename from arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp rename to src/core/NEON/kernels/assembly/arm_gemm_local.hpp index de92cce653..4715f2500a 100644 --- a/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp +++ b/src/core/NEON/kernels/assembly/arm_gemm_local.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -32,6 +32,3 @@ namespace arm_gemm using CPUModel = arm_compute::CPUModel; using CPUInfo = arm_compute::CPUInfo; } // namespace arm_compute - - - diff --git a/src/core/NEON/kernels/assembly/convolution_parameters.hpp b/src/core/NEON/kernels/assembly/convolution_parameters.hpp new file mode 100644 index 0000000000..d0ef5b539f --- /dev/null +++ b/src/core/NEON/kernels/assembly/convolution_parameters.hpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2018-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include <cstdint> + +namespace arm_gemm +{ +/* + * Parameter set for "convolution" type GEMM. + * + * For a "convolution" GEMM, the GEMM parameters (M, K) are specified as if + * an im2row had been performed on the input tensor to generate the operand + * matrix, but instead this structure describes the convolution parameters + * such that this can be done on the fly. + * + * The parameters describe the convolution details - the notional shape of + * the input and output tensors, whether padding is to be applied, the size + * of the kernel and a constant value to be used for padding (needed for + * quantized tensors). + * + * The second part describes the layout of the input tensor in memory, which + * is assumed to be in NHWC format. This consists of a base pointer and + * strides for columns, rows and batches. 'multis' are not supported for + * convolution type GEMMs. + */ +struct ConvolutionParameters +{ + int64_t input_width; + int64_t input_height; + int64_t input_channels; + int64_t kernel_width; + int64_t kernel_height; + int64_t output_width; + int64_t output_height; + int64_t output_stride_w; + int64_t output_stride_h; + // output_channels not included as they do not affect the input. + int64_t padding_top; + int64_t padding_left; + float padding_value; +}; + +} // namespace arm_gemm diff --git a/src/core/NEON/kernels/assembly/gemm_common.hpp b/src/core/NEON/kernels/assembly/gemm_common.hpp index e9e56842c7..e1fb7a45a8 100644 --- a/src/core/NEON/kernels/assembly/gemm_common.hpp +++ b/src/core/NEON/kernels/assembly/gemm_common.hpp @@ -23,6 +23,7 @@ */ #pragma once +#include "convolution_parameters.hpp" #include "ndrange.hpp" #include <cstddef> @@ -77,7 +78,7 @@ class IGemmCommon return false; } - /** Main execute member fucntion + /** Main execute member function * @param [in] work_range specifies the range of work we want to be computed, total range defined by get_window_size() * @param [in] thread_locator where are we inside of the thread space * @param [in] threadid a unique threadid @@ -123,6 +124,19 @@ { } + /*** Indirect interface (optional) ***/ + /* Set the indirect table. 
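+ (The table lets a kernel gather its input rows through pointers, as with IndirectInputArg in utils.hpp, instead of reading a dense strided operand matrix.)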
This comprises a number of values per kernel point, and a densely packed array of pointers, + * multis * batches * kernel_points */ + virtual void set_indirect_parameters_generic(size_t, const void *const *const *) + { + } + + /*** Convolution interface (optional) ***/ + /* Set the convolution parameters. */ + virtual void set_convolution_parameters(ConvolutionParameters) + { + } + // Destructor virtual ~IGemmCommon() { @@ -200,6 +214,16 @@ class GemmCommon : public IGemmCommon { pretranspose_B_array(out, static_cast(in), row_stride, multi_stride); } + + /*** Indirect interface ***/ + virtual void set_indirect_parameters(size_t, const To *const *const *) + { + } + + void set_indirect_parameters_generic(size_t sz, const void *const *const *ptr) override + { + set_indirect_parameters(sz, reinterpret_cast(ptr)); + } }; } // namespace arm_gemm diff --git a/arm_compute/core/NEON/kernels/convolution/common/activation.hpp b/src/core/NEON/kernels/convolution/common/activation.hpp similarity index 100% rename from arm_compute/core/NEON/kernels/convolution/common/activation.hpp rename to src/core/NEON/kernels/convolution/common/activation.hpp diff --git a/arm_compute/core/NEON/kernels/convolution/common/alloc.hpp b/src/core/NEON/kernels/convolution/common/alloc.hpp similarity index 100% rename from arm_compute/core/NEON/kernels/convolution/common/alloc.hpp rename to src/core/NEON/kernels/convolution/common/alloc.hpp diff --git a/arm_compute/core/NEON/kernels/convolution/common/arm.hpp b/src/core/NEON/kernels/convolution/common/arm.hpp similarity index 100% rename from arm_compute/core/NEON/kernels/convolution/common/arm.hpp rename to src/core/NEON/kernels/convolution/common/arm.hpp diff --git a/arm_compute/core/NEON/kernels/convolution/common/convolution.hpp b/src/core/NEON/kernels/convolution/common/convolution.hpp similarity index 100% rename from arm_compute/core/NEON/kernels/convolution/common/convolution.hpp rename to src/core/NEON/kernels/convolution/common/convolution.hpp diff --git a/arm_compute/core/NEON/kernels/convolution/common/padding.hpp b/src/core/NEON/kernels/convolution/common/padding.hpp similarity index 100% rename from arm_compute/core/NEON/kernels/convolution/common/padding.hpp rename to src/core/NEON/kernels/convolution/common/padding.hpp diff --git a/arm_compute/core/NEON/kernels/convolution/common/perf.h b/src/core/NEON/kernels/convolution/common/perf.h similarity index 100% rename from arm_compute/core/NEON/kernels/convolution/common/perf.h rename to src/core/NEON/kernels/convolution/common/perf.h diff --git a/arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp b/src/core/NEON/kernels/convolution/common/qasymm8.hpp similarity index 100% rename from arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp rename to src/core/NEON/kernels/convolution/common/qasymm8.hpp diff --git a/arm_compute/core/NEON/kernels/convolution/common/qsymm8.hpp b/src/core/NEON/kernels/convolution/common/qsymm8.hpp similarity index 100% rename from arm_compute/core/NEON/kernels/convolution/common/qsymm8.hpp rename to src/core/NEON/kernels/convolution/common/qsymm8.hpp diff --git a/arm_compute/core/NEON/kernels/convolution/common/shims.hpp b/src/core/NEON/kernels/convolution/common/shims.hpp similarity index 100% rename from arm_compute/core/NEON/kernels/convolution/common/shims.hpp rename to src/core/NEON/kernels/convolution/common/shims.hpp diff --git a/arm_compute/core/NEON/kernels/convolution/common/tensor.hpp b/src/core/NEON/kernels/convolution/common/tensor.hpp similarity index 
100% rename from arm_compute/core/NEON/kernels/convolution/common/tensor.hpp rename to src/core/NEON/kernels/convolution/common/tensor.hpp diff --git a/arm_compute/core/NEON/kernels/convolution/common/tensor_utils.hpp b/src/core/NEON/kernels/convolution/common/tensor_utils.hpp similarity index 100% rename from arm_compute/core/NEON/kernels/convolution/common/tensor_utils.hpp rename to src/core/NEON/kernels/convolution/common/tensor_utils.hpp diff --git a/arm_compute/core/NEON/kernels/convolution/common/utils.hpp b/src/core/NEON/kernels/convolution/common/utils.hpp similarity index 100% rename from arm_compute/core/NEON/kernels/convolution/common/utils.hpp rename to src/core/NEON/kernels/convolution/common/utils.hpp diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise.hpp similarity index 100% rename from arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp rename to src/core/NEON/kernels/convolution/depthwise/depthwise.hpp diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp similarity index 100% rename from arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp rename to src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp similarity index 100% rename from arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp rename to src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp similarity index 100% rename from arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp rename to src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp diff --git a/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h similarity index 99% rename from arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h rename to src/core/NEON/kernels/detail/NEActivationFunctionDetail.h index 067a18cb62..eef1be06eb 100644 --- a/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h +++ b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H #define ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H -#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { diff --git a/arm_compute/core/NEON/NEColorConvertHelper.inl b/src/core/NEON/kernels/detail/NEColorConvertHelper.inl similarity index 99% rename from arm_compute/core/NEON/NEColorConvertHelper.inl rename to src/core/NEON/kernels/detail/NEColorConvertHelper.inl index 9fc1be5406..ac196d9dbb 100644 --- a/arm_compute/core/NEON/NEColorConvertHelper.inl +++ b/src/core/NEON/kernels/detail/NEColorConvertHelper.inl @@ -24,8 +24,8 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IMultiImage.h" -#include "arm_compute/core/NEON/NEMath.h" #include "arm_compute/core/Utils.h" +#include "src/core/NEON/NEMath.h" #include diff --git 
a/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h similarity index 98% rename from arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h rename to src/core/NEON/kernels/detail/NEDirectConvolution3x3.h index 41ad8fc706..96defbc9c9 100644 --- a/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h +++ b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h similarity index 99% rename from arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h rename to src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h index 78f08fdca6..59f5c6c6b3 100644 --- a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h +++ b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h @@ -25,10 +25,10 @@ #ifndef ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H #define ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H -#include "arm_compute/core/AccessWindowStatic.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" -#include "arm_compute/core/utils/misc/Requires.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "support/Requires.h" #include diff --git a/src/core/NEON/kernels/floor/impl/fp16_neon_floor.cpp b/src/core/NEON/kernels/floor/impl/fp16_neon_floor.cpp new file mode 100644 index 0000000000..4f56ca9daf --- /dev/null +++ b/src/core/NEON/kernels/floor/impl/fp16_neon_floor.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "src/core/NEON/NEMath.h" +#include "src/core/common/StdTypes.h" +#include "src/core/common/Validate.h" + +#include <arm_neon.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace cpu +{ +constexpr int step = 8; + +void fp16_neon_floor(const void *src, void *dst, int len) +{ + ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); + ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); + ARM_COMPUTE_ASSERT(len >= 0); + + auto psrc = static_cast<const f16 *>(src); + auto pdst = static_cast<f16 *>(dst); + + for(; len >= step; len -= step) + { + vst1q_f16(pdst, vfloorq_f16(vld1q_f16(psrc))); + psrc += step; + pdst += step; + } + + for(; len > 0; --len) + { + *pdst = std::floor(*psrc); + ++psrc; + ++pdst; + } +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/tests/validation/reference/LogSoftmaxLayer.h b/src/core/NEON/kernels/floor/impl/fp32_neon_floor.cpp similarity index 59% rename from tests/validation/reference/LogSoftmaxLayer.h rename to src/core/NEON/kernels/floor/impl/fp32_neon_floor.cpp index db945074a2..3f4b14b3e5 100644 --- a/tests/validation/reference/LogSoftmaxLayer.h +++ b/src/core/NEON/kernels/floor/impl/fp32_neon_floor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,27 +21,42 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_LOG_SOFTMAX_LAYER_H -#define ARM_COMPUTE_TEST_LOG_SOFTMAX_LAYER_H +#include "src/core/NEON/NEMath.h" +#include "src/core/common/StdTypes.h" +#include "src/core/common/Validate.h" -#include "tests/SimpleTensor.h" -#include "tests/validation/Helpers.h" +#include <arm_neon.h> +#include <cmath> +#include <cstddef> namespace arm_compute { -namespace test +namespace cpu { -namespace validation -{ -namespace reference +constexpr int step = 4; + +void fp32_neon_floor(const void *src, void *dst, int len) { -template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0> -SimpleTensor<T> log_softmax_layer(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis = 0); + ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); + ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); + ARM_COMPUTE_ASSERT(len >= 0); + + auto psrc = static_cast<const f32 *>(src); + auto pdst = static_cast<f32 *>(dst); + + for(; len >= step; len -= step) + { + vst1q_f32(pdst, vfloorq_f32(vld1q_f32(psrc))); + psrc += step; + pdst += step; + } -template < typename T, typename std::enable_if < std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int >::type = 0 > -SimpleTensor<T> log_softmax_layer(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis = 0); -} // namespace reference -} // namespace validation -} // namespace test + for(; len > 0; --len) + { + *pdst = std::floor(*psrc); + ++pdst; + ++psrc; + } +} +} // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_SOFTMAX_LAYER_H */ diff --git a/src/core/NEON/kernels/floor/impl/list.h b/src/core/NEON/kernels/floor/impl/list.h new file mode 100644 index 0000000000..0eb66e0090 --- /dev/null +++ b/src/core/NEON/kernels/floor/impl/list.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2020 Arm Limited.
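// Usage sketch for the floor kernels above (an illustration, not part of the
// patch): callers hand each kernel type-erased pointers plus an element count,
// and the kernel vectorises the bulk of the run before finishing with a scalar
// tail loop. The helper below is hypothetical; the library's real selection
// logic lives in the NEON floor kernel, not here.
#include "src/core/NEON/kernels/floor/impl/list.h"

static void floor_f32_example(const float *src, float *dst, int len)
{
    // fp32_neon_floor processes four lanes per iteration, then the remainder.
    arm_compute::cpu::fp32_neon_floor(src, dst, len);
}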
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_FLOOR_LIST_H +#define SRC_CORE_NEON_KERNELS_FLOOR_LIST_H + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_FLOOR_KERNEL(func_name) \ + void func_name(const void *src, void *dst, int len) + +DECLARE_FLOOR_KERNEL(fp16_neon_floor); +DECLARE_FLOOR_KERNEL(fp32_neon_floor); + +#undef DECLARE_FLOOR_KERNEL +} // namespace cpu +} // namespace arm_compute + +#endif /* SRC_CORE_NEON_KERNELS_FLOOR_LIST_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/abs.h b/src/core/NEON/wrapper/intrinsics/abs.h similarity index 98% rename from arm_compute/core/NEON/wrapper/intrinsics/abs.h rename to src/core/NEON/wrapper/intrinsics/abs.h index 6927fa64a5..0d49a9ebf1 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/abs.h +++ b/src/core/NEON/wrapper/intrinsics/abs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/add.h b/src/core/NEON/wrapper/intrinsics/add.h similarity index 99% rename from arm_compute/core/NEON/wrapper/intrinsics/add.h rename to src/core/NEON/wrapper/intrinsics/add.h index 5bca891da5..6134d75b29 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/add.h +++ b/src/core/NEON/wrapper/intrinsics/add.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/and.h b/src/core/NEON/wrapper/intrinsics/and.h similarity index 98% rename from arm_compute/core/NEON/wrapper/intrinsics/and.h rename to src/core/NEON/wrapper/intrinsics/and.h index 8fffe35b8c..6ff7df3f5a 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/and.h +++ b/src/core/NEON/wrapper/intrinsics/and.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/bsl.h b/src/core/NEON/wrapper/intrinsics/bsl.h similarity index 98% rename from arm_compute/core/NEON/wrapper/intrinsics/bsl.h rename to src/core/NEON/wrapper/intrinsics/bsl.h index 6d01b8a685..01c1cce3a6 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/bsl.h +++ b/src/core/NEON/wrapper/intrinsics/bsl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/ceq.h b/src/core/NEON/wrapper/intrinsics/ceq.h similarity index 98% rename from arm_compute/core/NEON/wrapper/intrinsics/ceq.h rename to src/core/NEON/wrapper/intrinsics/ceq.h index a84984d190..b0324e63db 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/ceq.h +++ b/src/core/NEON/wrapper/intrinsics/ceq.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/cge.h b/src/core/NEON/wrapper/intrinsics/cge.h similarity index 98% rename from arm_compute/core/NEON/wrapper/intrinsics/cge.h rename to src/core/NEON/wrapper/intrinsics/cge.h index ac2973bed4..e4a7fcd423 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/cge.h +++ b/src/core/NEON/wrapper/intrinsics/cge.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/cgt.h b/src/core/NEON/wrapper/intrinsics/cgt.h similarity index 98% rename from arm_compute/core/NEON/wrapper/intrinsics/cgt.h rename to src/core/NEON/wrapper/intrinsics/cgt.h index c7ae2caefe..f34d02fd1b 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/cgt.h +++ b/src/core/NEON/wrapper/intrinsics/cgt.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/cle.h b/src/core/NEON/wrapper/intrinsics/cle.h similarity index 100% rename from arm_compute/core/NEON/wrapper/intrinsics/cle.h rename to src/core/NEON/wrapper/intrinsics/cle.h diff --git a/arm_compute/core/NEON/wrapper/intrinsics/clt.h b/src/core/NEON/wrapper/intrinsics/clt.h similarity index 98% rename from arm_compute/core/NEON/wrapper/intrinsics/clt.h rename to src/core/NEON/wrapper/intrinsics/clt.h index 2d1ea2863e..10fd320e4c 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/clt.h +++ b/src/core/NEON/wrapper/intrinsics/clt.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/combine.h b/src/core/NEON/wrapper/intrinsics/combine.h similarity index 98% rename from arm_compute/core/NEON/wrapper/intrinsics/combine.h rename to src/core/NEON/wrapper/intrinsics/combine.h index c9d5bf8d90..8b6a588f51 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/combine.h +++ b/src/core/NEON/wrapper/intrinsics/combine.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/cvt.h b/src/core/NEON/wrapper/intrinsics/cvt.h similarity index 100% rename from arm_compute/core/NEON/wrapper/intrinsics/cvt.h rename to src/core/NEON/wrapper/intrinsics/cvt.h diff --git a/arm_compute/core/NEON/wrapper/intrinsics/div.h b/src/core/NEON/wrapper/intrinsics/div.h similarity index 97% rename from arm_compute/core/NEON/wrapper/intrinsics/div.h rename to src/core/NEON/wrapper/intrinsics/div.h index 5731aba469..265f30d33b 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/div.h +++ b/src/core/NEON/wrapper/intrinsics/div.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_WRAPPER_DIV_H #define ARM_COMPUTE_WRAPPER_DIV_H -#include "arm_compute/core/NEON/NEMath.h" +#include "src/core/NEON/NEMath.h" #include namespace arm_compute diff --git a/arm_compute/core/NEON/wrapper/intrinsics/dup_n.h b/src/core/NEON/wrapper/intrinsics/dup_n.h similarity index 96% rename from arm_compute/core/NEON/wrapper/intrinsics/dup_n.h rename to src/core/NEON/wrapper/intrinsics/dup_n.h index 80d4c4074f..e745aa4a8c 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/dup_n.h +++ b/src/core/NEON/wrapper/intrinsics/dup_n.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_WRAPPER_DUP_N_H #define ARM_COMPUTE_WRAPPER_DUP_N_H -#include "arm_compute/core/NEON/wrapper/traits.h" +#include "src/core/NEON/wrapper/traits.h" #include diff --git a/arm_compute/core/NEON/wrapper/intrinsics/eor.h b/src/core/NEON/wrapper/intrinsics/eor.h similarity index 98% rename from arm_compute/core/NEON/wrapper/intrinsics/eor.h rename to src/core/NEON/wrapper/intrinsics/eor.h index 227a743c3d..ce88cf59e7 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/eor.h +++ b/src/core/NEON/wrapper/intrinsics/eor.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/exp.h b/src/core/NEON/wrapper/intrinsics/exp.h similarity index 97% rename from arm_compute/core/NEON/wrapper/intrinsics/exp.h rename to src/core/NEON/wrapper/intrinsics/exp.h index d50824b132..c2a6970967 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/exp.h +++ b/src/core/NEON/wrapper/intrinsics/exp.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_WRAPPER_EXP_H #define ARM_COMPUTE_WRAPPER_EXP_H -#include "arm_compute/core/NEON/NEMath.h" +#include "src/core/NEON/NEMath.h" #include namespace arm_compute diff --git a/arm_compute/core/NEON/wrapper/intrinsics/ext.h b/src/core/NEON/wrapper/intrinsics/ext.h similarity index 100% rename from arm_compute/core/NEON/wrapper/intrinsics/ext.h rename to src/core/NEON/wrapper/intrinsics/ext.h diff --git a/arm_compute/core/NEON/wrapper/intrinsics/gethigh.h b/src/core/NEON/wrapper/intrinsics/gethigh.h similarity index 98% rename from arm_compute/core/NEON/wrapper/intrinsics/gethigh.h rename to src/core/NEON/wrapper/intrinsics/gethigh.h index d98e129cd9..d098a27335 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/gethigh.h +++ b/src/core/NEON/wrapper/intrinsics/gethigh.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/getlane.h b/src/core/NEON/wrapper/intrinsics/getlane.h similarity index 100% rename from arm_compute/core/NEON/wrapper/intrinsics/getlane.h rename to src/core/NEON/wrapper/intrinsics/getlane.h diff --git a/arm_compute/core/NEON/wrapper/intrinsics/getlow.h b/src/core/NEON/wrapper/intrinsics/getlow.h similarity index 98% rename from arm_compute/core/NEON/wrapper/intrinsics/getlow.h rename to src/core/NEON/wrapper/intrinsics/getlow.h index b85b6cabf4..b5469f0eab 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/getlow.h +++ b/src/core/NEON/wrapper/intrinsics/getlow.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * diff --git a/src/core/NEON/wrapper/intrinsics/intrinsics.h b/src/core/NEON/wrapper/intrinsics/intrinsics.h new file mode 100644 index 0000000000..070f3c7065 --- /dev/null +++ b/src/core/NEON/wrapper/intrinsics/intrinsics.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2018-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_WRAPPER_INTRINSICS_H +#define ARM_COMPUTE_WRAPPER_INTRINSICS_H + +#include "src/core/NEON/wrapper/intrinsics/abs.h" +#include "src/core/NEON/wrapper/intrinsics/add.h" +#include "src/core/NEON/wrapper/intrinsics/and.h" +#include "src/core/NEON/wrapper/intrinsics/bsl.h" +#include "src/core/NEON/wrapper/intrinsics/ceq.h" +#include "src/core/NEON/wrapper/intrinsics/cge.h" +#include "src/core/NEON/wrapper/intrinsics/cgt.h" +#include "src/core/NEON/wrapper/intrinsics/cle.h" +#include "src/core/NEON/wrapper/intrinsics/clt.h" +#include "src/core/NEON/wrapper/intrinsics/combine.h" +#include "src/core/NEON/wrapper/intrinsics/cvt.h" +#include "src/core/NEON/wrapper/intrinsics/div.h" +#include "src/core/NEON/wrapper/intrinsics/dup_n.h" +#include "src/core/NEON/wrapper/intrinsics/eor.h" +#include "src/core/NEON/wrapper/intrinsics/exp.h" +#include "src/core/NEON/wrapper/intrinsics/ext.h" +#include "src/core/NEON/wrapper/intrinsics/gethigh.h" +#include "src/core/NEON/wrapper/intrinsics/getlane.h" +#include "src/core/NEON/wrapper/intrinsics/getlow.h" +#include "src/core/NEON/wrapper/intrinsics/inv.h" +#include "src/core/NEON/wrapper/intrinsics/invsqrt.h" +#include "src/core/NEON/wrapper/intrinsics/load.h" +#include "src/core/NEON/wrapper/intrinsics/log.h" +#include "src/core/NEON/wrapper/intrinsics/max.h" +#include "src/core/NEON/wrapper/intrinsics/min.h" +#include "src/core/NEON/wrapper/intrinsics/mla.h" +#include "src/core/NEON/wrapper/intrinsics/movl.h" +#include "src/core/NEON/wrapper/intrinsics/movn.h" +#include "src/core/NEON/wrapper/intrinsics/mul.h" +#include "src/core/NEON/wrapper/intrinsics/neg.h" +#include "src/core/NEON/wrapper/intrinsics/not.h" +#include "src/core/NEON/wrapper/intrinsics/orr.h" +#include "src/core/NEON/wrapper/intrinsics/pmax.h" +#include "src/core/NEON/wrapper/intrinsics/pmin.h" +#include "src/core/NEON/wrapper/intrinsics/pow.h" +#include "src/core/NEON/wrapper/intrinsics/qmov.h" +#include "src/core/NEON/wrapper/intrinsics/qmovun.h" +#include "src/core/NEON/wrapper/intrinsics/reinterpret.h" +#include 
"src/core/NEON/wrapper/intrinsics/rev64.h" +#include "src/core/NEON/wrapper/intrinsics/round.h" +#include "src/core/NEON/wrapper/intrinsics/setlane.h" +#include "src/core/NEON/wrapper/intrinsics/sin.h" +#include "src/core/NEON/wrapper/intrinsics/sqrt.h" +#include "src/core/NEON/wrapper/intrinsics/store.h" +#include "src/core/NEON/wrapper/intrinsics/sub.h" +#include "src/core/NEON/wrapper/intrinsics/tanh.h" +#include "src/core/NEON/wrapper/intrinsics/tbl.h" + +#endif /* ARM_COMPUTE_WRAPPER_INTRINSICS_H */ diff --git a/arm_compute/core/NEON/wrapper/intrinsics/inv.h b/src/core/NEON/wrapper/intrinsics/inv.h similarity index 96% rename from arm_compute/core/NEON/wrapper/intrinsics/inv.h rename to src/core/NEON/wrapper/intrinsics/inv.h index 889d176670..de398b0403 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/inv.h +++ b/src/core/NEON/wrapper/intrinsics/inv.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_WRAPPER_INV_H #define ARM_COMPUTE_WRAPPER_INV_H -#include "arm_compute/core/NEON/NEMath.h" +#include "src/core/NEON/NEMath.h" #include namespace arm_compute diff --git a/arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h b/src/core/NEON/wrapper/intrinsics/invsqrt.h similarity index 98% rename from arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h rename to src/core/NEON/wrapper/intrinsics/invsqrt.h index 8269afe1a2..2343efa8f8 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h +++ b/src/core/NEON/wrapper/intrinsics/invsqrt.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_WRAPPER_INVSQRT_H #define ARM_COMPUTE_WRAPPER_INVSQRT_H -#include "arm_compute/core/NEON/NEMath.h" +#include "src/core/NEON/NEMath.h" #include namespace arm_compute diff --git a/arm_compute/core/NEON/wrapper/intrinsics/load.h b/src/core/NEON/wrapper/intrinsics/load.h similarity index 98% rename from arm_compute/core/NEON/wrapper/intrinsics/load.h rename to src/core/NEON/wrapper/intrinsics/load.h index 0fdf705d61..a2116c028b 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/load.h +++ b/src/core/NEON/wrapper/intrinsics/load.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/log.h b/src/core/NEON/wrapper/intrinsics/log.h similarity index 97% rename from arm_compute/core/NEON/wrapper/intrinsics/log.h rename to src/core/NEON/wrapper/intrinsics/log.h index 83de420f91..357a77ca78 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/log.h +++ b/src/core/NEON/wrapper/intrinsics/log.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_WRAPPER_LOG_H #define ARM_COMPUTE_WRAPPER_LOG_H -#include "arm_compute/core/NEON/NEMath.h" +#include "src/core/NEON/NEMath.h" #include namespace arm_compute diff --git a/arm_compute/core/NEON/wrapper/intrinsics/max.h b/src/core/NEON/wrapper/intrinsics/max.h similarity index 98% rename from arm_compute/core/NEON/wrapper/intrinsics/max.h rename to src/core/NEON/wrapper/intrinsics/max.h index 7e52089b56..cec437d171 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/max.h +++ b/src/core/NEON/wrapper/intrinsics/max.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/min.h b/src/core/NEON/wrapper/intrinsics/min.h similarity index 98% rename from arm_compute/core/NEON/wrapper/intrinsics/min.h rename to src/core/NEON/wrapper/intrinsics/min.h index b287598375..8afcb3cb10 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/min.h +++ b/src/core/NEON/wrapper/intrinsics/min.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/mla.h b/src/core/NEON/wrapper/intrinsics/mla.h similarity index 98% rename from arm_compute/core/NEON/wrapper/intrinsics/mla.h rename to src/core/NEON/wrapper/intrinsics/mla.h index 2c89cfdcff..2b38b34137 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/mla.h +++ b/src/core/NEON/wrapper/intrinsics/mla.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/movl.h b/src/core/NEON/wrapper/intrinsics/movl.h similarity index 97% rename from arm_compute/core/NEON/wrapper/intrinsics/movl.h rename to src/core/NEON/wrapper/intrinsics/movl.h index fd97a44841..99f2150eab 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/movl.h +++ b/src/core/NEON/wrapper/intrinsics/movl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/movn.h b/src/core/NEON/wrapper/intrinsics/movn.h similarity index 98% rename from arm_compute/core/NEON/wrapper/intrinsics/movn.h rename to src/core/NEON/wrapper/intrinsics/movn.h index ed3b159fa2..460c277540 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/movn.h +++ b/src/core/NEON/wrapper/intrinsics/movn.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/mul.h b/src/core/NEON/wrapper/intrinsics/mul.h similarity index 98% rename from arm_compute/core/NEON/wrapper/intrinsics/mul.h rename to src/core/NEON/wrapper/intrinsics/mul.h index 88ea87aeef..6296fff35a 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/mul.h +++ b/src/core/NEON/wrapper/intrinsics/mul.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/neg.h b/src/core/NEON/wrapper/intrinsics/neg.h similarity index 98% rename from arm_compute/core/NEON/wrapper/intrinsics/neg.h rename to src/core/NEON/wrapper/intrinsics/neg.h index c0c73dcaaf..5e4556664e 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/neg.h +++ b/src/core/NEON/wrapper/intrinsics/neg.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/not.h b/src/core/NEON/wrapper/intrinsics/not.h similarity index 98% rename from arm_compute/core/NEON/wrapper/intrinsics/not.h rename to src/core/NEON/wrapper/intrinsics/not.h index 084b2a4944..5853e849a2 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/not.h +++ b/src/core/NEON/wrapper/intrinsics/not.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/orr.h b/src/core/NEON/wrapper/intrinsics/orr.h similarity index 98% rename from arm_compute/core/NEON/wrapper/intrinsics/orr.h rename to src/core/NEON/wrapper/intrinsics/orr.h index 13979fe539..cc83e95d15 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/orr.h +++ b/src/core/NEON/wrapper/intrinsics/orr.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/pmax.h b/src/core/NEON/wrapper/intrinsics/pmax.h similarity index 98% rename from arm_compute/core/NEON/wrapper/intrinsics/pmax.h rename to src/core/NEON/wrapper/intrinsics/pmax.h index ba8d9cc6c4..cd2b2d1f41 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/pmax.h +++ b/src/core/NEON/wrapper/intrinsics/pmax.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/pmin.h b/src/core/NEON/wrapper/intrinsics/pmin.h similarity index 98% rename from arm_compute/core/NEON/wrapper/intrinsics/pmin.h rename to src/core/NEON/wrapper/intrinsics/pmin.h index 45e64a834a..59b6be69ce 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/pmin.h +++ b/src/core/NEON/wrapper/intrinsics/pmin.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/pow.h b/src/core/NEON/wrapper/intrinsics/pow.h similarity index 95% rename from arm_compute/core/NEON/wrapper/intrinsics/pow.h rename to src/core/NEON/wrapper/intrinsics/pow.h index bffbc4f7b2..61f834ed23 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/pow.h +++ b/src/core/NEON/wrapper/intrinsics/pow.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_WRAPPER_POW_H #define ARM_COMPUTE_WRAPPER_POW_H -#include "arm_compute/core/NEON/NEMath.h" +#include "src/core/NEON/NEMath.h" #include namespace arm_compute diff --git a/arm_compute/core/NEON/wrapper/intrinsics/qmov.h b/src/core/NEON/wrapper/intrinsics/qmov.h similarity index 100% rename from arm_compute/core/NEON/wrapper/intrinsics/qmov.h rename to src/core/NEON/wrapper/intrinsics/qmov.h diff --git a/arm_compute/core/NEON/wrapper/intrinsics/qmovun.h b/src/core/NEON/wrapper/intrinsics/qmovun.h similarity index 100% rename from arm_compute/core/NEON/wrapper/intrinsics/qmovun.h rename to src/core/NEON/wrapper/intrinsics/qmovun.h diff --git a/arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h b/src/core/NEON/wrapper/intrinsics/reinterpret.h similarity index 100% rename from arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h rename to src/core/NEON/wrapper/intrinsics/reinterpret.h diff --git a/arm_compute/core/NEON/wrapper/intrinsics/rev64.h b/src/core/NEON/wrapper/intrinsics/rev64.h similarity index 98% rename from arm_compute/core/NEON/wrapper/intrinsics/rev64.h rename to src/core/NEON/wrapper/intrinsics/rev64.h index 1119c34654..0f0139c93b 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/rev64.h +++ b/src/core/NEON/wrapper/intrinsics/rev64.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/round.h b/src/core/NEON/wrapper/intrinsics/round.h similarity index 97% rename from arm_compute/core/NEON/wrapper/intrinsics/round.h rename to src/core/NEON/wrapper/intrinsics/round.h index dd068ea709..d23feb6b42 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/round.h +++ b/src/core/NEON/wrapper/intrinsics/round.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_WRAPPER_ROUND_H #define ARM_COMPUTE_WRAPPER_ROUND_H -#include "arm_compute/core/NEON/NEMath.h" +#include "src/core/NEON/NEMath.h" #include namespace arm_compute diff --git a/arm_compute/core/NEON/wrapper/intrinsics/setlane.h b/src/core/NEON/wrapper/intrinsics/setlane.h similarity index 100% rename from arm_compute/core/NEON/wrapper/intrinsics/setlane.h rename to src/core/NEON/wrapper/intrinsics/setlane.h diff --git a/arm_compute/core/NEON/wrapper/intrinsics/sin.h b/src/core/NEON/wrapper/intrinsics/sin.h similarity index 96% rename from arm_compute/core/NEON/wrapper/intrinsics/sin.h rename to src/core/NEON/wrapper/intrinsics/sin.h index 7c9cc468ed..03c2813a32 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/sin.h +++ b/src/core/NEON/wrapper/intrinsics/sin.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_WRAPPER_SIN_H #define ARM_COMPUTE_WRAPPER_SIN_H -#include "arm_compute/core/NEON/NEMath.h" +#include "src/core/NEON/NEMath.h" #include namespace arm_compute diff --git a/src/core/NEON/wrapper/intrinsics/sqrt.h b/src/core/NEON/wrapper/intrinsics/sqrt.h new file mode 100644 index 0000000000..11954cf6c9 --- /dev/null +++ b/src/core/NEON/wrapper/intrinsics/sqrt.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_WRAPPER_SQRT_H +#define ARM_COMPUTE_WRAPPER_SQRT_H + +#ifdef __aarch64__ + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace wrapper +{ +#define VSQRT_IMPL(type, prefix, postfix) \ + inline type vsqrt(const type &a) \ + { \ + return prefix##_##postfix(a); \ + } + +VSQRT_IMPL(float32x2_t, vsqrt, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VSQRT_IMPL(float16x4_t, vsqrt, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +VSQRT_IMPL(float32x4_t, vsqrtq, f32) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +VSQRT_IMPL(float16x8_t, vsqrtq, f16) +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +} // namespace wrapper +} // namespace arm_compute + +#endif // __aarch64__ + +#endif /* ARM_COMPUTE_WRAPPER_SQRT_H */ \ No newline at end of file diff --git a/arm_compute/core/NEON/wrapper/intrinsics/store.h b/src/core/NEON/wrapper/intrinsics/store.h similarity index 100% rename from arm_compute/core/NEON/wrapper/intrinsics/store.h rename to src/core/NEON/wrapper/intrinsics/store.h diff --git a/arm_compute/core/NEON/wrapper/intrinsics/sub.h b/src/core/NEON/wrapper/intrinsics/sub.h similarity index 100% rename from arm_compute/core/NEON/wrapper/intrinsics/sub.h rename to src/core/NEON/wrapper/intrinsics/sub.h diff --git a/arm_compute/core/NEON/wrapper/intrinsics/tanh.h b/src/core/NEON/wrapper/intrinsics/tanh.h similarity index 95% rename from arm_compute/core/NEON/wrapper/intrinsics/tanh.h rename to src/core/NEON/wrapper/intrinsics/tanh.h index 2943b9b1ea..daeaf19997 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/tanh.h +++ b/src/core/NEON/wrapper/intrinsics/tanh.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_WRAPPER_TANH_H #define ARM_COMPUTE_WRAPPER_TANH_H -#include "arm_compute/core/NEON/NEMath.h" +#include "src/core/NEON/NEMath.h" #include <arm_neon.h> namespace arm_compute diff --git a/arm_compute/core/NEON/wrapper/intrinsics/tbl.h b/src/core/NEON/wrapper/intrinsics/tbl.h similarity index 100% rename from arm_compute/core/NEON/wrapper/intrinsics/tbl.h rename to src/core/NEON/wrapper/intrinsics/tbl.h diff --git a/arm_compute/core/NEON/wrapper/scalar/add.h b/src/core/NEON/wrapper/scalar/add.h similarity index 100% rename from arm_compute/core/NEON/wrapper/scalar/add.h rename to src/core/NEON/wrapper/scalar/add.h diff --git a/arm_compute/core/NEON/wrapper/scalar/scalar.h b/src/core/NEON/wrapper/scalar/scalar.h similarity index 92% rename from arm_compute/core/NEON/wrapper/scalar/scalar.h rename to src/core/NEON/wrapper/scalar/scalar.h index 1bc50c2740..8be37e55ba 100644 --- a/arm_compute/core/NEON/wrapper/scalar/scalar.h +++ b/src/core/NEON/wrapper/scalar/scalar.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_WRAPPER_SCALAR_H #define ARM_COMPUTE_WRAPPER_SCALAR_H -#include "arm_compute/core/NEON/wrapper/scalar/add.h" -#include "arm_compute/core/NEON/wrapper/scalar/sub.h" +#include "src/core/NEON/wrapper/scalar/add.h" +#include "src/core/NEON/wrapper/scalar/sub.h" #endif /* ARM_COMPUTE_WRAPPER_SCALAR_H */ diff --git a/arm_compute/core/NEON/wrapper/scalar/sub.h b/src/core/NEON/wrapper/scalar/sub.h similarity index 92% rename from arm_compute/core/NEON/wrapper/scalar/sub.h rename to src/core/NEON/wrapper/scalar/sub.h index 9abda26224..1fe51d75fc 100644 --- a/arm_compute/core/NEON/wrapper/scalar/sub.h +++ b/src/core/NEON/wrapper/scalar/sub.h @@ -44,6 +44,13 @@ inline int16_t sub_sat(const int16_t &a, const int16_t &b) return vget_lane_s16(vqsub_s16(va, vb), 0);
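// Usage sketch for the new vsqrt wrapper above (illustrative, not part of the
// patch): the VSQRT_IMPL overload set lets templated NEON code call
// wrapper::vsqrt() uniformly, resolving to vsqrt_f32/vsqrtq_f32 (and the f16
// variants where available) on AArch64.
#ifdef __aarch64__
#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>

static float32x4_t l2_norm_step_example(float32x4_t sum_of_squares)
{
    // Resolves to vsqrtq_f32 for float32x4_t.
    return arm_compute::wrapper::vsqrt(sum_of_squares);
}
#endif // __aarch64__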
} +inline int32_t sub_sat(const int32_t &a, const int32_t &b) +{ + const int32x2_t va = { a, 0 }; + const int32x2_t vb = { b, 0 }; + return vget_lane_s32(vqsub_s32(va, vb), 0); +} + inline float sub_sat(const float &a, const float &b) { // No notion of saturation exists in floating point diff --git a/arm_compute/core/NEON/wrapper/traits.h b/src/core/NEON/wrapper/traits.h similarity index 100% rename from arm_compute/core/NEON/wrapper/traits.h rename to src/core/NEON/wrapper/traits.h diff --git a/arm_compute/core/NEON/wrapper/wrapper.h b/src/core/NEON/wrapper/wrapper.h similarity index 85% rename from arm_compute/core/NEON/wrapper/wrapper.h rename to src/core/NEON/wrapper/wrapper.h index e0c290887b..e5467e98ff 100644 --- a/arm_compute/core/NEON/wrapper/wrapper.h +++ b/src/core/NEON/wrapper/wrapper.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,10 +25,10 @@ #define ARM_COMPUTE_WRAPPER_H // Traits -#include "arm_compute/core/NEON/wrapper/traits.h" +#include "src/core/NEON/wrapper/traits.h" // Intrinsics Overloads -#include "arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h" -#include "arm_compute/core/NEON/wrapper/scalar/scalar.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/core/NEON/wrapper/scalar/scalar.h" #endif /* ARM_COMPUTE_WRAPPER_H */ diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp index 0971d2aa73..414c128a27 100644 --- a/src/core/TensorInfo.cpp +++ b/src/core/TensorInfo.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/Utils.h" #include "support/MemorySupport.h" using namespace arm_compute; @@ -268,7 +269,7 @@ std::tuple TensorInfo::calculate_padding_requirements(c const unsigned int idx_last_dimension = _tensor_shape.num_dimensions() - 1; - required_total_size = _tensor_shape[idx_last_dimension] * required_strides[idx_last_dimension]; + required_total_size = static_cast(_tensor_shape[idx_last_dimension]) * required_strides[idx_last_dimension]; break; } } @@ -360,7 +361,7 @@ ITensorInfo &TensorInfo::set_tensor_shape(const TensorShape &shape) else { const unsigned int idx_last_dimension = _tensor_shape.num_dimensions() - 1; - _total_size = _tensor_shape[idx_last_dimension] * _strides_in_bytes[idx_last_dimension]; + _total_size = static_cast(_tensor_shape[idx_last_dimension]) * _strides_in_bytes[idx_last_dimension]; } std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) = calculate_padding_requirements(_padding); diff --git a/src/core/TracePoint.cpp b/src/core/TracePoint.cpp index 06d9527486..d67faad868 100644 --- a/src/core/TracePoint.cpp +++ b/src/core/TracePoint.cpp @@ -33,12 +33,12 @@ #include "arm_compute/core/IPyramid.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Window.h" #include "arm_compute/runtime/FunctionDescriptors.h" #include "arm_compute/runtime/IWeightsManager.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "src/core/NEON/kernels/assembly/arm_gemm.hpp" #include "utils/TypePrinter.h" #include diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp index cec7a1b4ba..babf1c4b91 100644 --- a/src/core/Utils.cpp +++ b/src/core/Utils.cpp @@ -33,9 +33,9 @@ #include #include -using namespace arm_compute; - -std::string 
arm_compute::read_file(const std::string &filename, bool binary) +namespace arm_compute +{ +std::string read_file(const std::string &filename, bool binary) { std::string out; std::ifstream fs; @@ -73,7 +73,7 @@ std::string arm_compute::read_file(const std::string &filename, bool binary) return out; } -const std::string &arm_compute::string_from_format(Format format) +const std::string &string_from_format(Format format) { static std::map<Format, const std::string> formats_map = { @@ -99,7 +99,7 @@ const std::string &arm_compute::string_from_format(Format format) return formats_map[format]; } -const std::string &arm_compute::string_from_channel(Channel channel) +const std::string &string_from_channel(Channel channel) { static std::map<Channel, const std::string> channels_map = { @@ -120,7 +120,7 @@ const std::string &arm_compute::string_from_channel(Channel channel) return channels_map[channel]; } -const std::string &arm_compute::string_from_data_layout(DataLayout dl) +const std::string &string_from_data_layout(DataLayout dl) { static std::map<DataLayout, const std::string> dl_map = { @@ -132,7 +132,7 @@ const std::string &arm_compute::string_from_data_layout(DataLayout dl) return dl_map[dl]; } -const std::string &arm_compute::string_from_data_type(DataType dt) +const std::string &string_from_data_type(DataType dt) { static std::map<DataType, const std::string> dt_map = { @@ -160,7 +160,7 @@ const std::string &arm_compute::string_from_data_type(DataType dt) return dt_map[dt]; } -const std::string &arm_compute::string_from_activation_func(ActivationLayerInfo::ActivationFunction act) +const std::string &string_from_activation_func(ActivationLayerInfo::ActivationFunction act) { static std::map<ActivationLayerInfo::ActivationFunction, const std::string> act_map = { @@ -184,7 +184,7 @@ const std::string &arm_compute::string_from_activation_func(ActivationLayerInfo: return act_map[act]; } -const std::string &arm_compute::string_from_matrix_pattern(MatrixPattern pattern) +const std::string &string_from_matrix_pattern(MatrixPattern pattern) { static std::map<MatrixPattern, const std::string> pattern_map = { @@ -197,7 +197,7 @@ const std::string &arm_compute::string_from_matrix_pattern(MatrixPattern pattern return pattern_map[pattern]; } -const std::string &arm_compute::string_from_non_linear_filter_function(NonLinearFilterFunction function) +const std::string &string_from_non_linear_filter_function(NonLinearFilterFunction function) { static std::map<NonLinearFilterFunction, const std::string> func_map = { @@ -209,7 +209,7 @@ const std::string &arm_compute::string_from_non_linear_filter_function(NonLinear return func_map[function]; } -const std::string &arm_compute::string_from_interpolation_policy(InterpolationPolicy policy) +const std::string &string_from_interpolation_policy(InterpolationPolicy policy) { static std::map<InterpolationPolicy, const std::string> interpolation_policy_map = { @@ -221,7 +221,7 @@ const std::string &arm_compute::string_from_interpolation_policy(InterpolationPo return interpolation_policy_map[policy]; } -const std::string &arm_compute::string_from_border_mode(BorderMode border_mode) +const std::string &string_from_border_mode(BorderMode border_mode) { static std::map<BorderMode, const std::string> border_mode_map = { @@ -233,7 +233,7 @@ const std::string &arm_compute::string_from_border_mode(BorderMode border_mode) return border_mode_map[border_mode]; } -const std::string &arm_compute::string_from_norm_type(NormType type) +const std::string &string_from_norm_type(NormType type) { static std::map<NormType, const std::string> norm_type_map = { @@ -245,7 +245,7 @@ const std::string &arm_compute::string_from_norm_type(NormType type) return norm_type_map[type]; } -const std::string &arm_compute::string_from_pooling_type(PoolingType type) +const std::string &string_from_pooling_type(PoolingType type) { static std::map<PoolingType, const std::string>
pool_type_map = { @@ -257,7 +257,7 @@ const std::string &arm_compute::string_from_pooling_type(PoolingType type) return pool_type_map[type]; } -const std::string &arm_compute::string_from_gemmlowp_output_stage(GEMMLowpOutputStageType output_stage) +const std::string &string_from_gemmlowp_output_stage(GEMMLowpOutputStageType output_stage) { static std::map<GEMMLowpOutputStageType, const std::string> output_stage_map = { @@ -270,7 +270,7 @@ const std::string &arm_compute::string_from_gemmlowp_output_stage(GEMMLowpOutput return output_stage_map[output_stage]; } -std::string arm_compute::string_from_pixel_value(const PixelValue &value, const DataType data_type) +std::string string_from_pixel_value(const PixelValue &value, const DataType data_type) { std::stringstream ss; std::string converted_string; @@ -323,21 +323,45 @@ std::string arm_compute::string_from_pixel_value(const PixelValue &value, const return converted_string; } -std::string arm_compute::lower_string(const std::string &val) +DataType data_type_from_name(const std::string &name) +{ + static const std::map<std::string, DataType> data_types = + { + { "f16", DataType::F16 }, + { "f32", DataType::F32 }, + { "qasymm8", DataType::QASYMM8 }, + }; + +#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED + try + { +#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ + return data_types.at(utility::tolower(name)); + +#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED + } + catch(const std::out_of_range &) + { + ARM_COMPUTE_ERROR_VAR("Invalid data type name: %s", name.c_str()); + } +#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ +} + +std::string lower_string(const std::string &val) { std::string res = val; std::transform(res.begin(), res.end(), res.begin(), ::tolower); return res; } -PadStrideInfo arm_compute::calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info, DataLayout data_layout, const Size2D &dilation, - const DimensionRoundingType &rounding_type) +PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info, DataLayout data_layout, const Size2D &dilation, + const DimensionRoundingType &rounding_type) { const auto &strides = conv_info.stride(); ARM_COMPUTE_ERROR_ON_MSG((strides.first < 1 || strides.second < 1), "Stride values should be greater than or equal to 1."); - const unsigned int width_idx = arm_compute::get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = arm_compute::get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); const unsigned int in_width = input_shape[width_idx]; const unsigned int in_height = input_shape[height_idx]; const unsigned int kernel_width = weights_shape[width_idx]; @@ -372,9 +396,9 @@ PadStrideInfo arm_compute::calculate_same_pad(TensorShape input_shape, TensorSha return same_info; } -std::pair<unsigned int, unsigned int> arm_compute::deconvolution_output_dimensions(unsigned int in_width, unsigned int in_height, - unsigned int kernel_width, unsigned int kernel_height, - const PadStrideInfo &pad_stride_info) +std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned int in_width, unsigned int in_height, + unsigned int kernel_width, unsigned int kernel_height, + const PadStrideInfo &pad_stride_info) { const unsigned int pad_left = pad_stride_info.pad_left(); const unsigned int pad_top = pad_stride_info.pad_top(); @@ -392,10 +416,10 @@ std::pair<unsigned int, unsigned int> 
arm_compute::deconvolution_output_dimensio return std::make_pair(w, h); } -std::pair<unsigned int, unsigned int> arm_compute::scaled_dimensions(int width, int height, - int kernel_width, int kernel_height, - const PadStrideInfo &pad_stride_info, - const Size2D &dilation) +std::pair<unsigned int, unsigned int> scaled_dimensions(int width, int height, + int kernel_width, int kernel_height, + const PadStrideInfo &pad_stride_info, + const Size2D &dilation) { const int dilation_x = dilation.x(); const int dilation_y = dilation.y(); @@ -426,7 +450,7 @@ std::pair<unsigned int, unsigned int> arm_compute::scaled_dimensions(int width, return std::make_pair(static_cast<unsigned int>(w), static_cast<unsigned int>(h)); } -bool arm_compute::needs_serialized_reduction(ReductionOperation op, DataType dt, unsigned int axis) +bool needs_serialized_reduction(ReductionOperation op, DataType dt, unsigned int axis) { const bool is_min_max = (op == ReductionOperation::MAX || op == ReductionOperation::MIN); const bool is_quantized_type = is_data_type_quantized(dt); @@ -435,7 +459,7 @@ bool arm_compute::needs_serialized_reduction(ReductionOperation op, DataType dt, return !is_first_dim || is_min_max || is_quantized_type; } -QuantizationInfo arm_compute::get_softmax_output_quantization_info(DataType input_type, bool is_log) +QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool is_log) { // Note: Output quantization info for softmax should always have // * Softmax with QASYMM8: scale = 1/256, offset = 0 @@ -456,7 +480,7 @@ QuantizationInfo arm_compute::get_softmax_output_quantization_info(DataType inpu return QuantizationInfo(1.f / 256, 0); } -std::pair<int32_t, int32_t> arm_compute::get_quantized_activation_min_max(ActivationLayerInfo act_info, DataType data_type, UniformQuantizationInfo oq_info) +std::pair<int32_t, int32_t> get_quantized_activation_min_max(ActivationLayerInfo act_info, DataType data_type, UniformQuantizationInfo oq_info) { const bool is_qasymm8_signed = is_data_type_quantized_asymmetric_signed(data_type); const auto a = act_info.a(); @@ -471,8 +495,47 @@ std::pair<int32_t, int32_t> arm_compute::get_quantized_activation_min_max(Activa return std::make_pair(min_activation, max_activation); } +std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initializer_list<const ITensor *> tensors) +{ + std::unordered_map<const ITensorInfo *, PaddingSize> res; + + for(const ITensor *tensor : tensors) + { + if(tensor) + { + res.insert({ tensor->info(), tensor->info()->padding() }); + } + } + + return res; +} + +std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initializer_list<const ITensorInfo *> infos) +{ + std::unordered_map<const ITensorInfo *, PaddingSize> res; + + for(const ITensorInfo *info : infos) + { + if(info) + { + res.insert({ info, info->padding() }); + } + } + + return res; +} + +bool has_padding_changed(const std::unordered_map<const ITensorInfo *, PaddingSize> &padding_map) +{ + return std::find_if(padding_map.begin(), padding_map.end(), [](const std::pair<const ITensorInfo *, PaddingSize> &padding_info) + { + return (padding_info.first->padding() != padding_info.second); + }) + != padding_map.end(); +} + #ifdef ARM_COMPUTE_ASSERTS_ENABLED -void arm_compute::print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n, int stream_width, const std::string &element_delim) +void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n, int stream_width, const std::string &element_delim) { switch(dt) { @@ -514,7 +577,7 @@ void arm_compute::print_consecutive_elements(std::ostream &s, DataType dt, const } } -int arm_compute::max_consecutive_elements_display_width(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n) +int max_consecutive_elements_display_width(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n) { switch(dt) { @@ -548,3 
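// Usage sketch for the padding-tracking helpers added above (a hypothetical
// caller, assuming get_padding_info/has_padding_changed are exposed through
// arm_compute/core/Utils.h as in this patch): configure-time code snapshots
// tensor padding, runs a configuration step, then asserts that no implicit
// padding was introduced.
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Utils.h"

static void configure_padding_free_example(arm_compute::ITensorInfo *src, arm_compute::ITensorInfo *dst)
{
    auto padding_map = arm_compute::get_padding_info({ src, dst });
    // ... configure a padding-free kernel on src/dst here ...
    ARM_COMPUTE_ERROR_ON(arm_compute::has_padding_changed(padding_map));
}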
+611,5 @@ int arm_compute::max_consecutive_elements_display_width(std::ostream &s, DataTyp return 0; } #endif /* ARM_COMPUTE_ASSERTS_ENABLED */ + +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/common/Registrars.h b/src/core/common/Registrars.h new file mode 100644 index 0000000000..dcea3e8d38 --- /dev/null +++ b/src/core/common/Registrars.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_COMMON_REGISTRARS_H +#define SRC_CORE_COMMON_REGISTRARS_H + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +#define REGISTER_FP16_NEON(func_name) &(func_name) +#else /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ +#define REGISTER_FP16_NEON(func_name) nullptr +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ + +#if defined(ENABLE_FP32_KERNELS) +#define REGISTER_FP32_NEON(func_name) &(func_name) +#else /* defined(ENABLE_FP32_KERNELS) */ +#define REGISTER_FP32_NEON(func_name) nullptr +#endif /* defined(ENABLE_FP32_KERNELS) */ + +#if defined(ENABLE_QASYMM8_SIGNED_KERNELS) +#define REGISTER_QASYMM8_SIGNED_NEON(func_name) &(func_name) +#else /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */ +#define REGISTER_QASYMM8_SIGNED_NEON(func_name) nullptr +#endif /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */ + +#if defined(ENABLE_QASYMM8_KERNELS) +#define REGISTER_QASYMM8_NEON(func_name) &(func_name) +#else /* defined(ENABLE_QASYMM8_KERNELS) */ +#define REGISTER_QASYMM8_NEON(func_name) nullptr +#endif /* defined(ENABLE_QASYMM8_KERNELS) */ + +#if defined(ENABLE_QSYMM16_KERNELS) +#define REGISTER_QSYMM16_NEON(func_name) &(func_name) +#else /* defined(ENABLE_QSYMM16_KERNELS) */ +#define REGISTER_QSYMM16_NEON(func_name) nullptr +#endif /* defined(ENABLE_QSYMM16_KERNELS) */ + +#endif /* SRC_CORE_COMMON_REGISTRARS_H */ diff --git a/src/core/common/StdTypes.h b/src/core/common/StdTypes.h new file mode 100644 index 0000000000..3fba6187a3 --- /dev/null +++ b/src/core/common/StdTypes.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 Arm Limited. 
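// Usage sketch for the registrar macros above (illustrative, not part of the
// patch): a kernel can build a static table of candidate implementations in
// which paths disabled at build time compile to nullptr and are skipped at
// selection time. The table type below is hypothetical; the floor kernel uses
// the same pattern.
#include "src/core/NEON/kernels/floor/impl/list.h"
#include "src/core/common/Registrars.h"

namespace
{
struct FloorUKernelExample
{
    const char *name;
    void (*ukernel)(const void *, void *, int);
};

const FloorUKernelExample floor_kernels_example[] =
{
    // Entries whose registrar expands to nullptr are skipped by the selector.
    { "fp16_neon_floor", REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_floor) },
    { "fp32_neon_floor", REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_floor) },
};
} // namespace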
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_COMMON_STDTYPES_H +#define SRC_CORE_COMMON_STDTYPES_H + +#include <cstdint> + +namespace arm_compute +{ +using u8 = uint8_t; +using s8 = int8_t; +using u16 = uint16_t; +using s16 = int16_t; +using u32 = uint32_t; +using s32 = int32_t; +using f32 = float; +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +using f16 = __fp16; +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ +} // namespace arm_compute + +#endif /* SRC_CORE_COMMON_STDTYPES_H */ diff --git a/src/core/common/Validate.h b/src/core/common/Validate.h new file mode 100644 index 0000000000..fa24bf5fa7 --- /dev/null +++ b/src/core/common/Validate.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
diff --git a/src/core/common/Validate.h b/src/core/common/Validate.h
new file mode 100644
index 0000000000..fa24bf5fa7
--- /dev/null
+++ b/src/core/common/Validate.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef SRC_CORE_COMMON_VALIDATE_H
+#define SRC_CORE_COMMON_VALIDATE_H
+
+#if defined(ARM_COMPUTE_ASSERTS_ENABLED)
+
+#include <cassert>
+
+#define ARM_COMPUTE_ASSERT(cond) assert(cond)
+#define ARM_COMPUTE_ASSERT_NOT_NULLPTR(ptr) assert((ptr) != nullptr)
+
+#else /* defined(ARM_COMPUTE_ASSERTS_ENABLED) */
+
+#define ARM_COMPUTE_ASSERT(cond)
+#define ARM_COMPUTE_ASSERT_NOT_NULLPTR(ptr)
+
+#endif /* defined(ARM_COMPUTE_ASSERTS_ENABLED) */
+#endif /* SRC_CORE_COMMON_VALIDATE_H */
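With ARM_COMPUTE_ASSERTS_ENABLED defined these macros call assert(); without it both expand to nothing, so checks can stay in hot paths at zero release-build cost. A small illustrative sketch (the helper function is hypothetical, not library code):

#include "src/core/common/Validate.h"

// Illustrative only: the checks run in asserts-enabled builds and vanish
// entirely otherwise.
static float first_element(const float *data, int size)
{
    ARM_COMPUTE_ASSERT_NOT_NULLPTR(data);
    ARM_COMPUTE_ASSERT(size > 0);
    return data[0];
}

int main()
{
    const float values[] = { 3.f, 5.f };
    return first_element(values, 2) == 3.f ? 0 : 1;
}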
diff --git a/src/core/helpers/AutoConfiguration.h b/src/core/helpers/AutoConfiguration.h
new file mode 100644
index 0000000000..6880a6cb66
--- /dev/null
+++ b/src/core/helpers/AutoConfiguration.h
@@ -0,0 +1,176 @@
+/*
+* Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_HELPERS_AUTOCONFIGURATION_H
+#define SRC_CORE_HELPERS_AUTOCONFIGURATION_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+/** Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty.
+ *
+ * @param[in,out] info              Tensor info used to check and assign.
+ * @param[in]     shape             New shape.
+ * @param[in]     num_channels      New number of channels.
+ * @param[in]     data_type         New data type.
+ * @param[in]     quantization_info (Optional) New quantization info.
+ *
+ * @return True if the tensor info has been initialized
+ */
+inline bool auto_init_if_empty(ITensorInfo       &info,
+                               const TensorShape &shape,
+                               int num_channels, DataType data_type,
+                               QuantizationInfo quantization_info = QuantizationInfo())
+{
+    if(info.tensor_shape().total_size() == 0)
+    {
+        info.set_data_type(data_type);
+        info.set_num_channels(num_channels);
+        info.set_tensor_shape(shape);
+        info.set_quantization_info(quantization_info);
+        return true;
+    }
+
+    return false;
+}
+
+/** Auto initialize the tensor info using another tensor info.
+*
+* @param info_sink   Tensor info used to check and assign
+* @param info_source Tensor info used to assign
+*
+* @return True if the tensor info has been initialized
+*/
+inline bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_source)
+{
+    if(info_sink.tensor_shape().total_size() == 0)
+    {
+        info_sink.set_data_type(info_source.data_type());
+        info_sink.set_num_channels(info_source.num_channels());
+        info_sink.set_tensor_shape(info_source.tensor_shape());
+        info_sink.set_quantization_info(info_source.quantization_info());
+        info_sink.set_data_layout(info_source.data_layout());
+        return true;
+    }
+
+    return false;
+}
+
+/** Set the shape to the specified value if the current assignment is empty.
+ *
+ * @param[in,out] info  Tensor info used to check and assign.
+ * @param[in]     shape New shape.
+ *
+ * @return True if the shape has been changed.
+ */
+inline bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape)
+{
+    if(info.tensor_shape().total_size() == 0)
+    {
+        info.set_tensor_shape(shape);
+        return true;
+    }
+
+    return false;
+}
+
+/** Set the format, data type and number of channels to the specified value if
+ * the current data type is unknown.
+ *
+ * @param[in,out] info   Tensor info used to check and assign.
+ * @param[in]     format New format.
+ *
+ * @return True if the format has been changed.
+ */
+inline bool set_format_if_unknown(ITensorInfo &info, Format format)
+{
+    if(info.data_type() == DataType::UNKNOWN)
+    {
+        info.set_format(format);
+        return true;
+    }
+
+    return false;
+}
+
+/** Set the data type and number of channels to the specified value if
+ * the current data type is unknown.
+ *
+ * @param[in,out] info      Tensor info used to check and assign.
+ * @param[in]     data_type New data type.
+ *
+ * @return True if the data type has been changed.
+ */
+inline bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type)
+{
+    if(info.data_type() == DataType::UNKNOWN)
+    {
+        info.set_data_type(data_type);
+        return true;
+    }
+
+    return false;
+}
+
+/** Set the data layout to the specified value if
+ * the current data layout is unknown.
+ *
+ * @param[in,out] info        Tensor info used to check and assign.
+ * @param[in]     data_layout New data layout.
+ *
+ * @return True if the data layout has been changed.
+ */
+inline bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout)
+{
+    if(info.data_layout() == DataLayout::UNKNOWN)
+    {
+        info.set_data_layout(data_layout);
+        return true;
+    }
+
+    return false;
+}
+
+/** Set the quantization info to the specified value if
+ * the current quantization info is empty and the data type is of asymmetric quantized type.
+ *
+ * @param[in,out] info              Tensor info used to check and assign.
+ * @param[in]     quantization_info Quantization info.
+ *
+ * @return True if the quantization info has been changed.
+ */
+inline bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo quantization_info)
+{
+    if(info.quantization_info().empty() && (is_data_type_quantized_asymmetric(info.data_type())))
+    {
+        info.set_quantization_info(quantization_info);
+        return true;
+    }
+
+    return false;
+}
+} // namespace arm_compute
+
+#endif /* SRC_CORE_HELPERS_AUTOCONFIGURATION_H */
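auto_init_if_empty is how a function's configure() step propagates metadata to not-yet-configured outputs: an empty destination inherits everything from the source, while an already-configured one is left untouched so explicit settings win. A rough usage sketch (the shapes are arbitrary):

#include "arm_compute/core/TensorInfo.h"
#include "src/core/helpers/AutoConfiguration.h"

using namespace arm_compute;

int main()
{
    TensorInfo src(TensorShape(16U, 4U), 1, DataType::F32);
    TensorInfo dst; // empty: total_size() == 0, so it is eligible for auto-init

    // Copies shape, channels, data type, quantization info and layout from
    // src into dst; returns false and changes nothing if dst was configured.
    const bool initialized = auto_init_if_empty(dst, src);
    return initialized ? 0 : 1;
}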
diff --git a/src/core/helpers/NormalizationHelpers.h b/src/core/helpers/NormalizationHelpers.h
new file mode 100644
index 0000000000..d94d5e3602
--- /dev/null
+++ b/src/core/helpers/NormalizationHelpers.h
@@ -0,0 +1,47 @@
+/*
+* Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_HELPERS_NORMALIZATIONHELPERS_H
+#define SRC_CORE_HELPERS_NORMALIZATIONHELPERS_H
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+/** Calculate the normalization dimension index for a given normalization type
+ *
+ * @param[in] layout Data layout of the input and output tensor
+ * @param[in] info   Normalization info
+ *
+ * @return Normalization dimension index
+ */
+inline unsigned int get_normalization_dimension_index(DataLayout layout, const NormalizationLayerInfo &info)
+{
+    const unsigned int width_idx   = get_data_layout_dimension_index(layout, DataLayoutDimension::WIDTH);
+    const unsigned int channel_idx = get_data_layout_dimension_index(layout, DataLayoutDimension::CHANNEL);
+
+    return info.is_in_map() ? width_idx : channel_idx;
+}
+} // namespace arm_compute
+#endif /* SRC_CORE_HELPERS_NORMALIZATIONHELPERS_H */
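IN_MAP normalization windows run along the width axis while CROSS_MAP windows run across channels, so the helper is just a layout-aware choice between two dimension indices. A small sketch of what it returns (assuming default NormalizationLayerInfo parameters beyond the type and size):

#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"
#include "src/core/helpers/NormalizationHelpers.h"

using namespace arm_compute;

int main()
{
    // On NCHW, WIDTH is dimension 0 and CHANNEL is dimension 2.
    const NormalizationLayerInfo in_map(NormType::IN_MAP_1D, 5);
    const NormalizationLayerInfo cross(NormType::CROSS_MAP, 5);

    const unsigned int w = get_normalization_dimension_index(DataLayout::NCHW, in_map); // 0
    const unsigned int c = get_normalization_dimension_index(DataLayout::NCHW, cross);  // 2
    return (w == 0U && c == 2U) ? 0 : 1;
}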
diff --git a/src/core/helpers/ScaleHelpers.h b/src/core/helpers/ScaleHelpers.h
new file mode 100644
index 0000000000..827bbef4cd
--- /dev/null
+++ b/src/core/helpers/ScaleHelpers.h
@@ -0,0 +1,331 @@
+/*
+* Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_HELPERS_SCALEHELPERS_H
+#define SRC_CORE_HELPERS_SCALEHELPERS_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/QuantizationInfo.h"
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <numeric>
+
+namespace arm_compute
+{
+namespace scale_helpers
+{
+/** Computes bilinear interpolation using the pointer to the top-left pixel and the pixel's distance between
+ * the real coordinates and the smallest following integer coordinates. Input must be in single channel format.
+ *
+ * @param[in] pixel_ptr Pointer to the top-left pixel value of a single channel input.
+ * @param[in] stride    Stride to access the bottom-left and bottom-right pixel values
+ * @param[in] dx        Pixel's distance between the X real coordinate and the smallest X following integer
+ * @param[in] dy        Pixel's distance between the Y real coordinate and the smallest Y following integer
+ *
+ * @note dx and dy must be in the range [0, 1.0]
+ *
+ * @return The bilinear interpolated pixel value
+ */
+template <typename T>
+inline T delta_bilinear_c1(const T *pixel_ptr, size_t stride, float dx, float dy)
+{
+    ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
+
+    const float dx1 = 1.0f - dx;
+    const float dy1 = 1.0f - dy;
+
+    const T a00 = *pixel_ptr;
+    const T a01 = *(pixel_ptr + 1);
+    const T a10 = *(pixel_ptr + stride);
+    const T a11 = *(pixel_ptr + stride + 1);
+
+    const float w1 = dx1 * dy1;
+    const float w2 = dx * dy1;
+    const float w3 = dx1 * dy;
+    const float w4 = dx * dy;
+
+    return static_cast<T>(a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4);
+}
+
+/** Computes bilinear interpolation for quantized input and output, using the pointer to the top-left pixel and the pixel's distance between
+ * the real coordinates and the smallest following integer coordinates. Input must be QASYMM8 and in single channel format.
+ *
+ * @param[in] pixel_ptr Pointer to the top-left pixel value of a single channel input.
+ * @param[in] stride    Stride to access the bottom-left and bottom-right pixel values
+ * @param[in] dx        Pixel's distance between the X real coordinate and the smallest X following integer
+ * @param[in] dy        Pixel's distance between the Y real coordinate and the smallest Y following integer
+ * @param[in] iq_info   Input QuantizationInfo
+ * @param[in] oq_info   Output QuantizationInfo
+ *
+ * @note dx and dy must be in the range [0, 1.0]
+ *
+ * @return The bilinear interpolated pixel value
+ */
+inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr, size_t stride, float dx, float dy,
+                                           UniformQuantizationInfo iq_info, UniformQuantizationInfo oq_info)
+{
+    ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
+
+    const float dx1 = 1.0f - dx;
+    const float dy1 = 1.0f - dy;
+
+    const float a00 = dequantize_qasymm8(*pixel_ptr, iq_info);
+    const float a01 = dequantize_qasymm8(*(pixel_ptr + 1), iq_info);
+    const float a10 = dequantize_qasymm8(*(pixel_ptr + stride), iq_info);
+    const float a11 = dequantize_qasymm8(*(pixel_ptr + stride + 1), iq_info);
+
+    const float w1  = dx1 * dy1;
+    const float w2  = dx * dy1;
+    const float w3  = dx1 * dy;
+    const float w4  = dx * dy;
+    float       res = a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4;
+    return static_cast<uint8_t>(quantize_qasymm8(res, oq_info));
+}
+
+/** Computes bilinear interpolation for quantized input and output, using the pointer to the top-left pixel and the pixel's distance between
+ * the real coordinates and the smallest following integer coordinates. Input must be QASYMM8_SIGNED and in single channel format.
+ *
+ * @param[in] pixel_ptr Pointer to the top-left pixel value of a single channel input.
+ * @param[in] stride    Stride to access the bottom-left and bottom-right pixel values
+ * @param[in] dx        Pixel's distance between the X real coordinate and the smallest X following integer
+ * @param[in] dy        Pixel's distance between the Y real coordinate and the smallest Y following integer
+ * @param[in] iq_info   Input QuantizationInfo
+ * @param[in] oq_info   Output QuantizationInfo
+ *
+ * @note dx and dy must be in the range [0, 1.0]
+ *
+ * @return The bilinear interpolated pixel value
+ */
+inline int8_t delta_bilinear_c1_quantized(const int8_t *pixel_ptr, size_t stride, float dx, float dy,
+                                          UniformQuantizationInfo iq_info, UniformQuantizationInfo oq_info)
+{
+    ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
+
+    const float dx1 = 1.0f - dx;
+    const float dy1 = 1.0f - dy;
+
+    const float a00 = dequantize_qasymm8_signed(*pixel_ptr, iq_info);
+    const float a01 = dequantize_qasymm8_signed(*(pixel_ptr + 1), iq_info);
+    const float a10 = dequantize_qasymm8_signed(*(pixel_ptr + stride), iq_info);
+    const float a11 = dequantize_qasymm8_signed(*(pixel_ptr + stride + 1), iq_info);
+
+    const float w1  = dx1 * dy1;
+    const float w2  = dx * dy1;
+    const float w3  = dx1 * dy;
+    const float w4  = dx * dy;
+    float       res = a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4;
+    return static_cast<int8_t>(quantize_qasymm8_signed(res, oq_info));
+}
+
+/** Computes linear interpolation using the pointer to the top pixel and the pixel's distance between
+ * the real coordinates and the smallest following integer coordinates. Input must be in single channel format.
+ *
+ * @param[in] pixel_ptr Pointer to the top pixel value of a single channel input.
+ * @param[in] stride    Stride to access the bottom pixel value
+ * @param[in] dy        Pixel's distance between the Y real coordinate and the smallest Y following integer
+ *
+ * @note dy must be in the range [0, 1.0]
+ *
+ * @return The linear interpolated pixel value
+ */
+template <typename T>
+inline T delta_linear_c1_y(const T *pixel_ptr, size_t stride, float dy)
+{
+    ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
+
+    const float dy1 = 1.0f - dy;
+
+    const T a00 = *pixel_ptr;
+    const T a10 = *(pixel_ptr + stride);
+
+    const float w1 = dy1;
+    const float w3 = dy;
+
+    return static_cast<T>(a00 * w1 + a10 * w3);
+}
+
+/** Computes linear interpolation using the pointer to the left pixel and the pixel's distance between
+ * the real coordinates and the smallest following integer coordinates. Input must be in single channel format.
+ *
+ * @param[in] pixel_ptr Pointer to the left pixel value of a single channel input.
+ * @param[in] dx        Pixel's distance between the X real coordinate and the smallest X following integer
+ *
+ * @note dx must be in the range [0, 1.0]
+ *
+ * @return The linear interpolated pixel value
+ */
+template <typename T>
+inline T delta_linear_c1_x(const T *pixel_ptr, float dx)
+{
+    ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
+
+    const T a00 = *pixel_ptr;
+    const T a01 = *(pixel_ptr + 1);
+
+    const float dx1 = 1.0f - dx;
+
+    const float w1 = dx1;
+    const float w2 = dx;
+
+    return static_cast<T>(a00 * w1 + a01 * w2);
+}
+
+/** Return the pixel at (x,y) using bilinear interpolation.
+ *
+ * @warning Only works if the iterator was created with an IImage
+ *
+ * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel input.
+ * @param[in] stride          Stride in bytes of the image.
+ * @param[in] x               X position of the wanted pixel
+ * @param[in] y               Y position of the wanted pixel
+ *
+ * @return The pixel at (x, y) using bilinear interpolation.
+ */
+template <typename T>
+inline T pixel_bilinear_c1(const T *first_pixel_ptr, size_t stride, float x, float y)
+{
+    ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
+
+    const int32_t xi = std::floor(x);
+    const int32_t yi = std::floor(y);
+
+    const float dx = x - xi;
+    const float dy = y - yi;
+
+    return delta_bilinear_c1(first_pixel_ptr + xi + yi * stride, stride, dx, dy);
+}
+
+/** Return the pixel at (x,y) using bilinear interpolation by clamping when out of borders. The image must be single channel input
+ *
+ * @warning Only works if the iterator was created with an IImage
+ *
+ * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel image.
+ * @param[in] stride          Stride in bytes of the image
+ * @param[in] width           Width of the image
+ * @param[in] height          Height of the image
+ * @param[in] x               X position of the wanted pixel
+ * @param[in] y               Y position of the wanted pixel
+ *
+ * @return The pixel at (x, y) using bilinear interpolation.
+ */
+template <typename T>
+inline uint8_t
+pixel_bilinear_c1_clamp(const T *first_pixel_ptr, size_t stride, size_t width, size_t height, float x, float y)
+{
+    ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
+
+    x = std::max(-1.f, std::min(x, static_cast<float>(width)));
+    y = std::max(-1.f, std::min(y, static_cast<float>(height)));
+
+    const float xi = std::floor(x);
+    const float yi = std::floor(y);
+
+    const float dx = x - xi;
+    const float dy = y - yi;
+
+    if(dx == 0.0f)
+    {
+        if(dy == 0.0f)
+        {
+            return static_cast<T>(first_pixel_ptr[static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride]);
+        }
+        return delta_linear_c1_y(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride,
+                                 stride, dy);
+    }
+    if(dy == 0.0f)
+    {
+        return delta_linear_c1_x(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride,
+                                 dx);
+    }
+    return delta_bilinear_c1(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride, stride,
+                             dx, dy);
+}
+
+/** Return the pixel at (x,y) using area interpolation by clamping when out of borders. The image must be single channel U8
+ *
+ * @note The interpolation area depends on the width and height ration of the input and output images
+ * @note Currently average of the contributing pixels is calculated
+ *
+ * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel U8 image.
+ * @param[in] stride          Stride in bytes of the image
+ * @param[in] width           Width of the image
+ * @param[in] height          Height of the image
+ * @param[in] wr              Width ratio among the input image width and output image width.
+ * @param[in] hr              Height ratio among the input image height and output image height.
+ * @param[in] x               X position of the wanted pixel
+ * @param[in] y               Y position of the wanted pixel
+ *
+ * @return The pixel at (x, y) using area interpolation.
+ */
+inline uint8_t
+pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr,
+                      float hr, int x, int y)
+{
+    ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
+
+    // Calculate sampling position
+    float in_x = (x + 0.5f) * wr - 0.5f;
+    float in_y = (y + 0.5f) * hr - 0.5f;
+
+    // Get bounding box offsets
+    int x_from = std::floor(x * wr - 0.5f - in_x);
+    int y_from = std::floor(y * hr - 0.5f - in_y);
+    int x_to   = std::ceil((x + 1) * wr - 0.5f - in_x);
+    int y_to   = std::ceil((y + 1) * hr - 0.5f - in_y);
+
+    // Clamp position to borders
+    in_x = std::max(-1.f, std::min(in_x, static_cast<float>(width)));
+    in_y = std::max(-1.f, std::min(in_y, static_cast<float>(height)));
+
+    // Clamp bounding box offsets to borders
+    x_from = ((in_x + x_from) < -1) ? -1 : x_from;
+    y_from = ((in_y + y_from) < -1) ? -1 : y_from;
+    x_to   = ((in_x + x_to) > width) ? (width - in_x) : x_to;
+    y_to   = ((in_y + y_to) > height) ? (height - in_y) : y_to;
+
+    // Get pixel index
+    const int xi = std::floor(in_x);
+    const int yi = std::floor(in_y);
+
+    // Bounding box elements in each dimension
+    const int x_elements = (x_to - x_from + 1);
+    const int y_elements = (y_to - y_from + 1);
+    ARM_COMPUTE_ERROR_ON(x_elements == 0 || y_elements == 0);
+
+    // Sum pixels in area
+    int sum = 0;
+    for(int j = yi + y_from, je = yi + y_to; j <= je; ++j)
+    {
+        const uint8_t *ptr = first_pixel_ptr + j * stride + xi + x_from;
+        sum                = std::accumulate(ptr, ptr + x_elements, sum);
+    }
+
+    // Return average
+    return sum / (x_elements * y_elements);
+}
+} // namespace scale_helpers
+} // namespace arm_compute
+
+#endif /* SRC_CORE_HELPERS_SCALEHELPERS_H */
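The four weights in delta_bilinear_c1 are the usual bilinear factors: each neighbour is weighted by how close the sample point is to it in x and y, and the weights sum to 1. A standalone check of the arithmetic, independent of the library types:

#include <cstdio>

// Standalone illustration of the weighting used by delta_bilinear_c1:
// value = a00*(1-dx)(1-dy) + a01*dx(1-dy) + a10*(1-dx)dy + a11*dx*dy
int main()
{
    const float patch[2][2] = { { 10.f, 20.f },   // a00 a01
                                { 30.f, 40.f } }; // a10 a11
    const float dx = 0.25f, dy = 0.5f;

    const float v = patch[0][0] * (1 - dx) * (1 - dy)
                    + patch[0][1] * dx * (1 - dy)
                    + patch[1][0] * (1 - dx) * dy
                    + patch[1][1] * dx * dy;

    std::printf("%f\n", v); // 22.5: halfway down, a quarter of the way across
    return 0;
}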
diff --git a/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp b/src/core/helpers/SoftmaxHelpers.cpp
similarity index 63%
rename from src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp
rename to src/core/helpers/SoftmaxHelpers.cpp
index b0cafae520..71b971af31 100644
--- a/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp
+++ b/src/core/helpers/SoftmaxHelpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+* Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -21,26 +21,25 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h"
+#include "src/core/helpers/SoftmaxHelpers.h"
 
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-using namespace arm_compute;
-
-NESimpleAssemblyFunction::NESimpleAssemblyFunction() // NOLINT
-    : _kernel()
+namespace arm_compute
 {
-}
-
-void NESimpleAssemblyFunction::run()
+namespace softmax_helpers
 {
-    NEScheduler::get().schedule(_kernel.get(), Window::DimX);
-}
-
-void NESimpleAssemblyFunction::configure(std::unique_ptr<INEGEMMWrapperKernel> kernel)
+PermutationVector get_permutation_vector_from_softmax_axis(size_t axis)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(kernel.get());
-    _kernel = std::move(kernel);
-    ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(_kernel->window(), 1);
+    switch(axis)
+    {
+        case 1:
+            return PermutationVector(1U, 0U, 2U, 3U);
+        case 2:
+            return PermutationVector(2U, 1U, 0U, 3U);
+        case 3:
+            return PermutationVector(3U, 1U, 2U, 0U);
+        default:
+            ARM_COMPUTE_ERROR("Axis not supported");
+    }
 }
+} // namespace softmax_helpers
+} // namespace arm_compute
diff --git a/src/core/helpers/SoftmaxHelpers.h b/src/core/helpers/SoftmaxHelpers.h
new file mode 100644
index 0000000000..de5490a14d
--- /dev/null
+++ b/src/core/helpers/SoftmaxHelpers.h
@@ -0,0 +1,50 @@
+/*
+* Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_HELPERS_SOFTMAXHELPERS_H
+#define SRC_CORE_HELPERS_SOFTMAXHELPERS_H
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+namespace softmax_helpers
+{
+/** Given a softmax axis, this function returns the permutation vector required to put the axis to the front
+ *
+ * @note This function assumes a tensor rank <= 4
+ *
+ * Axis selects the dimension on which softmax is performed.
+ * E.g. For input of shape 4x5x6 and axis=1, softmax will be applied to 4x6=24 vectors of size 5.
+ * Internally softmax is always performed on the first (front) dimension, so a permutation is
+ * required to move the dimension specified by @p axis to the front.
+ *
+ * @param[in] axis Axis on which to perform softmax. Supported: 1, 2, 3 (0 implies no permutation needed)
+ *
+ * @return the permutation vector
+ */
+PermutationVector get_permutation_vector_from_softmax_axis(size_t axis);
+} // namespace softmax_helpers
+} // namespace arm_compute
+
+#endif /* SRC_CORE_HELPERS_SOFTMAXHELPERS_H */
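As the documentation above says, the returned vector moves the softmax axis to the front: axis 2 yields PermutationVector(2, 1, 0, 3), which swaps dimensions 0 and 2; the kernel then reduces along dimension 0 and the inverse permutation restores the layout. A hedged sketch using the generic permute() helper from arm_compute/core/Helpers.h:

#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorShape.h"
#include "src/core/helpers/SoftmaxHelpers.h"

using namespace arm_compute;

int main()
{
    const TensorShape shape(4U, 5U, 6U, 7U);

    // axis == 2 swaps dimensions 0 and 2: (4,5,6,7) -> (6,5,4,7)
    const PermutationVector perm = softmax_helpers::get_permutation_vector_from_softmax_axis(2);

    TensorShape permuted = shape;
    permute(permuted, perm);
    return permuted[0] == 6U ? 0 : 1; // the size-6 axis is now dimension 0
}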
diff --git a/src/core/helpers/Utils.h b/src/core/helpers/Utils.h
new file mode 100644
index 0000000000..3c3b2b93f9
--- /dev/null
+++ b/src/core/helpers/Utils.h
@@ -0,0 +1,97 @@
+/*
+* Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_HELPERS_UTILS_H
+#define SRC_CORE_HELPERS_UTILS_H
+
+#include "arm_compute/core/ITensorInfo.h"
+
+namespace arm_compute
+{
+/** Create a strides object based on the provided strides and the tensor dimensions.
+ *
+ * @param[in] info          Tensor info object providing the shape of the tensor for unspecified strides.
+ * @param[in] stride_x      Stride to be used in X dimension (in bytes).
+ * @param[in] fixed_strides Strides to be used in higher dimensions starting at Y (in bytes).
+ *
+ * @return Strides object based on the specified strides. Missing strides are
+ *         calculated based on the tensor shape and the strides of lower dimensions.
+ */
+template <typename T, typename... Ts>
+inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&... fixed_strides)
+{
+    const TensorShape &shape = info.tensor_shape();
+
+    // Create strides object
+    Strides strides(stride_x, fixed_strides...);
+
+    for(size_t i = 1 + sizeof...(Ts); i < info.num_dimensions(); ++i)
+    {
+        strides.set(i, shape[i - 1] * strides[i - 1]);
+    }
+
+    return strides;
+}
+
+/** Create a strides object based on the tensor dimensions.
+ *
+ * @param[in] info Tensor info object used to compute the strides.
+ *
+ * @return Strides object based on element size and tensor shape.
+ */
+template <typename... Ts>
+inline Strides compute_strides(const ITensorInfo &info)
+{
+    return compute_strides(info, info.element_size());
+}
+
+/** Given an integer value, this function returns the next power of two
+ *
+ * @param[in] x Input value
+ *
+ * @return the next power of two
+ */
+inline unsigned int get_next_power_two(unsigned int x)
+{
+    // Decrement by 1
+    x--;
+
+    // Shift right by 1
+    x |= x >> 1u;
+    // Shift right by 2
+    x |= x >> 2u;
+    // Shift right by 4
+    x |= x >> 4u;
+    // Shift right by 8
+    x |= x >> 8u;
+    // Shift right by 16
+    x |= x >> 16u;
+
+    // Increment by 1
+    x++;
+
+    return x;
+}
+} // namespace arm_compute
+
+#endif /* SRC_CORE_HELPERS_UTILS_H */
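get_next_power_two uses the classic bit-smearing trick: after the shift-or cascade every bit below the most significant set bit is 1, so the final increment lands exactly on a power of two, and the initial decrement makes exact powers map to themselves. The same logic, checked standalone:

#include <cstdio>

// Same bit-smearing trick as get_next_power_two above, shown standalone.
static unsigned int next_pow2(unsigned int x)
{
    x--;          // so that exact powers of two are returned unchanged
    x |= x >> 1u; // after these shifts every bit below the MSB is set
    x |= x >> 2u;
    x |= x >> 4u;
    x |= x >> 8u;
    x |= x >> 16u; // enough to cover 32-bit values
    return ++x;    // all-ones plus one is the next power of two
}

int main()
{
    std::printf("%u %u %u\n", next_pow2(1), next_pow2(17), next_pow2(64)); // 1 32 64
    return 0;
}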
diff --git a/src/core/helpers/WindowHelpers.cpp b/src/core/helpers/WindowHelpers.cpp
new file mode 100644
index 0000000000..ba10eb9775
--- /dev/null
+++ b/src/core/helpers/WindowHelpers.cpp
@@ -0,0 +1,183 @@
+/*
+* Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/helpers/WindowHelpers.h"
+
+namespace arm_compute
+{
+Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
+{
+    if(!skip_border)
+    {
+        border_size = BorderSize(0);
+    }
+
+    const Coordinates &anchor = valid_region.anchor;
+    const TensorShape &shape  = valid_region.shape;
+
+    Window window;
+
+    window.set(0, Window::Dimension(
+                  // Skip the border left of the image
+                  anchor[0] + border_size.left,
+                  // Skip the border right of the image
+                  // Make sure the window width is a multiple of the step size
+                  anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]),
+                  steps[0]));
+
+    size_t n = 1;
+
+    if(anchor.num_dimensions() > 1)
+    {
+        window.set(1, Window::Dimension(
+                      // Skip the border above the image
+                      anchor[1] + border_size.top,
+                      // Skip the border below the image
+                      anchor[1] + border_size.top + ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) - static_cast<int>(border_size.top) - static_cast<int>(border_size.bottom)), steps[1]),
+                      steps[1]));
+
+        ++n;
+    }
+
+    if(anchor.num_dimensions() > 2)
+    {
+        window.set(2, Window::Dimension(anchor[2], std::max<size_t>(1, shape[2]), steps[2]));
+
+        ++n;
+    }
+
+    for(; n < anchor.num_dimensions(); ++n)
+    {
+        window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n])));
+    }
+
+    for(; n < Coordinates::num_max_dimensions; ++n)
+    {
+        window.set(n, Window::Dimension(0, 1));
+    }
+
+    return window;
+}
+
+Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Steps &steps, BorderSize border_size)
+{
+    const Coordinates &anchor = valid_region.anchor;
+    const TensorShape &shape  = valid_region.shape;
+
+    Window window;
+
+    window.set(0, Window::Dimension(
+                  // move the anchor to the start from the border
+                  anchor[0] - border_size.left,
+                  // move the anchor to include the right end border
+                  // Make sure the window width is a multiple of the step size
+                  anchor[0] - border_size.left + ceil_to_multiple(shape[0] + border_size.left + border_size.right, steps[0]),
+                  steps[0]));
+
+    size_t n = 1;
+
+    if(anchor.num_dimensions() > 1)
+    {
+        window.set(1, Window::Dimension(
+                      // Include the border above the image
+                      anchor[1] - border_size.top,
+                      // Include the border below the image
+                      anchor[1] - border_size.top + ceil_to_multiple(shape[1] + border_size.top + border_size.bottom, steps[1]),
+                      steps[1]));
+
+        ++n;
+    }
+
+    if(anchor.num_dimensions() > 2)
+    {
+        window.set(2, Window::Dimension(0, std::max<size_t>(1, shape[n]), steps[2]));
+
+        ++n;
+    }
+
+    for(; n < anchor.num_dimensions(); ++n)
+    {
+        window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n])));
+    }
+
+    for(; n < Coordinates::num_max_dimensions; ++n)
+    {
+        window.set(n, Window::Dimension(0, 1));
+    }
+
+    return window;
+}
+
+Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
+{
+    if(skip_border)
+    {
+        border_size.top    = 0;
+        border_size.bottom = 0;
+    }
+    else
+    {
+        border_size.left  = 0;
+        border_size.right = 0;
+    }
+
+    const Coordinates &anchor = valid_region.anchor;
+    const TensorShape &shape  = valid_region.shape;
+
+    Window window;
+
+    window.set(0, Window::Dimension(
+                  // Skip the border left of the image
+                  anchor[0] + border_size.left,
+                  // Skip the border right of the image
+                  // Make sure the window width is a multiple of the step size
+                  anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]),
+                  steps[0]));
+
+    size_t n = 1;
+
+    if(anchor.num_dimensions() > 1)
+    {
+        window.set(1, Window::Dimension(
+                      // Skip the border above the image
+                      anchor[1] - border_size.top,
+                      // Skip the border below the image
+                      anchor[1] + shape[1] + border_size.bottom,
+                      1));
+
+        ++n;
+    }
+
+    for(; n < anchor.num_dimensions(); ++n)
+    {
+        window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n])));
+    }
+
+    for(; n < Coordinates::num_max_dimensions; ++n)
+    {
+        window.set(n, Window::Dimension(0, 1));
+    }
+
+    return window;
+}
+} // namespace arm_compute
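Note how calculate_max_window rounds the x-range up with ceil_to_multiple: the window end can exceed the valid region, and it is the kernel's job (usually via padding or a tail guard) to handle the final partial step. A hedged configure-time sketch:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Window.h"
#include "src/core/helpers/WindowHelpers.h"

using namespace arm_compute;

int main()
{
    // 100 elements wide, processed 16 at a time: the window x-end is rounded
    // up to 112, so the last iteration covers elements past index 99.
    TensorInfo   info(TensorShape(100U, 8U), 1, DataType::F32);
    const Window win = calculate_max_window(info, Steps(16U));

    return win.x().end() == 112 ? 0 : 1;
}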
diff --git a/src/core/helpers/WindowHelpers.h b/src/core/helpers/WindowHelpers.h
new file mode 100644
index 0000000000..9bc2135b6d
--- /dev/null
+++ b/src/core/helpers/WindowHelpers.h
@@ -0,0 +1,172 @@
+/*
+* Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_HELPERS_WINDOWHELPERS_H
+#define SRC_CORE_HELPERS_WINDOWHELPERS_H
+
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Steps.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+/** Update window and padding size for each of the access patterns.
+ *
+ * First the window size is reduced based on all access patterns that are not
+ * allowed to modify the padding of the underlying tensor. Then the padding of
+ * the remaining tensors is increased to match the window.
+ *
+ * @param[in] win      Window that is used by the kernel.
+ * @param[in] patterns Access patterns used to calculate the final window and padding.
+ *
+ * @return True if the window has been changed. Changes to the padding do not
+ *         influence the returned value.
+ */
+template <typename... Ts>
+bool update_window_and_padding(Window &win, Ts &&... patterns)
+{
+    bool window_changed = false;
+
+    utility::for_each([&](const IAccessWindow & w)
+    {
+        window_changed |= w.update_window_if_needed(win);
+    },
+    patterns...);
+
+    bool padding_changed = false;
+
+    utility::for_each([&](IAccessWindow & w)
+    {
+        padding_changed |= w.update_padding_if_needed(win);
+    },
+    patterns...);
+
+    return window_changed;
+}
+
+/** Intersect multiple valid regions.
+ *
+ * @param[in] regions Valid regions.
+ *
+ * @return Intersection of all regions.
+ */
+template <typename... Ts>
+ValidRegion intersect_valid_regions(const Ts &...
regions) +{ + auto intersect = [](const ValidRegion & r1, const ValidRegion & r2) -> ValidRegion + { + ValidRegion region; + + for(size_t d = 0; d < std::min(r1.anchor.num_dimensions(), r2.anchor.num_dimensions()); ++d) + { + region.anchor.set(d, std::max(r1.anchor[d], r2.anchor[d])); + } + + for(size_t d = 0; d < std::min(r1.shape.num_dimensions(), r2.shape.num_dimensions()); ++d) + { + region.shape.set(d, std::min(r1.shape[d], r2.shape[d])); + } + + return region; + }; + + return utility::foldl(intersect, regions...); +} + +#ifndef DOXYGEN_SKIP_THIS +/** Calculate the maximum window for a given tensor shape and border setting + * + * @param[in] valid_region Valid region object defining the shape of the tensor space for which the window is created. + * @param[in] steps (Optional) Number of elements processed for each step. + * @param[in] skip_border (Optional) If true exclude the border region from the window. + * @param[in] border_size (Optional) Border size. + * + * @return The maximum window the kernel can be executed on. + */ +Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize()); + +/** Calculate the maximum window for a given tensor shape and border setting + * + * @param[in] info Tensor info object defining the shape of the object for which the window is created. + * @param[in] steps (Optional) Number of elements processed for each step. + * @param[in] skip_border (Optional) If true exclude the border region from the window. + * @param[in] border_size (Optional) Border size. + * + * @return The maximum window the kernel can be executed on. + */ +inline Window calculate_max_window(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize()) +{ + return calculate_max_window(info.valid_region(), steps, skip_border, border_size); +} + +/** Calculate the maximum window used by a horizontal kernel for a given tensor shape and border setting + * + * @param[in] valid_region Valid region object defining the shape of the tensor space for which the window is created. + * @param[in] steps (Optional) Number of elements processed for each step. + * @param[in] skip_border (Optional) If true exclude the border region from the window. + * @param[in] border_size (Optional) Border size. The border region will be excluded from the window. + * + * @return The maximum window the kernel can be executed on. + */ +Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize()); + +/** Calculate the maximum window used by a horizontal kernel for a given tensor shape and border setting + * + * @param[in] info Tensor info object defining the shape of the object for which the window is created. + * @param[in] steps (Optional) Number of elements processed for each step. + * @param[in] skip_border (Optional) If true exclude the border region from the window. + * @param[in] border_size (Optional) Border size. + * + * @return The maximum window the kernel can be executed on. + */ +inline Window calculate_max_window_horizontal(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize()) +{ + return calculate_max_window_horizontal(info.valid_region(), steps, skip_border, border_size); +} + +/** Calculate the maximum window for a given tensor shape and border setting. 
The window will also include the border.
+ *
+ * @param[in] valid_region Valid region object defining the shape of the tensor space for which the window is created.
+ * @param[in] steps        (Optional) Number of elements processed for each step.
+ * @param[in] border_size  (Optional) Border size. The border region will be included in the window.
+ *
+ * @return The maximum window the kernel can be executed on.
+ */
+Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Steps &steps = Steps(), BorderSize border_size = BorderSize());
+
+/** Calculate the maximum window for a given tensor shape and border setting. The window will also include the border.
+ *
+ * @param[in] info        Tensor info object defining the shape of the object for which the window is created.
+ * @param[in] steps       (Optional) Number of elements processed for each step.
+ * @param[in] border_size (Optional) Border size. The border region will be included in the window.
+ *
+ * @return The maximum window the kernel can be executed on.
+ */
+inline Window calculate_max_enlarged_window(const ITensorInfo &info, const Steps &steps = Steps(), BorderSize border_size = BorderSize())
+{
+    return calculate_max_enlarged_window(info.valid_region(), steps, border_size);
+}
+#endif /* DOXYGEN_SKIP_THIS */
+} // namespace arm_compute
+
+#endif /* SRC_CORE_HELPERS_WINDOWHELPERS_H */
diff --git a/arm_compute/core/utils/helpers/bit_ops.h b/src/core/utils/helpers/bit_ops.h
similarity index 95%
rename from arm_compute/core/utils/helpers/bit_ops.h
rename to src/core/utils/helpers/bit_ops.h
index eee360c9e3..ef60214c9f 100644
--- a/arm_compute/core/utils/helpers/bit_ops.h
+++ b/src/core/utils/helpers/bit_ops.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,7 +24,7 @@
 #ifndef ARM_COMPUTE_UTILS_HELPERS_BIT_OPS_H
 #define ARM_COMPUTE_UTILS_HELPERS_BIT_OPS_H
 
-#include "arm_compute/core/utils/misc/Requires.h"
+#include "support/Requires.h"
 
 #include <type_traits>
 
diff --git a/src/core/utils/helpers/fft.cpp b/src/core/utils/helpers/fft.cpp
index 4c2f8fa494..64633c643d 100644
--- a/src/core/utils/helpers/fft.cpp
+++ b/src/core/utils/helpers/fft.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -21,7 +21,7 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#include "arm_compute/core/utils/helpers/fft.h"
+#include "src/core/utils/helpers/fft.h"
 
 #include <numeric>
 
diff --git a/arm_compute/core/utils/helpers/fft.h b/src/core/utils/helpers/fft.h
similarity index 98%
rename from arm_compute/core/utils/helpers/fft.h
rename to src/core/utils/helpers/fft.h
index 7d111b764b..f7b99dd7b8 100644
--- a/arm_compute/core/utils/helpers/fft.h
+++ b/src/core/utils/helpers/fft.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
diff --git a/arm_compute/core/utils/helpers/float_ops.h b/src/core/utils/helpers/float_ops.h
similarity index 98%
rename from arm_compute/core/utils/helpers/float_ops.h
rename to src/core/utils/helpers/float_ops.h
index 1a08fc76b4..a475a23b59 100644
--- a/arm_compute/core/utils/helpers/float_ops.h
+++ b/src/core/utils/helpers/float_ops.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
* * SPDX-License-Identifier: MIT * diff --git a/arm_compute/core/utils/helpers/tensor_info.h b/src/core/utils/helpers/tensor_info.h similarity index 98% rename from arm_compute/core/utils/helpers/tensor_info.h rename to src/core/utils/helpers/tensor_info.h index 443234064a..9279532e2a 100644 --- a/arm_compute/core/utils/helpers/tensor_info.h +++ b/src/core/utils/helpers/tensor_info.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/utils/helpers/tensor_transform.cpp b/src/core/utils/helpers/tensor_transform.cpp index 84302ea19f..f2216995a9 100644 --- a/src/core/utils/helpers/tensor_transform.cpp +++ b/src/core/utils/helpers/tensor_transform.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,7 +23,7 @@ */ #include "arm_compute/core/utils/helpers/tensor_transform.h" -#include "arm_compute/core/utils/helpers/bit_ops.h" +#include "bit_ops.h" namespace arm_compute { diff --git a/src/graph/Graph.cpp b/src/graph/Graph.cpp index ad6f200d36..af75eacc02 100644 --- a/src/graph/Graph.cpp +++ b/src/graph/Graph.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -51,9 +51,9 @@ bool Graph::remove_node(NodeID nid) // Remove output connections std::set output_edges_copy = node->output_edges(); - for(auto &outpud_eid : output_edges_copy) + for(auto &output_eid : output_edges_copy) { - remove_connection(outpud_eid); + remove_connection(output_eid); } // Remove nid from tagged nodes diff --git a/src/graph/GraphBuilder.cpp b/src/graph/GraphBuilder.cpp index 72c45fae30..2f74f065d5 100644 --- a/src/graph/GraphBuilder.cpp +++ b/src/graph/GraphBuilder.cpp @@ -139,6 +139,12 @@ NodeID GraphBuilder::add_activation_node(Graph &g, NodeParams params, NodeIdxPai return create_simple_single_input_output_node(g, params, input, act_info, out_quant_info); } +NodeID GraphBuilder::add_arg_min_max_node(Graph &g, NodeParams params, NodeIdxPair input, ReductionOperation op, unsigned int axis, + DataType out_data_type, const QuantizationInfo &out_quant_info) +{ + return create_simple_single_input_output_node(g, params, input, op, axis, out_data_type, out_quant_info); +} + NodeID GraphBuilder::add_batch_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, float epsilon, ITensorAccessorUPtr mean_accessor, ITensorAccessorUPtr var_accessor, ITensorAccessorUPtr beta_accessor, ITensorAccessorUPtr gamma_accessor) @@ -376,6 +382,12 @@ NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g, NodeParams params, return conv_nid; } + +NodeID GraphBuilder::add_depth_to_space_node(Graph &g, NodeParams params, NodeIdxPair input, int32_t block_shape) +{ + return create_simple_single_input_output_node(g, params, input, block_shape); +} + NodeID GraphBuilder::add_dequantization_node(Graph &g, NodeParams params, NodeIdxPair input) { return create_simple_single_input_output_node(g, params, input); @@ -540,6 +552,11 @@ NodeID GraphBuilder::add_generate_proposals_node(Graph &g, NodeParams params, No return nid; } +NodeID GraphBuilder::add_l2_normalize_node(Graph &g, NodeParams params, NodeIdxPair input, int axis, float epsilon) +{ + return create_simple_single_input_output_node(g, params, input, axis, epsilon); +} + NodeID GraphBuilder::add_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, NormalizationLayerInfo norm_info) { return 
create_simple_single_input_output_node(g, params, input, norm_info); @@ -625,6 +642,11 @@ NodeID GraphBuilder::add_quantization_node(Graph &g, NodeParams params, NodeIdxP return create_simple_single_input_output_node(g, params, input, out_quant_info); } +NodeID GraphBuilder::add_reduction_operation_node(Graph &g, NodeParams params, NodeIdxPair input, ReductionOperation op, int axis, bool keep_dims) +{ + return create_simple_single_input_output_node(g, params, input, op, axis, keep_dims); +} + NodeID GraphBuilder::add_reorg_node(Graph &g, NodeParams params, NodeIdxPair input, int stride) { return create_simple_single_input_output_node(g, params, input, stride); @@ -700,6 +722,11 @@ NodeID GraphBuilder::add_split_node(Graph &g, NodeParams params, NodeIdxPair inp return create_simple_single_input_output_node(g, params, input, num_splits, axis); } +NodeID GraphBuilder::add_strided_slice_node(Graph &g, NodeParams params, NodeIdxPair input, Coordinates &starts, Coordinates &ends, BiStrides &strides, StridedSliceLayerInfo info) +{ + return create_simple_single_input_output_node(g, params, input, starts, ends, strides, info); +} + NodeID GraphBuilder::add_stack_node(Graph &g, NodeParams params, const std::vector &inputs, int axis) { return create_simple_multiple_input_single_output_node(g, params, inputs, inputs.size(), axis); diff --git a/src/graph/TypeLoader.cpp b/src/graph/TypeLoader.cpp index a1b3fd899c..7082d6b99e 100644 --- a/src/graph/TypeLoader.cpp +++ b/src/graph/TypeLoader.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,30 +29,6 @@ namespace arm_compute { -arm_compute::DataType data_type_from_name(const std::string &name) -{ - static const std::map data_types = - { - { "f16", DataType::F16 }, - { "f32", DataType::F32 }, - { "qasymm8", DataType::QASYMM8 }, - }; - -#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED - try - { -#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ - return data_types.at(arm_compute::utility::tolower(name)); - -#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED - } - catch(const std::out_of_range &) - { - throw std::invalid_argument(name); - } -#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */ -} - arm_compute::DataLayout data_layout_from_name(const std::string &name) { static const std::map data_layouts = diff --git a/src/graph/algorithms/TopologicalSort.cpp b/src/graph/algorithms/TopologicalSort.cpp index 3647e13e92..3a69352471 100644 --- a/src/graph/algorithms/TopologicalSort.cpp +++ b/src/graph/algorithms/TopologicalSort.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -25,7 +25,7 @@ #include "arm_compute/graph/Graph.h" -#include "arm_compute/core/utils/misc/Iterable.h" +#include "support/Iterable.h" #include #include @@ -185,4 +185,4 @@ std::vector dfs(Graph &g) return dfs_order_vector; } } // namespace graph -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp index d41da4bf7f..98013b9e49 100644 --- a/src/graph/backends/CL/CLFunctionsFactory.cpp +++ b/src/graph/backends/CL/CLFunctionsFactory.cpp @@ -23,12 +23,13 @@ */ #include "arm_compute/graph/backends/CL/CLFunctionFactory.h" -#include "arm_compute/core/utils/misc/Cast.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/backends/FunctionHelpers.h" #include "arm_compute/runtime/CL/CLFunctions.h" #include "arm_compute/runtime/CPP/CPPFunctions.h" +#include "src/core/CL/CLKernels.h" +#include "support/Cast.h" using namespace arm_compute::utils::cast; @@ -64,6 +65,7 @@ struct CLEltwiseFunctions using Addition = CLArithmeticAddition; using Subtraction = CLArithmeticSubtraction; using Multiplication = CLPixelWiseMultiplication; + using Maximum = CLElementwiseMax; }; /** Collection of CL unary element-wise functions */ @@ -237,6 +239,8 @@ std::unique_ptr CLFunctionFactory::create(INode *node, GraphContext & { case NodeType::ActivationLayer: return detail::create_activation_layer(*polymorphic_downcast(node)); + case NodeType::ArgMinMaxLayer: + return detail::create_arg_min_max_layer(*polymorphic_downcast(node)); case NodeType::BatchNormalizationLayer: return detail::create_batch_normalization_layer(*polymorphic_downcast(node)); case NodeType::BoundingBoxTransformLayer: @@ -249,6 +253,8 @@ std::unique_ptr CLFunctionFactory::create(INode *node, GraphContext & return detail::create_deconvolution_layer(*polymorphic_downcast(node), ctx); case NodeType::ConcatenateLayer: return detail::create_concatenate_layer(*polymorphic_downcast(node)); + case NodeType::DepthToSpaceLayer: + return detail::create_depth_to_space_layer(*polymorphic_downcast(node)); case NodeType::DepthwiseConvolutionLayer: return detail::create_depthwise_convolution_layer(*polymorphic_downcast(node)); case NodeType::DequantizationLayer: @@ -271,6 +277,8 @@ std::unique_ptr CLFunctionFactory::create(INode *node, GraphContext & return detail::create_fused_depthwise_convolution_batch_normalization_layer(*polymorphic_downcast(node), ctx); case NodeType::GenerateProposalsLayer: return detail::create_generate_proposals_layer(*polymorphic_downcast(node), ctx); + case NodeType::L2NormalizeLayer: + return detail::create_l2_normalize_layer(*polymorphic_downcast(node), ctx); case NodeType::NormalizationLayer: return detail::create_normalization_layer(*polymorphic_downcast(node), ctx); case NodeType::NormalizePlanarYUVLayer: @@ -289,6 +297,8 @@ std::unique_ptr CLFunctionFactory::create(INode *node, GraphContext & return detail::create_priorbox_layer(*polymorphic_downcast(node)); case NodeType::QuantizationLayer: return detail::create_quantization_layer(*polymorphic_downcast(node)); + case NodeType::ReductionOperationLayer: + return detail::create_reduction_operation_layer(*polymorphic_downcast(node), ctx); case NodeType::ReorgLayer: return detail::create_reorg_layer(*polymorphic_downcast(node)); case NodeType::ReshapeLayer: @@ -303,6 +313,8 @@ std::unique_ptr CLFunctionFactory::create(INode *node, GraphContext & return 
detail::create_softmax_layer(*polymorphic_downcast(node), ctx); case NodeType::StackLayer: return detail::create_stack_layer(*polymorphic_downcast(node)); + case NodeType::StridedSliceLayer: + return detail::create_strided_slice_layer(*polymorphic_downcast(node)); case NodeType::UpsampleLayer: return detail::create_upsample_layer(*polymorphic_downcast(node), ctx); case NodeType::YOLOLayer: diff --git a/src/graph/backends/CL/CLNodeValidator.cpp b/src/graph/backends/CL/CLNodeValidator.cpp index cc618db127..830f54ce3f 100644 --- a/src/graph/backends/CL/CLNodeValidator.cpp +++ b/src/graph/backends/CL/CLNodeValidator.cpp @@ -26,9 +26,20 @@ #include "arm_compute/graph/backends/ValidateHelpers.h" #include "arm_compute/graph/nodes/Nodes.h" -#include "arm_compute/core/utils/misc/Cast.h" #include "arm_compute/runtime/CL/CLFunctions.h" #include "arm_compute/runtime/CPP/CPPFunctions.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "src/core/CL/kernels/CLIm2ColKernel.h" +#include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" +#include "src/core/CL/kernels/CLWeightsReshapeKernel.h" +#include "support/Cast.h" using namespace arm_compute::utils::cast; @@ -44,6 +55,7 @@ struct CLEltwiseLayerFunctions using ArithmeticAddition = CLArithmeticAddition; using ArithmeticSubtraction = CLArithmeticSubtraction; using PixelWiseMultiplication = CLPixelWiseMultiplication; + using ElementwiseMax = CLElementwiseMax; }; /** Collection of CL unary element-wise functions */ @@ -62,6 +74,8 @@ Status CLNodeValidator::validate(INode *node) NodeType type = node->type(); switch(type) { + case NodeType::ArgMinMaxLayer: + return detail::validate_arg_min_max_layer(*polymorphic_downcast(node)); case NodeType::BoundingBoxTransformLayer: return detail::validate_bounding_box_transform_layer(*polymorphic_downcast(node)); case NodeType::ChannelShuffleLayer: @@ -71,6 +85,8 @@ Status CLNodeValidator::validate(INode *node) CLDirectConvolutionLayer, CLGEMMConvolutionLayer, CLWinogradConvolutionLayer>(*polymorphic_downcast(node)); + case NodeType::DepthToSpaceLayer: + return detail::validate_depth_to_space_layer(*polymorphic_downcast(node)); case NodeType::DepthwiseConvolutionLayer: return detail::validate_depthwise_convolution_layer(*polymorphic_downcast(node)); case NodeType::DequantizationLayer: @@ -81,6 +97,8 @@ Status CLNodeValidator::validate(INode *node) return detail::validate_detection_post_process_layer(*polymorphic_downcast(node)); case NodeType::GenerateProposalsLayer: return detail::validate_generate_proposals_layer(*polymorphic_downcast(node)); + case NodeType::L2NormalizeLayer: + return detail::validate_l2_normalize_layer(*polymorphic_downcast(node)); case NodeType::NormalizePlanarYUVLayer: return detail::validate_normalize_planar_yuv_layer(*polymorphic_downcast(node)); case NodeType::PadLayer: @@ -93,6 +111,8 @@ Status CLNodeValidator::validate(INode *node) return detail::validate_priorbox_layer(*polymorphic_downcast(node)); case NodeType::QuantizationLayer: return 
detail::validate_quantization_layer(*polymorphic_downcast(node)); + case NodeType::ReductionOperationLayer: + return detail::validate_reduction_operation_layer(*polymorphic_downcast(node)); case NodeType::ReorgLayer: return detail::validate_reorg_layer(*polymorphic_downcast(node)); case NodeType::ReshapeLayer: @@ -101,6 +121,8 @@ Status CLNodeValidator::validate(INode *node) return detail::validate_roi_align_layer(*polymorphic_downcast(node)); case NodeType::SliceLayer: return detail::validate_slice_layer(*polymorphic_downcast(node)); + case NodeType::StridedSliceLayer: + return detail::validate_strided_slice_layer(*polymorphic_downcast(node)); case NodeType::UpsampleLayer: return detail::validate_upsample_layer(*polymorphic_downcast(node)); case NodeType::YOLOLayer: diff --git a/src/graph/backends/CL/CLSubTensorHandle.cpp b/src/graph/backends/CL/CLSubTensorHandle.cpp index ada0d686ed..b97d25890a 100644 --- a/src/graph/backends/CL/CLSubTensorHandle.cpp +++ b/src/graph/backends/CL/CLSubTensorHandle.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,7 +23,7 @@ */ #include "arm_compute/graph/backends/CL/CLSubTensorHandle.h" -#include "arm_compute/core/utils/misc/Cast.h" +#include "support/Cast.h" namespace arm_compute { diff --git a/src/graph/backends/GLES/GCFunctionsFactory.cpp b/src/graph/backends/GLES/GCFunctionsFactory.cpp index 8ecb593e11..7d9d388ebe 100644 --- a/src/graph/backends/GLES/GCFunctionsFactory.cpp +++ b/src/graph/backends/GLES/GCFunctionsFactory.cpp @@ -23,11 +23,11 @@ */ #include "arm_compute/graph/backends/GLES/GCFunctionFactory.h" -#include "arm_compute/core/utils/misc/Cast.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/backends/FunctionHelpers.h" #include "arm_compute/runtime/GLES_COMPUTE/GCFunctions.h" +#include "support/Cast.h" using namespace arm_compute::utils::cast; diff --git a/src/graph/backends/GLES/GCNodeValidator.cpp b/src/graph/backends/GLES/GCNodeValidator.cpp index 159e51246a..13a93a2556 100644 --- a/src/graph/backends/GLES/GCNodeValidator.cpp +++ b/src/graph/backends/GLES/GCNodeValidator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,8 +26,8 @@ #include "arm_compute/graph/backends/ValidateHelpers.h" #include "arm_compute/graph/nodes/Nodes.h" -#include "arm_compute/core/utils/misc/Cast.h" #include "arm_compute/runtime/GLES_COMPUTE/GCFunctions.h" +#include "support/Cast.h" using namespace arm_compute::utils::cast; diff --git a/src/graph/backends/NEON/NEFunctionFactory.cpp b/src/graph/backends/NEON/NEFunctionFactory.cpp index 4fee630192..ec06f3fa30 100644 --- a/src/graph/backends/NEON/NEFunctionFactory.cpp +++ b/src/graph/backends/NEON/NEFunctionFactory.cpp @@ -23,7 +23,6 @@ */ #include "arm_compute/graph/backends/NEON/NEFunctionFactory.h" -#include "arm_compute/core/utils/misc/Cast.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/Logger.h" @@ -33,6 +32,8 @@ #include "arm_compute/graph/nodes/Nodes.h" #include "arm_compute/runtime/CPP/CPPFunctions.h" #include "arm_compute/runtime/NEON/NEFunctions.h" +#include "src/core/NEON/NEKernels.h" +#include "support/Cast.h" #include "support/ToolchainSupport.h" using namespace arm_compute::utils::cast; @@ -69,6 +70,7 @@ struct NEEltwiseFunctions using Addition = NEArithmeticAddition; using Subtraction = NEArithmeticSubtraction; using Multiplication = NEPixelWiseMultiplication; + using Maximum = NEElementwiseMax; }; /** Collection of NEON unary element-wise functions */ @@ -130,12 +132,16 @@ std::unique_ptr NEFunctionFactory::create(INode *node, GraphContext & { case NodeType::ActivationLayer: return detail::create_activation_layer(*polymorphic_downcast(node)); + case NodeType::ArgMinMaxLayer: + return detail::create_arg_min_max_layer(*polymorphic_downcast(node)); case NodeType::BatchNormalizationLayer: return detail::create_batch_normalization_layer(*polymorphic_downcast(node)); case NodeType::ChannelShuffleLayer: return detail::create_channel_shuffle_layer(*polymorphic_downcast(node)); case NodeType::ConvolutionLayer: return detail::create_convolution_layer(*polymorphic_downcast(node), ctx); + case NodeType::DepthToSpaceLayer: + return detail::create_depth_to_space_layer(*polymorphic_downcast(node)); case NodeType::DeconvolutionLayer: return detail::create_deconvolution_layer(*polymorphic_downcast(node), ctx); case NodeType::ConcatenateLayer: @@ -160,6 +166,8 @@ std::unique_ptr NEFunctionFactory::create(INode *node, GraphContext & return detail::create_fused_convolution_batch_normalization_layer(*polymorphic_downcast(node), ctx); case NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer: return detail::create_fused_depthwise_convolution_batch_normalization_layer(*polymorphic_downcast(node), ctx); + case NodeType::L2NormalizeLayer: + return detail::create_l2_normalize_layer(*polymorphic_downcast(node), ctx); case NodeType::NormalizationLayer: return detail::create_normalization_layer(*polymorphic_downcast(node), ctx); case NodeType::PadLayer: @@ -176,6 +184,8 @@ std::unique_ptr NEFunctionFactory::create(INode *node, GraphContext & return detail::create_priorbox_layer(*polymorphic_downcast(node)); case NodeType::QuantizationLayer: return detail::create_quantization_layer(*polymorphic_downcast(node)); + case NodeType::ReductionOperationLayer: + return detail::create_reduction_operation_layer(*polymorphic_downcast(node), ctx); case NodeType::ReorgLayer: return detail::create_reorg_layer(*polymorphic_downcast(node)); case NodeType::ReshapeLayer: @@ -186,6 +196,8 @@ std::unique_ptr NEFunctionFactory::create(INode *node, GraphContext & return 
detail::create_softmax_layer(*polymorphic_downcast(node), ctx); case NodeType::StackLayer: return detail::create_stack_layer(*polymorphic_downcast(node)); + case NodeType::StridedSliceLayer: + return detail::create_strided_slice_layer(*polymorphic_downcast(node)); case NodeType::UpsampleLayer: return detail::create_upsample_layer(*polymorphic_downcast(node), ctx); case NodeType::YOLOLayer: diff --git a/src/graph/backends/NEON/NENodeValidator.cpp b/src/graph/backends/NEON/NENodeValidator.cpp index a5d22fb965..a9e5a86249 100644 --- a/src/graph/backends/NEON/NENodeValidator.cpp +++ b/src/graph/backends/NEON/NENodeValidator.cpp @@ -26,9 +26,22 @@ #include "arm_compute/graph/backends/ValidateHelpers.h" #include "arm_compute/graph/nodes/Nodes.h" -#include "arm_compute/core/utils/misc/Cast.h" #include "arm_compute/runtime/CPP/CPPFunctions.h" #include "arm_compute/runtime/NEON/NEFunctions.h" +#include "src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h" +#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" +#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h" +#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" +#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" +#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" +#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h" +#include "src/core/NEON/kernels/NEReshapeLayerKernel.h" +#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h" +#include "support/Cast.h" using namespace arm_compute::utils::cast; @@ -44,6 +57,7 @@ struct NEEltwiseLayerFunctions using ArithmeticAddition = NEArithmeticAddition; using ArithmeticSubtraction = NEArithmeticSubtraction; using PixelWiseMultiplication = NEPixelWiseMultiplication; + using ElementwiseMax = NEElementwiseMax; }; /** Collection of NEON unary element-wise functions */ @@ -62,6 +76,8 @@ Status NENodeValidator::validate(INode *node) NodeType type = node->type(); switch(type) { + case NodeType::ArgMinMaxLayer: + return detail::validate_arg_min_max_layer(*polymorphic_downcast(node)); case NodeType::BoundingBoxTransformLayer: return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : BoundingBoxTransformLayer"); case NodeType::ChannelShuffleLayer: @@ -71,6 +87,8 @@ Status NENodeValidator::validate(INode *node) NEDirectConvolutionLayer, NEGEMMConvolutionLayer, NEWinogradConvolutionLayer>(*polymorphic_downcast(node)); + case NodeType::DepthToSpaceLayer: + return detail::validate_depth_to_space_layer(*polymorphic_downcast(node)); case NodeType::DepthwiseConvolutionLayer: return detail::validate_depthwise_convolution_layer(*polymorphic_downcast(node)); case NodeType::DequantizationLayer: @@ -81,6 +99,8 @@ Status NENodeValidator::validate(INode *node) return detail::validate_detection_post_process_layer(*polymorphic_downcast(node)); case NodeType::GenerateProposalsLayer: return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : GenerateProposalsLayer"); + case NodeType::L2NormalizeLayer: + return detail::validate_l2_normalize_layer(*polymorphic_downcast(node)); case NodeType::NormalizePlanarYUVLayer: return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation 
: NormalizePlanarYUVLayer"); case NodeType::PadLayer: @@ -93,6 +113,8 @@ Status NENodeValidator::validate(INode *node) return detail::validate_priorbox_layer(*polymorphic_downcast(node)); case NodeType::QuantizationLayer: return detail::validate_quantization_layer(*polymorphic_downcast(node)); + case NodeType::ReductionOperationLayer: + return detail::validate_reduction_operation_layer(*polymorphic_downcast(node)); case NodeType::ReorgLayer: return detail::validate_reorg_layer(*polymorphic_downcast(node)); case NodeType::ReshapeLayer: @@ -101,6 +123,8 @@ Status NENodeValidator::validate(INode *node) return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : ROIAlignLayer"); case NodeType::SliceLayer: return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : SliceLayer"); + case NodeType::StridedSliceLayer: + return detail::validate_strided_slice_layer(*polymorphic_downcast(node)); case NodeType::UpsampleLayer: return detail::validate_upsample_layer(*polymorphic_downcast(node)); case NodeType::YOLOLayer: diff --git a/src/graph/backends/NEON/NETensorHandle.cpp b/src/graph/backends/NEON/NETensorHandle.cpp index c8fc3f1ae2..4393156e8a 100644 --- a/src/graph/backends/NEON/NETensorHandle.cpp +++ b/src/graph/backends/NEON/NETensorHandle.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,8 +23,8 @@ */ #include "arm_compute/graph/backends/NEON/NETensorHandle.h" -#include "arm_compute/core/utils/misc/Cast.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "support/Cast.h" namespace arm_compute { diff --git a/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp index fd16625780..b45f453f23 100644 --- a/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp +++ b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp @@ -33,7 +33,7 @@ #include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/utils/misc/Cast.h" +#include "support/Cast.h" #include #include diff --git a/src/graph/detail/ExecutionHelpers.cpp b/src/graph/detail/ExecutionHelpers.cpp index d5752a9f95..5be3706cfe 100644 --- a/src/graph/detail/ExecutionHelpers.cpp +++ b/src/graph/detail/ExecutionHelpers.cpp @@ -196,9 +196,12 @@ void call_all_const_node_accessors(Graph &g) for(auto &node : nodes) { - if(node != nullptr && node->type() == NodeType::Const) + if(node != nullptr && node->type() == NodeType::Const && node->num_outputs()) { - call_tensor_accessor(node->output(0)); + if(!node->output(0)->bound_edges().empty()) + { + call_tensor_accessor(node->output(0)); + } } } } diff --git a/src/graph/mutators/DepthConcatSubTensorMutator.cpp b/src/graph/mutators/DepthConcatSubTensorMutator.cpp index fa63f5625b..963b948432 100644 --- a/src/graph/mutators/DepthConcatSubTensorMutator.cpp +++ b/src/graph/mutators/DepthConcatSubTensorMutator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -30,8 +30,8 @@ #include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/nodes/ConcatenateLayerNode.h" -#include "arm_compute/core/utils/misc/Cast.h" -#include "arm_compute/core/utils/misc/Iterable.h" +#include "support/Cast.h" +#include "support/Iterable.h" namespace arm_compute { diff --git a/src/graph/mutators/GroupedConvolutionMutator.cpp b/src/graph/mutators/GroupedConvolutionMutator.cpp index e3d3812c1d..b7c551ce8b 100644 --- a/src/graph/mutators/GroupedConvolutionMutator.cpp +++ b/src/graph/mutators/GroupedConvolutionMutator.cpp @@ -30,7 +30,7 @@ #include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/nodes/Nodes.h" -#include "arm_compute/core/utils/misc/Cast.h" +#include "support/Cast.h" #include "support/StringSupport.h" diff --git a/src/graph/mutators/NodeExecutionMethodMutator.cpp b/src/graph/mutators/NodeExecutionMethodMutator.cpp index 48bb9f7fc0..09a3cf50c0 100644 --- a/src/graph/mutators/NodeExecutionMethodMutator.cpp +++ b/src/graph/mutators/NodeExecutionMethodMutator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,7 +29,7 @@ #include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/nodes/Nodes.h" -#include "arm_compute/core/utils/misc/Cast.h" +#include "support/Cast.h" namespace arm_compute { diff --git a/src/graph/mutators/NodeFusionMutator.cpp b/src/graph/mutators/NodeFusionMutator.cpp index afc4452202..1d47668cf2 100644 --- a/src/graph/mutators/NodeFusionMutator.cpp +++ b/src/graph/mutators/NodeFusionMutator.cpp @@ -30,7 +30,7 @@ #include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h" #include "arm_compute/graph/nodes/Nodes.h" -#include "arm_compute/core/utils/misc/Cast.h" +#include "support/Cast.h" #include @@ -300,10 +300,12 @@ IGraphMutator::MutationType NodeFusionMutator::type() const void NodeFusionMutator::mutate(Graph &g) { // Supported activations when fusing - const std::set supported_fused_activations_conv = { Activation::RELU, Activation::BOUNDED_RELU, Activation::LU_BOUNDED_RELU }; - const std::set supported_fused_activations_eltwise = { Activation::RELU, Activation::BOUNDED_RELU, Activation::LU_BOUNDED_RELU, - Activation::TANH, Activation::LOGISTIC - }; + const std::set supported_fused_activations = { Activation::ABS, Activation::BOUNDED_RELU, Activation::ELU, + Activation::HARD_SWISH, Activation::IDENTITY, Activation::LEAKY_RELU, + Activation::LINEAR, Activation::LOGISTIC, Activation::LU_BOUNDED_RELU, + Activation::RELU, Activation::SOFT_RELU, Activation::SQRT, + Activation::SQUARE, Activation::TANH + }; // Preconditions auto empty_prec = [](INode &) @@ -328,11 +330,11 @@ void NodeFusionMutator::mutate(Graph &g) }; // Fusion mutations - detail::fuse_layer(g, empty_prec, detail::fuse_node_with_activation, supported_fused_activations_conv); - detail::fuse_layer(g, empty_prec, detail::fuse_node_with_activation, supported_fused_activations_conv); - detail::fuse_layer(g, qs8_prec, detail::fuse_node_with_activation, supported_fused_activations_conv); - detail::fuse_layer(g, empty_prec, detail::fuse_node_with_activation, supported_fused_activations_conv); - detail::fuse_layer(g, cl_target_prec, detail::fuse_node_with_activation, supported_fused_activations_eltwise); + detail::fuse_layer(g, empty_prec, detail::fuse_node_with_activation, supported_fused_activations); + detail::fuse_layer(g, empty_prec, 
detail::fuse_node_with_activation, supported_fused_activations); + detail::fuse_layer(g, qs8_prec, detail::fuse_node_with_activation, supported_fused_activations); + detail::fuse_layer(g, empty_prec, detail::fuse_node_with_activation, supported_fused_activations); + detail::fuse_layer(g, cl_target_prec, detail::fuse_node_with_activation, supported_fused_activations); detail::fuse_layer(g, empty_prec, detail::fuse_convolution_with_batch_normalization); detail::fuse_layer(g, empty_prec, detail::fuse_depthwise_convolution_with_batch_normalization); } diff --git a/src/graph/mutators/SplitLayerSubTensorMutator.cpp b/src/graph/mutators/SplitLayerSubTensorMutator.cpp index 359bba47ef..2c28a1a2d1 100644 --- a/src/graph/mutators/SplitLayerSubTensorMutator.cpp +++ b/src/graph/mutators/SplitLayerSubTensorMutator.cpp @@ -30,8 +30,8 @@ #include "arm_compute/graph/backends/BackendRegistry.h" #include "arm_compute/graph/nodes/SplitLayerNode.h" -#include "arm_compute/core/utils/misc/Cast.h" -#include "arm_compute/core/utils/misc/Iterable.h" +#include "support/Cast.h" +#include "support/Iterable.h" namespace arm_compute { diff --git a/src/graph/mutators/SyntheticDataTypeMutator.cpp b/src/graph/mutators/SyntheticDataTypeMutator.cpp index dbbebdfb2b..532c0e821b 100644 --- a/src/graph/mutators/SyntheticDataTypeMutator.cpp +++ b/src/graph/mutators/SyntheticDataTypeMutator.cpp @@ -29,7 +29,7 @@ #include "arm_compute/graph/Utils.h" #include "arm_compute/graph/nodes/Nodes.h" -#include "arm_compute/core/utils/misc/Cast.h" +#include "support/Cast.h" #include diff --git a/src/graph/nodes/ArgMinMaxLayerNode.cpp b/src/graph/nodes/ArgMinMaxLayerNode.cpp new file mode 100644 index 0000000000..63163b9e2c --- /dev/null +++ b/src/graph/nodes/ArgMinMaxLayerNode.cpp @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/graph/nodes/ArgMinMaxLayerNode.h" + +#include "arm_compute/graph/Graph.h" +#include "arm_compute/graph/INodeVisitor.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +namespace arm_compute +{ +namespace graph +{ +ArgMinMaxLayerNode::ArgMinMaxLayerNode(ReductionOperation op, unsigned int axis, DataType out_data_type, QuantizationInfo out_quant_info) + : _op(op), _axis(axis), _out_data_type(out_data_type), _out_quant_info(std::move(out_quant_info)) +{ + _input_edges.resize(1, EmptyEdgeID); + _outputs.resize(1, NullTensorID); +} + +ReductionOperation ArgMinMaxLayerNode::reduction_operation() const +{ + return _op; +} + +unsigned int ArgMinMaxLayerNode::axis() const +{ + return _axis; +} + +DataType ArgMinMaxLayerNode::out_data_type() const +{ + return _out_data_type; +} + +bool ArgMinMaxLayerNode::forward_descriptors() +{ + if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + { + Tensor *dst = output(0); + ARM_COMPUTE_ERROR_ON(dst == nullptr); + dst->desc() = configure_output(0); + return true; + } + return false; +} + +TensorDescriptor ArgMinMaxLayerNode::configure_output(size_t idx) const +{ + ARM_COMPUTE_UNUSED(idx); + ARM_COMPUTE_ERROR_ON(idx >= _outputs.size()); + + const Tensor *src = input(0); + ARM_COMPUTE_ERROR_ON(src == nullptr); + + TensorDescriptor output_info = src->desc(); + if(!_out_quant_info.empty()) + { + output_info.quant_info = _out_quant_info; + } + + if(_out_data_type != DataType::UNKNOWN) + { + output_info.data_type = _out_data_type; + } + + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(output_info.shape, _axis, false); + output_info.set_shape(output_shape); + + return output_info; +} + +NodeType ArgMinMaxLayerNode::type() const +{ + return ArgMinMaxLayerNode::node_type; +} + +void ArgMinMaxLayerNode::accept(INodeVisitor &v) +{ + v.visit(*this); +} +} // namespace graph +} // namespace arm_compute diff --git a/src/graph/nodes/DepthToSpaceLayerNode.cpp b/src/graph/nodes/DepthToSpaceLayerNode.cpp new file mode 100644 index 0000000000..b70ac56a07 --- /dev/null +++ b/src/graph/nodes/DepthToSpaceLayerNode.cpp @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/graph/nodes/DepthToSpaceLayerNode.h" + +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/graph/Graph.h" +#include "arm_compute/graph/INodeVisitor.h" + +namespace arm_compute +{ +namespace graph +{ +DepthToSpaceLayerNode::DepthToSpaceLayerNode(int block_shape) + : _block_shape(block_shape) +{ + _input_edges.resize(1, EmptyEdgeID); + _outputs.resize(1, NullTensorID); +} + +int DepthToSpaceLayerNode::block_shape() const +{ + return _block_shape; +} + +TensorDescriptor DepthToSpaceLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, int block_shape) +{ + using namespace arm_compute::helpers::tensor_transform; + + TensorShape input_shape = input_descriptor.shape; + DataLayout data_layout = input_descriptor.layout; + + // Set descriptor shape + TensorDescriptor output_descriptor = input_descriptor; + output_descriptor.shape = misc::shape_calculator::compute_depth_to_space_shape(input_shape, data_layout, block_shape); + + return output_descriptor; +} + +bool DepthToSpaceLayerNode::forward_descriptors() +{ + if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + { + Tensor *dst = output(0); + ARM_COMPUTE_ERROR_ON(dst == nullptr); + dst->desc() = configure_output(0); + return true; + } + return false; +} + +TensorDescriptor DepthToSpaceLayerNode::configure_output(size_t idx) const +{ + ARM_COMPUTE_UNUSED(idx); + ARM_COMPUTE_ERROR_ON(idx >= _outputs.size()); + + const Tensor *src = input(0); + ARM_COMPUTE_ERROR_ON(src == nullptr); + + TensorDescriptor output_desc = src->desc(); + + return compute_output_descriptor(output_desc, _block_shape); +} + +NodeType DepthToSpaceLayerNode::type() const +{ + return NodeType::DepthToSpaceLayer; +} + +void DepthToSpaceLayerNode::accept(INodeVisitor &v) +{ + v.visit(*this); +} +} // namespace graph +} // namespace arm_compute diff --git a/src/graph/nodes/EltwiseLayerNode.cpp b/src/graph/nodes/EltwiseLayerNode.cpp index 3149a9afef..4426e953ee 100644 --- a/src/graph/nodes/EltwiseLayerNode.cpp +++ b/src/graph/nodes/EltwiseLayerNode.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/graph/nodes/EltwiseLayerNode.h" +#include "arm_compute/core/TensorShape.h" #include "arm_compute/graph/Graph.h" #include "arm_compute/graph/INodeVisitor.h" @@ -69,7 +70,7 @@ void EltwiseLayerNode::set_fused_activation(ActivationLayerInfo fused_activation bool EltwiseLayerNode::forward_descriptors() { - if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID)) { Tensor *dst = output(0); ARM_COMPUTE_ERROR_ON(dst == nullptr); @@ -83,10 +84,18 @@ TensorDescriptor EltwiseLayerNode::configure_output(size_t idx) const { ARM_COMPUTE_UNUSED(idx); - const Tensor *src = input(0); - ARM_COMPUTE_ERROR_ON(src == nullptr); + const Tensor *src1 = input(0); + ARM_COMPUTE_ERROR_ON(src1 == nullptr); - auto output_info = src->desc(); + const Tensor *src2 = input(1); + ARM_COMPUTE_ERROR_ON(src2 == nullptr); + + auto output_info = src1->desc(); + + TensorShape out_shape = TensorShape::broadcast_shape(src1->desc().shape, src2->desc().shape); + ARM_COMPUTE_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + + output_info.set_shape(out_shape); if(!descriptor.out_quant_info.empty()) { diff --git a/src/graph/nodes/L2NormalizeLayerNode.cpp b/src/graph/nodes/L2NormalizeLayerNode.cpp new file mode 100644 index 0000000000..0c35a335fa --- /dev/null 
+++ b/src/graph/nodes/L2NormalizeLayerNode.cpp @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/graph/nodes/L2NormalizeLayerNode.h" + +#include "arm_compute/graph/Graph.h" +#include "arm_compute/graph/INodeVisitor.h" + +namespace arm_compute +{ +namespace graph +{ +L2NormalizeLayerNode::L2NormalizeLayerNode() + : L2NormalizeLayerNode(0, 1e-12f) +{ +} + +L2NormalizeLayerNode::L2NormalizeLayerNode(int axis) + : L2NormalizeLayerNode(axis, 1e-12f) +{ +} + +L2NormalizeLayerNode::L2NormalizeLayerNode(int axis, float epsilon) + : _axis(axis), _epsilon(epsilon) +{ + _input_edges.resize(1, EmptyEdgeID); + _outputs.resize(1, NullTensorID); +} + +bool L2NormalizeLayerNode::forward_descriptors() +{ + if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + { + Tensor *dst = output(0); + ARM_COMPUTE_ERROR_ON(dst == nullptr); + dst->desc() = configure_output(0); + return true; + } + return false; +} + +TensorDescriptor L2NormalizeLayerNode::configure_output(size_t idx) const +{ + ARM_COMPUTE_UNUSED(idx); + ARM_COMPUTE_ERROR_ON(idx >= _outputs.size()); + + const Tensor *src = input(0); + ARM_COMPUTE_ERROR_ON(src == nullptr); + + TensorDescriptor output_desc = src->desc(); + + return output_desc; +} + +int L2NormalizeLayerNode::axis() const +{ + return _axis; +} + +float L2NormalizeLayerNode::epsilon() const +{ + return _epsilon; +} + +NodeType L2NormalizeLayerNode::type() const +{ + return NodeType::L2NormalizeLayer; +} + +void L2NormalizeLayerNode::accept(INodeVisitor &v) +{ + v.visit(*this); +} +} // namespace graph +} // namespace arm_compute \ No newline at end of file diff --git a/src/graph/nodes/QuantizationLayerNode.cpp b/src/graph/nodes/QuantizationLayerNode.cpp index db70c2c312..08e2a4d961 100644 --- a/src/graph/nodes/QuantizationLayerNode.cpp +++ b/src/graph/nodes/QuantizationLayerNode.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -31,8 +31,15 @@ namespace arm_compute namespace graph { QuantizationLayerNode::QuantizationLayerNode(QuantizationInfo out_quant_info) - : _out_quant_info(std::move(out_quant_info)) + : QuantizationLayerNode(out_quant_info, DataType::QASYMM8) { +} + +QuantizationLayerNode::QuantizationLayerNode(QuantizationInfo out_quant_info, DataType out_data_type) + : _out_quant_info(std::move(out_quant_info)), _out_data_type(out_data_type) +{ + ARM_COMPUTE_ERROR_ON(!is_data_type_quantized(out_data_type)); + _input_edges.resize(1, EmptyEdgeID); _outputs.resize(1, NullTensorID); } @@ -58,7 +65,7 @@ TensorDescriptor QuantizationLayerNode::configure_output(size_t idx) const ARM_COMPUTE_ERROR_ON(src == nullptr); TensorDescriptor output_info = src->desc(); - output_info.data_type = DataType::QASYMM8; + output_info.data_type = _out_data_type; output_info.quant_info = _out_quant_info; return output_info; diff --git a/src/graph/nodes/ReductionLayerNode.cpp b/src/graph/nodes/ReductionLayerNode.cpp new file mode 100644 index 0000000000..0e93039894 --- /dev/null +++ b/src/graph/nodes/ReductionLayerNode.cpp @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/graph/nodes/ReductionLayerNode.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/graph/Graph.h" +#include "arm_compute/graph/INodeVisitor.h" +#include "arm_compute/graph/Utils.h" + +namespace arm_compute +{ +namespace graph +{ +ReductionLayerNode::ReductionLayerNode(ReductionOperation op, unsigned int axis, bool keep_dims) + : _op(op), _axis(axis), _keep_dims(keep_dims) +{ + _input_edges.resize(1, EmptyEdgeID); + _outputs.resize(1, NullTensorID); +} + +ReductionOperation ReductionLayerNode::op() const +{ + return _op; +} + +unsigned int ReductionLayerNode::axis() const +{ + return _axis; +} + +bool ReductionLayerNode::keep_dims() const +{ + return _keep_dims; +} + +bool ReductionLayerNode::forward_descriptors() +{ + if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + { + Tensor *dst = output(0); + ARM_COMPUTE_ERROR_ON(dst == nullptr); + dst->desc() = configure_output(0); + return true; + } + return false; +} + +TensorDescriptor ReductionLayerNode::configure_output(size_t idx) const +{ + ARM_COMPUTE_UNUSED(idx); + ARM_COMPUTE_ERROR_ON(idx >= _outputs.size()); + + const Tensor *src = input(0); + ARM_COMPUTE_ERROR_ON(src == nullptr); + + TensorDescriptor output_info = src->desc(); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(output_info.shape, _axis, _keep_dims); + output_info.set_shape(output_shape); + + return output_info; +} + +NodeType ReductionLayerNode::type() const +{ + return NodeType::ReductionOperationLayer; +} + +void ReductionLayerNode::accept(INodeVisitor &v) +{ + v.visit(*this); +} +} // namespace graph +} // namespace arm_compute \ No newline at end of file diff --git a/src/graph/nodes/SoftmaxLayerNode.cpp b/src/graph/nodes/SoftmaxLayerNode.cpp index fb907f4d23..031166993a 100644 --- a/src/graph/nodes/SoftmaxLayerNode.cpp +++ b/src/graph/nodes/SoftmaxLayerNode.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -64,7 +64,7 @@ TensorDescriptor SoftmaxLayerNode::configure_output(size_t idx) const ARM_COMPUTE_ERROR_ON(src == nullptr); TensorDescriptor out_desc = src->desc(); - out_desc.quant_info = QuantizationInfo(1.f / 256.f, 0); + out_desc.quant_info = get_softmax_output_quantization_info(out_desc.data_type, false); return out_desc; } diff --git a/src/graph/nodes/StridedSliceLayerNode.cpp b/src/graph/nodes/StridedSliceLayerNode.cpp new file mode 100644 index 0000000000..6a1a724bb3 --- /dev/null +++ b/src/graph/nodes/StridedSliceLayerNode.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/graph/nodes/StridedSliceLayerNode.h" + +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/graph/Graph.h" +#include "arm_compute/graph/INodeVisitor.h" + +namespace arm_compute +{ +namespace graph +{ +StridedSliceLayerNode::StridedSliceLayerNode(const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + StridedSliceLayerInfo info) + : _starts(starts), _ends(ends), _strides(strides), _info(std::move(info)) +{ + _input_edges.resize(1, EmptyEdgeID); + _outputs.resize(1, NullTensorID); +} + +Coordinates StridedSliceLayerNode::starts() const +{ + return _starts; +} + +Coordinates StridedSliceLayerNode::ends() const +{ + return _ends; +} + +BiStrides StridedSliceLayerNode::strides() const +{ + return _strides; +} + +StridedSliceLayerInfo StridedSliceLayerNode::strided_slice_info() const +{ + return _info; +} + +TensorDescriptor StridedSliceLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + StridedSliceLayerInfo info) +{ + using namespace arm_compute::helpers::tensor_transform; + + TensorDescriptor output_desc = input_descriptor; + output_desc.shape = compute_strided_slice_output_shape(input_descriptor.shape, starts, ends, strides, + info.begin_mask(), info.end_mask(), info.shrink_axis_mask()); + + return output_desc; +} + +bool StridedSliceLayerNode::forward_descriptors() +{ + if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID)) + { + Tensor *dst = output(0); + ARM_COMPUTE_ERROR_ON(dst == nullptr); + dst->desc() = configure_output(0); + return true; + } + return false; +} + +TensorDescriptor StridedSliceLayerNode::configure_output(size_t idx) const +{ + ARM_COMPUTE_UNUSED(idx); + ARM_COMPUTE_ERROR_ON(idx >= _outputs.size()); + + const Tensor *src = input(0); + ARM_COMPUTE_ERROR_ON(src == nullptr); + + return compute_output_descriptor(src->desc(), _starts, _ends, _strides, _info); +} + +NodeType StridedSliceLayerNode::type() const +{ + return NodeType::StridedSliceLayer; +} + +void StridedSliceLayerNode::accept(INodeVisitor &v) +{ + v.visit(*this); +} +} // namespace graph +} // namespace arm_compute diff --git a/src/runtime/CL/CLHelpers.cpp b/src/runtime/CL/CLHelpers.cpp index adfdc3c917..5f1842f76d 100644 --- a/src/runtime/CL/CLHelpers.cpp +++ b/src/runtime/CL/CLHelpers.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/Error.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLRuntimeContext.h" namespace diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp index efbc68f50e..a1743c56e6 100644 --- a/src/runtime/CL/CLMemory.cpp +++ b/src/runtime/CL/CLMemory.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #include "arm_compute/runtime/CL/CLMemory.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/utils/misc/Cast.h" +#include "support/Cast.h" namespace arm_compute { diff --git a/src/runtime/CL/CLOperator.cpp b/src/runtime/CL/CLOperator.cpp index 57a4d0ec57..075a544077 100644 --- a/src/runtime/CL/CLOperator.cpp +++ b/src/runtime/CL/CLOperator.cpp @@ -24,6 +24,8 @@ #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/ICLOperator.h" +#include "src/core/CL/ICLKernel.h" + namespace arm_compute { namespace experimental diff --git a/src/runtime/CL/CLRuntimeContext.cpp b/src/runtime/CL/CLRuntimeContext.cpp index 2fc7f93adf..571e30931c 100644 --- a/src/runtime/CL/CLRuntimeContext.cpp +++ b/src/runtime/CL/CLRuntimeContext.cpp @@ -26,6 +26,8 @@ #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "support/MemorySupport.h" + namespace arm_compute { CLRuntimeContext::CLRuntimeContext() diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp index ccef5cbd1b..6fc7baed63 100644 --- a/src/runtime/CL/CLScheduler.cpp +++ b/src/runtime/CL/CLScheduler.cpp @@ -24,10 +24,10 @@ #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLTuner.h" #include "arm_compute/runtime/CL/tuners/Tuners.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp index 90d77883f6..f37fc779fe 100644 --- a/src/runtime/CL/CLTensorAllocator.cpp +++ b/src/runtime/CL/CLTensorAllocator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,6 +28,8 @@ #include "arm_compute/runtime/CL/CLRuntimeContext.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "support/MemorySupport.h" + namespace arm_compute { const cl::Buffer CLTensorAllocator::_empty_buffer = cl::Buffer(); diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp index adfe67fb11..ed85e606cf 100644 --- a/src/runtime/CL/CLTuner.cpp +++ b/src/runtime/CL/CLTuner.cpp @@ -24,9 +24,9 @@ #include "arm_compute/runtime/CL/CLTuner.h" #include "arm_compute/runtime/CL/tuners/CLLWSList.h" -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/ICLKernel.h" #include "support/StringSupport.h" #include diff --git a/src/runtime/CL/ICLSimpleFunction.cpp b/src/runtime/CL/ICLSimpleFunction.cpp index b00ad5e74f..b075aa17e3 100644 --- a/src/runtime/CL/ICLSimpleFunction.cpp +++ b/src/runtime/CL/ICLSimpleFunction.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,19 +26,24 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/ICLKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "support/MemorySupport.h" using namespace arm_compute; ICLSimpleFunction::ICLSimpleFunction(CLRuntimeContext *ctx) // NOLINT : _kernel(), - _border_handler(), + _border_handler(support::cpp14::make_unique()), _ctx(ctx) { } +ICLSimpleFunction::~ICLSimpleFunction() = default; + void ICLSimpleFunction::run() { ARM_COMPUTE_ERROR_ON_MSG(!_kernel, "The child class didn't set the CL kernel or function isn't configured"); - schedule_kernel_on_ctx(_ctx, &_border_handler, false); + schedule_kernel_on_ctx(_ctx, _border_handler.get(), false); schedule_kernel_on_ctx(_ctx, _kernel.get()); } diff --git a/src/runtime/CL/functions/CLAbsoluteDifference.cpp b/src/runtime/CL/functions/CLAbsoluteDifference.cpp index d5d1bbdd7a..b7f40a516c 100644 --- a/src/runtime/CL/functions/CLAbsoluteDifference.cpp +++ b/src/runtime/CL/functions/CLAbsoluteDifference.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLAbsoluteDifference.h" -#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h" +#include "src/core/CL/kernels/CLAbsoluteDifferenceKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLAccumulate.cpp b/src/runtime/CL/functions/CLAccumulate.cpp index 2f06252446..742de64e34 100644 --- a/src/runtime/CL/functions/CLAccumulate.cpp +++ b/src/runtime/CL/functions/CLAccumulate.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLAccumulate.h" -#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h" +#include "src/core/CL/kernels/CLAccumulateKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp index 5ddf227382..61c82b33eb 100644 --- a/src/runtime/CL/functions/CLActivationLayer.cpp +++ b/src/runtime/CL/functions/CLActivationLayer.cpp @@ -24,9 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLActivationLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLRuntimeContext.h" +#include "src/core/CL/kernels/CLActivationLayerKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp index ad6e7ba97b..5fc849e3c5 100644 --- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp +++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp @@ -24,13 +24,16 @@ #include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/Utils.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/runtime/Utils.h" +#include "support/MemorySupport.h" namespace arm_compute { @@ -39,6 +42,8 @@ CLArgMinMaxLayer::CLArgMinMaxLayer(std::shared_ptr memory_manage { } +CLArgMinMaxLayer::~CLArgMinMaxLayer() = default; + Status CLArgMinMaxLayer::validate(const 
ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -47,7 +52,7 @@ Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITen ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid reduction operation"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast(TensorShape::num_max_dimensions), "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); - const unsigned int num_of_stages = calculate_number_of_stages_only_x_axis(input->dimension(0), axis); + const unsigned int num_of_stages = utils::calculate_number_of_stages_only_x_axis(input->dimension(0), axis); DataType output_data_type = DataType::S32; TensorInfo not_reshaped_output; @@ -115,7 +120,7 @@ void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *ou void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - _num_of_stages = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis); + _num_of_stages = utils::calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis); _reduction_axis = axis; const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); @@ -123,13 +128,19 @@ void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); // Configure reduction operation kernels - _reduction_kernels_vector.resize(_num_of_stages); + _reduction_kernels_vector.reserve(_num_of_stages); + + auto add_reduction_kernel = [this, &compile_context, axis, op](const ICLTensor * input, const ICLTensor * prev_output, ICLTensor * output) + { + _reduction_kernels_vector.emplace_back(support::cpp14::make_unique()); + _reduction_kernels_vector.back()->configure(compile_context, input, prev_output, output, axis, op); + }; _memory_group.manage(&_not_reshaped_output); // Create temporary tensors if(_num_of_stages == 1) { - _reduction_kernels_vector[0].configure(compile_context, input, nullptr, &_not_reshaped_output, axis, op); + add_reduction_kernel(input, nullptr, &_not_reshaped_output); } else { @@ -143,19 +154,19 @@ void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const // Apply ReductionOperation only on first kernel _memory_group.manage(&_results_vector[0]); - _reduction_kernels_vector[0].configure(compile_context, input, nullptr, &_results_vector[0], axis, op); + add_reduction_kernel(input, nullptr, &_results_vector[0]); // Apply ReductionOperation on intermediate stages for(unsigned int i = 1; i < _num_of_stages - 1; ++i) { _memory_group.manage(&_results_vector[i]); - _reduction_kernels_vector[i].configure(compile_context, input, &_results_vector[i - 1], &_results_vector[i], axis, op); + add_reduction_kernel(input, &_results_vector[i - 1], &_results_vector[i]); _results_vector[i - 1].allocator()->allocate(); } // Apply ReductionOperation on the last stage const unsigned int last_stage = _num_of_stages - 1; - _reduction_kernels_vector[last_stage].configure(compile_context, input, &_results_vector[last_stage - 1], &_not_reshaped_output, 
axis, op); + add_reduction_kernel(input, &_results_vector[last_stage - 1], &_not_reshaped_output); _results_vector[last_stage - 1].allocator()->allocate(); } _reshape.configure(compile_context, &_not_reshaped_output, output); @@ -168,8 +179,8 @@ void CLArgMinMaxLayer::run() for(unsigned int i = 0; i < _num_of_stages; ++i) { - CLScheduler::get().enqueue(_reduction_kernels_vector[i], false); + CLScheduler::get().enqueue(*_reduction_kernels_vector[i], false); } _reshape.run(); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp index 701add074e..77eed1140f 100644 --- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp @@ -29,14 +29,19 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "support/MemorySupport.h" -using namespace arm_compute; +#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h" +namespace arm_compute +{ CLBatchNormalizationLayer::CLBatchNormalizationLayer() - : _norm_kernel() + : _norm_kernel(support::cpp14::make_unique()) { } +CLBatchNormalizationLayer::~CLBatchNormalizationLayer() = default; + void CLBatchNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon, ActivationLayerInfo act_info) { @@ -47,7 +52,7 @@ void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_contex const ICLTensor *gamma, float epsilon, ActivationLayerInfo act_info) { - _norm_kernel.configure(compile_context, input, output, mean, var, beta, gamma, epsilon, act_info); + _norm_kernel->configure(compile_context, input, output, mean, var, beta, gamma, epsilon, act_info); } Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, @@ -60,5 +65,6 @@ Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, const ITens void CLBatchNormalizationLayer::run() { - CLScheduler::get().enqueue(_norm_kernel, true); + CLScheduler::get().enqueue(*_norm_kernel, true); } +} // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp index 5ba3b5bc9c..e0a2c430ed 100644 --- a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp +++ b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp @@ -30,13 +30,18 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -using namespace arm_compute; +#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h" +#include "support/MemorySupport.h" +namespace arm_compute +{ CLBatchToSpaceLayer::CLBatchToSpaceLayer() - : _batch_to_space_kernel() + : _batch_to_space_kernel(support::cpp14::make_unique()) { } +CLBatchToSpaceLayer::~CLBatchToSpaceLayer() = default; + void CLBatchToSpaceLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output); @@ -44,7 +49,7 @@ void CLBatchToSpaceLayer::configure(const ICLTensor *input, const ICLTensor *blo void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output) { - _batch_to_space_kernel.configure(compile_context, input, 
block_shape, output); + _batch_to_space_kernel->configure(compile_context, input, block_shape, output); } void CLBatchToSpaceLayer::configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output) @@ -54,7 +59,7 @@ void CLBatchToSpaceLayer::configure(const ICLTensor *input, int32_t block_shape_ void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output) { - _batch_to_space_kernel.configure(compile_context, input, block_shape_x, block_shape_y, output); + _batch_to_space_kernel->configure(compile_context, input, block_shape_x, block_shape_y, output); } Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) @@ -69,5 +74,6 @@ Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_sha void CLBatchToSpaceLayer::run() { - CLScheduler::get().enqueue(_batch_to_space_kernel, true); + CLScheduler::get().enqueue(*_batch_to_space_kernel, true); } +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp index cb49e61e84..cfcd63f170 100644 --- a/src/runtime/CL/functions/CLBitwiseAnd.cpp +++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h" -#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h" +#include "src/core/CL/kernels/CLBitwiseAndKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp index 22c575ca8d..588c793f6a 100644 --- a/src/runtime/CL/functions/CLBitwiseNot.cpp +++ b/src/runtime/CL/functions/CLBitwiseNot.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseNot.h" -#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h" +#include "src/core/CL/kernels/CLBitwiseNotKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp index 4bbb8909fe..3a5de193a3 100644 --- a/src/runtime/CL/functions/CLBitwiseOr.cpp +++ b/src/runtime/CL/functions/CLBitwiseOr.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseOr.h" -#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h" +#include "src/core/CL/kernels/CLBitwiseOrKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp index bc37f6eaab..62aeaaa31f 100644 --- a/src/runtime/CL/functions/CLBitwiseXor.cpp +++ b/src/runtime/CL/functions/CLBitwiseXor.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLBitwiseXor.h" -#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h" +#include "src/core/CL/kernels/CLBitwiseXorKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp index 2384fc4132..600d36290c 100644 --- a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp +++ b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h" -#include "arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h" +#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git 
a/src/runtime/CL/functions/CLBox3x3.cpp b/src/runtime/CL/functions/CLBox3x3.cpp index 0300899b59..be40f25055 100644 --- a/src/runtime/CL/functions/CLBox3x3.cpp +++ b/src/runtime/CL/functions/CLBox3x3.cpp @@ -23,8 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLBox3x3.h" -#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLBox3x3Kernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "support/MemorySupport.h" #include @@ -41,5 +42,5 @@ void CLBox3x3::configure(const CLCompileContext &compile_context, ICLTensor *inp auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp index cd2d6b478a..5a32564d2d 100644 --- a/src/runtime/CL/functions/CLCannyEdge.cpp +++ b/src/runtime/CL/functions/CLCannyEdge.cpp @@ -31,6 +31,10 @@ #include "arm_compute/runtime/CL/functions/CLSobel3x3.h" #include "arm_compute/runtime/CL/functions/CLSobel5x5.h" #include "arm_compute/runtime/CL/functions/CLSobel7x7.h" +#include "src/core/CL/kernels/CLCannyEdgeKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLSobel5x5Kernel.h" +#include "src/core/CL/kernels/CLSobel7x7Kernel.h" #include "support/MemorySupport.h" using namespace arm_compute; @@ -38,10 +42,10 @@ using namespace arm_compute; CLCannyEdge::CLCannyEdge(std::shared_ptr memory_manager) // NOLINT : _memory_group(std::move(memory_manager)), _sobel(), - _gradient(), - _border_mag_gradient(), - _non_max_suppr(), - _edge_trace(), + _gradient(support::cpp14::make_unique()), + _border_mag_gradient(support::cpp14::make_unique()), + _non_max_suppr(support::cpp14::make_unique()), + _edge_trace(support::cpp14::make_unique()), _gx(), _gy(), _mag(), @@ -55,6 +59,8 @@ CLCannyEdge::CLCannyEdge(std::shared_ptr memory_manager) // NOLI { } +CLCannyEdge::~CLCannyEdge() = default; + void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value) { @@ -143,7 +149,7 @@ void CLCannyEdge::configure(const CLCompileContext &compile_context, ICLTensor * _memory_group.manage(&_phase); // Configure gradient - _gradient.configure(compile_context, &_gx, &_gy, &_mag, &_phase, norm_type); + _gradient->configure(compile_context, &_gx, &_gy, &_mag, &_phase, norm_type); // Allocate intermediate buffers _gx.allocator()->allocate(); @@ -153,14 +159,14 @@ void CLCannyEdge::configure(const CLCompileContext &compile_context, ICLTensor * _memory_group.manage(&_nonmax); // Configure non-maxima suppression - _non_max_suppr.configure(compile_context, &_mag, &_phase, &_nonmax, lower_thr, border_mode == BorderMode::UNDEFINED); + _non_max_suppr->configure(compile_context, &_mag, &_phase, &_nonmax, lower_thr, border_mode == BorderMode::UNDEFINED); // Allocate intermediate buffers _phase.allocator()->allocate(); // Fill border around magnitude image as non-maxima suppression will access // it. If border mode is undefined filling the border is a nop. 
- _border_mag_gradient.configure(compile_context, &_mag, _non_max_suppr.border_size(), border_mode, constant_border_value); + _border_mag_gradient->configure(compile_context, &_mag, _non_max_suppr->border_size(), border_mode, constant_border_value); // Allocate intermediate buffers _mag.allocator()->allocate(); @@ -172,7 +178,7 @@ void CLCannyEdge::configure(const CLCompileContext &compile_context, ICLTensor * _memory_group.manage(&_l1_list_counter); // Configure edge tracing - _edge_trace.configure(compile_context, &_nonmax, output, upper_thr, lower_thr, &_visited, &_recorded, &_l1_stack, &_l1_list_counter); + _edge_trace->configure(compile_context, &_nonmax, output, upper_thr, lower_thr, &_visited, &_recorded, &_l1_stack, &_l1_list_counter); // Allocate intermediate buffers _visited.allocator()->allocate(); @@ -190,14 +196,14 @@ void CLCannyEdge::run() _sobel->run(); // Run phase and magnitude calculation - CLScheduler::get().enqueue(_gradient, false); + CLScheduler::get().enqueue(*_gradient, false); // Fill border before non-maxima suppression. Nop for border mode undefined. - CLScheduler::get().enqueue(_border_mag_gradient, false); + CLScheduler::get().enqueue(*_border_mag_gradient, false); // Run non max suppresion _nonmax.clear(CLScheduler::get().queue()); - CLScheduler::get().enqueue(_non_max_suppr, false); + CLScheduler::get().enqueue(*_non_max_suppr, false); // Clear temporary structures and run edge trace _output->clear(CLScheduler::get().queue()); @@ -205,5 +211,5 @@ void CLCannyEdge::run() _recorded.clear(CLScheduler::get().queue()); _l1_list_counter.clear(CLScheduler::get().queue()); _l1_stack.clear(CLScheduler::get().queue()); - CLScheduler::get().enqueue(_edge_trace, true); + CLScheduler::get().enqueue(*_edge_trace, true); } diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp index 95cc0e9239..2a28e06845 100644 --- a/src/runtime/CL/functions/CLCast.cpp +++ b/src/runtime/CL/functions/CLCast.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLCast.h" -#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLChannelCombine.cpp b/src/runtime/CL/functions/CLChannelCombine.cpp index 326caa8c74..e93aea31f4 100644 --- a/src/runtime/CL/functions/CLChannelCombine.cpp +++ b/src/runtime/CL/functions/CLChannelCombine.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLChannelCombine.h" -#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h" +#include "src/core/CL/kernels/CLChannelCombineKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLChannelExtract.cpp b/src/runtime/CL/functions/CLChannelExtract.cpp index aa37af9988..8b4a3f7458 100644 --- a/src/runtime/CL/functions/CLChannelExtract.cpp +++ b/src/runtime/CL/functions/CLChannelExtract.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLChannelExtract.h" -#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h" +#include "src/core/CL/kernels/CLChannelExtractKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp index b79afdb3b4..c443df3b37 100644 --- a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp +++ b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp @@ -23,8 +23,8 @@ */ #include 
"arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h" -#include "arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLColorConvert.cpp b/src/runtime/CL/functions/CLColorConvert.cpp index 2bbb30e24c..95f4257929 100644 --- a/src/runtime/CL/functions/CLColorConvert.cpp +++ b/src/runtime/CL/functions/CLColorConvert.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLColorConvert.h" -#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h" +#include "src/core/CL/kernels/CLColorConvertKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp index 8c18b35583..9b5840aa95 100644 --- a/src/runtime/CL/functions/CLComparison.cpp +++ b/src/runtime/CL/functions/CLComparison.cpp @@ -24,8 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLComparison.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLComparisonKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLComparisonKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "support/MemorySupport.h" namespace arm_compute @@ -47,7 +48,7 @@ void CLComparison::configure(const CLCompileContext &compile_context, ICLTensor if(broadcasted_info->info()->dimension(0) == 1) { - _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); } } } @@ -76,7 +77,7 @@ void CLComparisonStatic::configure(const CLCompileContext &compile_context, if(broadcasted_info->info()->dimension(0) == 1) { - _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); } } } diff --git a/src/runtime/CL/functions/CLComputeAllAnchors.cpp b/src/runtime/CL/functions/CLComputeAllAnchors.cpp index be86fc4f78..2cae0ee455 100644 --- a/src/runtime/CL/functions/CLComputeAllAnchors.cpp +++ b/src/runtime/CL/functions/CLComputeAllAnchors.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLComputeAllAnchors.h" +#include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h" #include "support/MemorySupport.h" diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp index 4214813446..54f71f9765 100644 --- a/src/runtime/CL/functions/CLConcatenateLayer.cpp +++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp @@ -23,19 +23,20 @@ */ #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h" -#include "arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h" -#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h" -#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h" -#include "arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLDepthConcatenateLayerKernel.h" +#include "src/core/CL/kernels/CLHeightConcatenateLayerKernel.h" +#include "src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h" +#include "src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h" +#include "src/core/CL/kernels/CLWidthConcatenateLayerKernel.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLBatchConcatenateLayerKernel.h" +#include "src/core/helpers/AutoConfiguration.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp index 4c787673b5..8ecc114343 100644 --- a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp +++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp @@ -22,6 +22,10 @@ * SOFTWARE. 
*/ #include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h" +#include "src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" + +#include "support/MemorySupport.h" namespace arm_compute { diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp index bc962d0052..1ad32d309c 100644 --- a/src/runtime/CL/functions/CLConvolution.cpp +++ b/src/runtime/CL/functions/CLConvolution.cpp @@ -24,7 +24,6 @@ #include "arm_compute/runtime/CL/functions/CLConvolution.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" @@ -32,6 +31,8 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/ITensorAllocator.h" +#include "src/core/CL/kernels/CLConvolutionKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "support/MemorySupport.h" #include @@ -49,15 +50,20 @@ void CLConvolution3x3::configure(const CLCompileContext &compile_context, ICLTen auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } template CLConvolutionSquare::CLConvolutionSquare(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler() + : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(support::cpp14::make_unique>()), + _kernel_vert(support::cpp14::make_unique>()), _kernel(support::cpp14::make_unique>()), + _border_handler(support::cpp14::make_unique()) { } +template +CLConvolutionSquare::~CLConvolutionSquare() = default; + template void CLConvolutionSquare::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) @@ -88,35 +94,35 @@ void CLConvolutionSquare::configure(const CLCompileContext &compile scale = calculate_matrix_scale(conv, matrix_size); } - _kernel_hor.configure(compile_context, input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED); - _kernel_vert.configure(compile_context, &_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED, type_pair.second); - _border_handler.configure(compile_context, input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value)); + _kernel_hor->configure(compile_context, input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED); + _kernel_vert->configure(compile_context, &_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED, type_pair.second); + _border_handler->configure(compile_context, input, _kernel_hor->border_size(), border_mode, PixelValue(constant_border_value)); // Allocate intermediate buffer _tmp.allocator()->allocate(); } else { - _kernel.configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED); - _border_handler.configure(compile_context, input, _kernel.border_size(), border_mode, 
PixelValue(constant_border_value)); + _kernel->configure(compile_context, input, output, conv, scale, border_mode == BorderMode::UNDEFINED); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } } template void CLConvolutionSquare::run() { - CLScheduler::get().enqueue(_border_handler); + CLScheduler::get().enqueue(*_border_handler); if(_is_separable) { MemoryGroupResourceScope scope_mg(_memory_group); - CLScheduler::get().enqueue(_kernel_hor, false); - CLScheduler::get().enqueue(_kernel_vert); + CLScheduler::get().enqueue(*_kernel_hor, false); + CLScheduler::get().enqueue(*_kernel_vert); } else { - CLScheduler::get().enqueue(_kernel); + CLScheduler::get().enqueue(*_kernel); } } @@ -135,5 +141,5 @@ void CLConvolutionRectangle::configure(const CLCompileContext &compile_context, auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp index 630352e4e6..e214bdf0f2 100644 --- a/src/runtime/CL/functions/CLConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "support/MemorySupport.h" #include #include @@ -43,6 +44,8 @@ CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr memory_ma { } +CLConvolutionLayer::~CLConvolutionLayer() = default; + void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) { diff --git a/src/runtime/CL/functions/CLCopy.cpp b/src/runtime/CL/functions/CLCopy.cpp index acdc52d4f7..f7b016a779 100644 --- a/src/runtime/CL/functions/CLCopy.cpp +++ b/src/runtime/CL/functions/CLCopy.cpp @@ -24,11 +24,11 @@ #include "arm_compute/runtime/CL/functions/CLCopy.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLCopyKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/CL/kernels/CLCopyKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp index 529f7bfb3e..4aaa674c5c 100644 --- a/src/runtime/CL/functions/CLCropResize.cpp +++ b/src/runtime/CL/functions/CLCropResize.cpp @@ -25,6 +25,14 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLCopyKernel.h" +#include "src/core/CL/kernels/CLCropKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLMemsetKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "support/MemorySupport.h" #include @@ -57,6 +65,8 
@@ CLCropResize::CLCropResize() { } +CLCropResize::~CLCropResize() = default; + Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITensorInfo *box_ind, const ITensorInfo *output, Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value) { @@ -142,7 +152,7 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT win.set(3, Window::Dimension(num_box, num_box + 1, 1)); auto copy_kernel = support::cpp14::make_unique(); - copy_kernel->configure(compile_context, _scaled_results[num_box].get(), _output, PaddingList(), &win); + copy_kernel->configure(compile_context, _scaled_results[num_box].get(), _output, &win); _copy.emplace_back(std::move(copy_kernel)); _crop_results[num_box]->allocator()->allocate(); diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp index cd55336d9a..6fe231ea6c 100644 --- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "support/MemorySupport.h" #include #include diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp index eb1fb7fbdf..0cf2ea623f 100644 --- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp +++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp @@ -27,16 +27,21 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTensor.h" +#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" +#include "src/core/CL/kernels/CLMemsetKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { CLDeconvolutionLayerUpsample::CLDeconvolutionLayerUpsample() // NOLINT - : _upsample(), - _memset(), + : _upsample(support::cpp14::make_unique()), + _memset(support::cpp14::make_unique()), _output(nullptr) { } +CLDeconvolutionLayerUpsample::~CLDeconvolutionLayerUpsample() = default; + Status CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info) { return CLDeconvolutionLayerUpsampleKernel::validate(input, output, info); @@ -52,13 +57,13 @@ void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_con ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); _output = output; - _memset.configure(compile_context, _output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info())); - _upsample.configure(compile_context, input, _output, info); + _memset->configure(compile_context, _output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info())); + _upsample->configure(compile_context, input, _output, info); } void CLDeconvolutionLayerUpsample::run() { - CLScheduler::get().enqueue(_memset, false); - CLScheduler::get().enqueue(_upsample, true); + CLScheduler::get().enqueue(*_memset, false); + CLScheduler::get().enqueue(*_upsample, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp index 141eb3fefc..e58c0e5f4c 100644 --- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp +++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h" 
-#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp index 8571056104..8dbd974ceb 100644 --- a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp +++ b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h" -#include "arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h" +#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp index bb0db2e7a7..2440384e3b 100644 --- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp @@ -24,13 +24,19 @@ #include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h" +#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h" #include "support/MemorySupport.h" namespace arm_compute @@ -119,7 +125,7 @@ Status validate_arguments_3x3(const ITensorInfo *input, const ITensorInfo *weigh CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::CLDepthwiseConvolutionLayerGeneric(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), - _dwc_native_kernel(), + _dwc_native_kernel(support::cpp14::make_unique()), _permute_input_to_nhwc(), _permute_weights_to_nhwc(), _permute_output_to_nchw(), @@ -137,6 +143,8 @@ CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::CLDepthwiseConv { } +CLDepthwiseConvolutionLayer::~CLDepthwiseConvolutionLayer() = default; + void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) { @@ -206,9 +214,9 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure( dwc_weights_info.n0 = (depth_multiplier == 1) ? 
8 : 1; DWCKernelInfo dwc_info; dwc_info.activation_info = act_info; - _dwc_native_kernel.configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, - dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, - output_multipliers_to_use, output_shifts_to_use); + _dwc_native_kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, + dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, + output_multipliers_to_use, output_shifts_to_use); if(_needs_permute) { @@ -302,7 +310,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::run() { _permute_input_to_nhwc.run(); } - CLScheduler::get().enqueue(_dwc_native_kernel); + CLScheduler::get().enqueue(*_dwc_native_kernel); if(_needs_permute) { _permute_output_to_nchw.run(); @@ -343,11 +351,11 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::prepare() CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::CLDepthwiseConvolutionLayerInternal3x3(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _kernel(nullptr), - _border_handler(), + _border_handler(support::cpp14::make_unique()), _permute_input_to_nchw(), _permute_weights_to_nchw(), _permute_output_to_nhwc(), - _reshape_weights(), + _reshape_weights(support::cpp14::make_unique()), _permuted_input(), _permuted_weights(), _permuted_output(), @@ -378,14 +386,14 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayerInternal3x3::validate(input->info(), - weights->info(), - biases != nullptr ? biases->info() : nullptr, - output->info(), - conv_info, - depth_multiplier, - act_info, - gpu_target, - dilation)); + weights->info(), + biases != nullptr ? 
biases->info() : nullptr, + output->info(), + conv_info, + depth_multiplier, + act_info, + gpu_target, + dilation)); const bool is_nhwc = input->info()->data_layout() == DataLayout::NHWC; _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); @@ -434,7 +442,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config { if(_needs_weights_reshape) { - _reshape_weights.configure(compile_context, weights, &_permuted_weights, info); + _reshape_weights->configure(compile_context, weights, &_permuted_weights, info); weights_to_use = &_permuted_weights; } _kernel = arm_compute::support::cpp14::make_unique(); @@ -486,7 +494,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::config { zero_value = PixelValue(static_cast(input->info()->quantization_info().uniform().offset)); } - _border_handler.configure(compile_context, input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value); + _border_handler->configure(compile_context, input_to_use, _kernel->border_size(), BorderMode::CONSTANT, zero_value); } Status CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, @@ -505,7 +513,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::run() { _permute_input_to_nchw.run(); } - CLScheduler::get().enqueue(_border_handler); + CLScheduler::get().enqueue(*_border_handler); CLScheduler::get().enqueue(*_kernel); if(_needs_permute) @@ -547,7 +555,7 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::prepar ARM_COMPUTE_ERROR_ON(_needs_permute); ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); _permuted_weights.allocator()->allocate(); - CLScheduler::get().enqueue(_reshape_weights); + CLScheduler::get().enqueue(*_reshape_weights); _original_weights->mark_as_unused(); } _is_prepared = true; @@ -567,7 +575,7 @@ void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *w void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, + unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation) { const GPUTarget gpu_target = CLScheduler::get().target(); diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp index 66ac58ef95..6d63463906 100644 --- a/src/runtime/CL/functions/CLDequantizationLayer.cpp +++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h" -#include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h" +#include "src/core/CL/kernels/CLDequantizationLayerKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLDerivative.cpp b/src/runtime/CL/functions/CLDerivative.cpp index 7138281f87..a2b883ad28 100644 --- a/src/runtime/CL/functions/CLDerivative.cpp +++ b/src/runtime/CL/functions/CLDerivative.cpp @@ -23,8 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLDerivative.h" -#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLDerivativeKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "support/MemorySupport.h" 
#include @@ -41,5 +42,5 @@ void CLDerivative::configure(const CLCompileContext &compile_context, ICLTensor auto k = arm_compute::support::cpp14::make_unique<CLDerivativeKernel>(); k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLDilate.cpp b/src/runtime/CL/functions/CLDilate.cpp index 27acf9f7cc..c3d5f8845f 100644 --- a/src/runtime/CL/functions/CLDilate.cpp +++ b/src/runtime/CL/functions/CLDilate.cpp @@ -23,8 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLDilate.h" -#include "arm_compute/core/CL/kernels/CLDilateKernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLDilateKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "support/MemorySupport.h" #include @@ -41,5 +42,5 @@ void CLDilate::configure(const CLCompileContext &compile_context, ICLTensor *inp auto k = arm_compute::support::cpp14::make_unique<CLDilateKernel>(); k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp index c1055dda36..bff882c28b 100644 --- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp @@ -24,33 +24,38 @@ #include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLDirectConvolutionLayerKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "support/MemorySupport.h" using namespace arm_compute; CLDirectConvolutionLayer::CLDirectConvolutionLayer() - : _direct_conv_kernel(), _input_border_handler(), _activationlayer_function(), _is_activationlayer_enabled(false) + : _direct_conv_kernel(support::cpp14::make_unique<CLDirectConvolutionLayerKernel>()), _input_border_handler(support::cpp14::make_unique<CLFillBorderKernel>()), _activationlayer_function(), + _is_activationlayer_enabled(false) { } +CLDirectConvolutionLayer::~CLDirectConvolutionLayer() = default; + void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info); } void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &conv_info, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { // Set GPU target - _direct_conv_kernel.set_target(CLScheduler::get().target()); + _direct_conv_kernel->set_target(CLScheduler::get().target()); // Configure direct 
convolution - _direct_conv_kernel.configure(compile_context, input, weights, biases, output, conv_info); + _direct_conv_kernel->configure(compile_context, input, weights, biases, output, conv_info); // Configure border handler PixelValue &&zero_value(0.f); @@ -58,10 +63,10 @@ void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context { zero_value = PixelValue(0, input->info()->data_type(), input->info()->quantization_info()); } - _input_border_handler.configure(compile_context, input, _direct_conv_kernel.border_size(), BorderMode::CONSTANT, zero_value); + _input_border_handler->configure(compile_context, input, _direct_conv_kernel->border_size(), BorderMode::CONSTANT, zero_value); // Tune kernels - CLScheduler::get().tune_kernel_static(_direct_conv_kernel); + CLScheduler::get().tune_kernel_static(*_direct_conv_kernel); _is_activationlayer_enabled = act_info.enabled(); @@ -86,10 +91,10 @@ Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITenso void CLDirectConvolutionLayer::run() { // Run border handler - CLScheduler::get().enqueue(_input_border_handler, false); + CLScheduler::get().enqueue(*_input_border_handler, false); // Run direct convolution - CLScheduler::get().enqueue(_direct_conv_kernel); + CLScheduler::get().enqueue(*_direct_conv_kernel); //Run Activation Layer if(_is_activationlayer_enabled) diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp index 3515c25d82..0e3109439e 100644 --- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp @@ -23,11 +23,18 @@ */ #include "arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "src/core/CL/kernels/CLMemsetKernel.h" +#include "src/core/CL/kernels/CLWeightsReshapeKernel.h" +#include "src/core/helpers/AutoConfiguration.h" #include #include diff --git a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp index de94255b48..35ed97d381 100644 --- a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp +++ b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h" -#include "arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h" +#include "src/core/CL/kernels/CLElementWiseUnaryLayerKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp index 6f664725c5..736cf973a1 100644 --- a/src/runtime/CL/functions/CLElementwiseOperations.cpp +++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp @@ -24,51 +24,17 @@ #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLElementwiseOperationKernel.h" #include "support/MemorySupport.h" 
#include namespace arm_compute { -namespace -{ -void configure_border_handler(const CLCompileContext &compile_context, CLFillBorderKernel &border_handler, BorderSize border_size, ITensorInfo *input1, ITensorInfo *input2, const ITensorInfo *output) -{ - if(output->dimension(0) > 1) - { - ITensorInfo *broadcasted_info = (input1->dimension(0) == 1) ? input1 : input2; - - if(broadcasted_info->dimension(0) == 1) - { - border_handler.configure(compile_context, broadcasted_info, border_size, BorderMode::REPLICATE); - } - } -} - -ITensorPack select_border_input(ITensorPack &tensors) -{ - ITensorPack pack; - if(tensors.get_tensor(TensorType::ACL_DST)->info()->dimension(0) > 1) - { - if(tensors.get_const_tensor(TensorType::ACL_SRC_1)->info()->dimension(0) == 1) - { - pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_1)); - } - else - { - pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_0)); - } - } - return pack; -} -} // namespace - namespace experimental { CLArithmeticAddition::CLArithmeticAddition() - : _border_handler() { } @@ -77,7 +43,6 @@ void CLArithmeticAddition::configure(const CLCompileContext &compile_context, IT auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, ArithmeticOperation::ADD, input1, input2, output, policy, act_info); _kernel = std::move(k); - configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) @@ -87,13 +52,10 @@ Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorIn void CLArithmeticAddition::run(ITensorPack &tensors) { - auto border_pack = select_border_input(tensors); - CLScheduler::get().enqueue_op(_border_handler, border_pack); ICLOperator::run(tensors); } CLArithmeticSubtraction::CLArithmeticSubtraction() - : _border_handler() { } void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, @@ -102,7 +64,6 @@ void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, ArithmeticOperation::SUB, input1, input2, output, policy, act_info); _kernel = std::move(k); - configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) @@ -113,13 +74,10 @@ Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITenso void CLArithmeticSubtraction::run(ITensorPack &tensors) { - auto border_pack = select_border_input(tensors); - CLScheduler::get().enqueue_op(_border_handler, border_pack); ICLOperator::run(tensors); } CLArithmeticDivision::CLArithmeticDivision() - : _border_handler() { } @@ -128,7 +86,6 @@ void CLArithmeticDivision::configure(const CLCompileContext &compile_context, IT auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, ArithmeticOperation::DIV, input1, input2, output, act_info); _kernel = std::move(k); - configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status 
CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) @@ -138,13 +95,10 @@ Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorIn void CLArithmeticDivision::run(ITensorPack &tensors) { - auto border_pack = select_border_input(tensors); - CLScheduler::get().enqueue_op(_border_handler, border_pack); ICLOperator::run(tensors); } CLElementwiseMax::CLElementwiseMax() - : _border_handler() { } @@ -153,7 +107,6 @@ void CLElementwiseMax::configure(const CLCompileContext &compile_context, ITenso auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, ArithmeticOperation::MAX, input1, input2, output, act_info); _kernel = std::move(k); - configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) @@ -163,13 +116,10 @@ Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo * void CLElementwiseMax::run(ITensorPack &tensors) { - auto border_pack = select_border_input(tensors); - CLScheduler::get().enqueue_op(_border_handler, border_pack); ICLOperator::run(tensors); } CLElementwiseMin::CLElementwiseMin() - : _border_handler() { } @@ -178,7 +128,6 @@ void CLElementwiseMin::configure(const CLCompileContext &compile_context, ITenso auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, ArithmeticOperation::MIN, input1, input2, output, act_info); _kernel = std::move(k); - configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) @@ -188,13 +137,10 @@ Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo * void CLElementwiseMin::run(ITensorPack &tensors) { - auto border_pack = select_border_input(tensors); - CLScheduler::get().enqueue_op(_border_handler, border_pack); ICLOperator::run(tensors); } CLElementwiseSquaredDiff::CLElementwiseSquaredDiff() - : _border_handler() { } @@ -203,7 +149,6 @@ void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, ArithmeticOperation::SQUARED_DIFF, input1, input2, output, act_info); _kernel = std::move(k); - configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input1, input2, output); } Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) @@ -213,13 +158,10 @@ Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITens void CLElementwiseSquaredDiff::run(ITensorPack &tensors) { - auto border_pack = select_border_input(tensors); - CLScheduler::get().enqueue_op(_border_handler, border_pack); ICLOperator::run(tensors); } CLElementwisePower::CLElementwisePower() - : _border_handler() { } @@ -228,7 +170,6 @@ void CLElementwisePower::configure(const CLCompileContext &compile_context, ITen auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, ArithmeticOperation::POWER, input1, input2, output, act_info); _kernel = std::move(k); - configure_border_handler(compile_context, 
_border_handler, _kernel->border_size(), input1, input2, output); } Status CLElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) @@ -238,8 +179,6 @@ Status CLElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo void CLElementwisePower::run(ITensorPack &tensors) { - auto border_pack = select_border_input(tensors); - CLScheduler::get().enqueue_op(_border_handler, border_pack); ICLOperator::run(tensors); } } // namespace experimental @@ -477,7 +416,6 @@ struct CLElementwiseSquaredDiff::Impl const ICLTensor *src_1{ nullptr }; ICLTensor *dst{ nullptr }; std::unique_ptr<experimental::CLElementwiseSquaredDiff> op{ nullptr }; - std::unique_ptr<CLFillBorderKernel> _border_handler{ nullptr }; }; CLElementwiseSquaredDiff::CLElementwiseSquaredDiff() diff --git a/src/runtime/CL/functions/CLEqualizeHistogram.cpp b/src/runtime/CL/functions/CLEqualizeHistogram.cpp index a1158a71a5..cc927a055b 100644 --- a/src/runtime/CL/functions/CLEqualizeHistogram.cpp +++ b/src/runtime/CL/functions/CLEqualizeHistogram.cpp @@ -28,6 +28,9 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLHistogramKernel.h" +#include "src/core/CL/kernels/CLTableLookupKernel.h" +#include "support/MemorySupport.h" #include #include @@ -83,10 +86,17 @@ void calculate_cum_dist_and_lut(CLDistribution1D &dist, CLDistribution1D &cum_di } // namespace CLEqualizeHistogram::CLEqualizeHistogram() - : _histogram_kernel(), _border_histogram_kernel(), _map_histogram_kernel(), _hist(nr_bins, 0, max_range), _cum_dist(nr_bins, 0, max_range), _cd_lut(nr_bins, DataType::U8) + : _histogram_kernel(support::cpp14::make_unique<CLHistogramKernel>()), + _border_histogram_kernel(support::cpp14::make_unique<CLHistogramBorderKernel>()), + _map_histogram_kernel(support::cpp14::make_unique<CLTableLookupKernel>()), + _hist(nr_bins, 0, max_range), + _cum_dist(nr_bins, 0, max_range), + _cd_lut(nr_bins, DataType::U8) { } +CLEqualizeHistogram::~CLEqualizeHistogram() = default; + void CLEqualizeHistogram::configure(const ICLImage *input, ICLImage *output) { configure(CLKernelLibrary::get().get_compile_context(), input, output); @@ -94,22 +104,22 @@ void CLEqualizeHistogram::configure(const ICLImage *input, ICLImage *output) void CLEqualizeHistogram::configure(const CLCompileContext &compile_context, const ICLImage *input, ICLImage *output) { - _histogram_kernel.configure(compile_context, input, &_hist); - _border_histogram_kernel.configure(compile_context, input, &_hist); - _map_histogram_kernel.configure(compile_context, input, &_cd_lut, output); + _histogram_kernel->configure(compile_context, input, &_hist); + _border_histogram_kernel->configure(compile_context, input, &_hist); + _map_histogram_kernel->configure(compile_context, input, &_cd_lut, output); } void CLEqualizeHistogram::run() { // Calculate histogram of input. - CLScheduler::get().enqueue(_histogram_kernel, false); + CLScheduler::get().enqueue(*_histogram_kernel, false); // Calculate remaining pixels when image is not multiple of the elements of histogram kernel - CLScheduler::get().enqueue(_border_histogram_kernel, false); + CLScheduler::get().enqueue(*_border_histogram_kernel, false); // Calculate cumulative distribution of histogram and create LUT. calculate_cum_dist_and_lut(_hist, _cum_dist, _cd_lut); // Map input to output using created LUT. 
- CLScheduler::get().enqueue(_map_histogram_kernel); + CLScheduler::get().enqueue(*_map_histogram_kernel); } diff --git a/src/runtime/CL/functions/CLErode.cpp b/src/runtime/CL/functions/CLErode.cpp index 5236f620f1..6880c4845a 100644 --- a/src/runtime/CL/functions/CLErode.cpp +++ b/src/runtime/CL/functions/CLErode.cpp @@ -23,8 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLErode.h" -#include "arm_compute/core/CL/kernels/CLErodeKernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLErodeKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "support/MemorySupport.h" #include @@ -41,5 +42,5 @@ void CLErode::configure(const CLCompileContext &compile_context, ICLTensor *inpu auto k = arm_compute::support::cpp14::make_unique<CLErodeKernel>(); k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, BorderSize(1), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp index 7d15d33ab5..a0078689ff 100644 --- a/src/runtime/CL/functions/CLFFT1D.cpp +++ b/src/runtime/CL/functions/CLFFT1D.cpp @@ -25,16 +25,29 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/helpers/fft.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" +#include "src/core/CL/kernels/CLFFTRadixStageKernel.h" +#include "src/core/CL/kernels/CLFFTScaleKernel.h" +#include "src/core/utils/helpers/fft.h" +#include "support/MemorySupport.h" namespace arm_compute { CLFFT1D::CLFFT1D(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _run_scale(false) + : _memory_group(std::move(memory_manager)), + _digit_reverse_kernel(support::cpp14::make_unique<CLFFTDigitReverseKernel>()), + _fft_kernels(), + _scale_kernel(support::cpp14::make_unique<CLFFTScaleKernel>()), + _digit_reversed_input(), + _digit_reverse_indices(), + _num_ffts(0), + _run_scale(false) { } +CLFFT1D::~CLFFT1D() = default; + void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config) { configure(CLKernelLibrary::get().get_compile_context(), input, output, config); @@ -62,12 +75,12 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32); _digit_reverse_indices.allocator()->init(digit_reverse_indices_info); _memory_group.manage(&_digit_reversed_input); - _digit_reverse_kernel.configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config); + _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config); // Create and configure FFT kernels unsigned int Nx = 1; _num_ffts = decomposed_vector.size(); - _fft_kernels.resize(_num_ffts); + _fft_kernels.reserve(_num_ffts); for(unsigned int i = 0; i < _num_ffts; ++i) { const unsigned int radix_for_stage = decomposed_vector.at(i); @@ -77,7 +90,8 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor fft_kernel_info.radix = radix_for_stage; fft_kernel_info.Nx = Nx; 
fft_kernel_info.is_first_stage = (i == 0); - _fft_kernels[i].configure(compile_context, &_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info); + _fft_kernels.emplace_back(support::cpp14::make_unique<CLFFTRadixStageKernel>()); + _fft_kernels.back()->configure(compile_context, &_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info); Nx *= radix_for_stage; } @@ -88,7 +102,7 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor FFTScaleKernelInfo scale_config; scale_config.scale = static_cast<float>(N); scale_config.conjugate = config.direction == FFTDirection::Inverse; - is_c2r ? _scale_kernel.configure(compile_context, &_digit_reversed_input, output, scale_config) : _scale_kernel.configure(output, nullptr, scale_config); + is_c2r ? _scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config) : _scale_kernel->configure(output, nullptr, scale_config); } // Allocate tensors @@ -132,18 +146,18 @@ void CLFFT1D::run() MemoryGroupResourceScope scope_mg(_memory_group); // Run digit reverse - CLScheduler::get().enqueue(_digit_reverse_kernel, false); + CLScheduler::get().enqueue(*_digit_reverse_kernel, false); // Run radix kernels for(unsigned int i = 0; i < _num_ffts; ++i) { - CLScheduler::get().enqueue(_fft_kernels[i], i == (_num_ffts - 1) && !_run_scale); + CLScheduler::get().enqueue(*_fft_kernels[i], i == (_num_ffts - 1) && !_run_scale); } // Run output scaling if(_run_scale) { - CLScheduler::get().enqueue(_scale_kernel, true); + CLScheduler::get().enqueue(*_scale_kernel, true); } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLFFT2D.cpp b/src/runtime/CL/functions/CLFFT2D.cpp index 7ab852fa98..1d444bb15d 100644 --- a/src/runtime/CL/functions/CLFFT2D.cpp +++ b/src/runtime/CL/functions/CLFFT2D.cpp @@ -26,6 +26,9 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" +#include "src/core/CL/kernels/CLFFTRadixStageKernel.h" +#include "src/core/CL/kernels/CLFFTScaleKernel.h" namespace arm_compute { @@ -34,6 +37,8 @@ CLFFT2D::CLFFT2D(std::shared_ptr<IMemoryManager> memory_manager) { } +CLFFT2D::~CLFFT2D() = default; + void CLFFT2D::configure(const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config) { configure(CLKernelLibrary::get().get_compile_context(), input, output, config); diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp index 1def674bb6..5472e8469f 100644 --- a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp @@ -26,10 +26,20 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/helpers/fft.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CPP/CPPScheduler.h" +#include "src/core/CL/kernels/CLCopyKernel.h" +#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h" +#include "src/core/CL/kernels/CLFFTRadixStageKernel.h" +#include "src/core/CL/kernels/CLFFTScaleKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLPadLayerKernel.h" +#include "src/core/CL/kernels/CLReductionOperationKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/utils/helpers/fft.h" + +#include 
"support/MemorySupport.h" namespace arm_compute { diff --git a/src/runtime/CL/functions/CLFastCorners.cpp b/src/runtime/CL/functions/CLFastCorners.cpp index 97f853fdea..110d2c3639 100644 --- a/src/runtime/CL/functions/CLFastCorners.cpp +++ b/src/runtime/CL/functions/CLFastCorners.cpp @@ -24,12 +24,14 @@ #include "arm_compute/runtime/CL/functions/CLFastCorners.h" #include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/ITensorAllocator.h" +#include "src/core/CL/kernels/CLFastCornersKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "support/MemorySupport.h" #include #include @@ -38,9 +40,9 @@ using namespace arm_compute; CLFastCorners::CLFastCorners(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), - _fast_corners_kernel(), + _fast_corners_kernel(support::cpp14::make_unique()), _suppr_func(), - _copy_array_kernel(), + _copy_array_kernel(support::cpp14::make_unique()), _output(), _suppr(), _win(), @@ -52,6 +54,8 @@ CLFastCorners::CLFastCorners(std::shared_ptr memory_manager) { } +CLFastCorners::~CLFastCorners() = default; + void CLFastCorners::configure(const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners, unsigned int *num_corners, BorderMode border_mode, uint8_t constant_border_value) { @@ -78,11 +82,11 @@ void CLFastCorners::configure(const CLCompileContext &compile_context, const ICL const bool update_number = (nullptr != _num_corners); _memory_group.manage(&_output); - _fast_corners_kernel.configure(compile_context, input, &_output, threshold, nonmax_suppression, border_mode); + _fast_corners_kernel->configure(compile_context, input, &_output, threshold, nonmax_suppression, border_mode); if(!_non_max) { - _copy_array_kernel.configure(compile_context, &_output, update_number, _corners, &_num_buffer); + _copy_array_kernel->configure(compile_context, &_output, update_number, _corners, &_num_buffer); } else { @@ -90,7 +94,7 @@ void CLFastCorners::configure(const CLCompileContext &compile_context, const ICL _memory_group.manage(&_suppr); _suppr_func.configure(compile_context, &_output, &_suppr, border_mode); - _copy_array_kernel.configure(compile_context, &_suppr, update_number, _corners, &_num_buffer); + _copy_array_kernel->configure(compile_context, &_suppr, update_number, _corners, &_num_buffer); _suppr.allocator()->allocate(); } @@ -113,14 +117,14 @@ void CLFastCorners::run() q.enqueueUnmapMemObject(_output.cl_buffer(), out_buffer); } - CLScheduler::get().enqueue(_fast_corners_kernel, false); + CLScheduler::get().enqueue(*_fast_corners_kernel, false); if(_non_max) { _suppr_func.run(); } - CLScheduler::get().enqueue(_copy_array_kernel, false); + CLScheduler::get().enqueue(*_copy_array_kernel, false); unsigned int get_num_corners = 0; q.enqueueReadBuffer(_num_buffer, CL_TRUE, 0, sizeof(unsigned int), &get_num_corners); diff --git a/src/runtime/CL/functions/CLFill.cpp b/src/runtime/CL/functions/CLFill.cpp index 6c0f1786f0..855ed8380a 100644 --- a/src/runtime/CL/functions/CLFill.cpp +++ b/src/runtime/CL/functions/CLFill.cpp @@ -23,8 +23,10 @@ */ #include "arm_compute/runtime/CL/functions/CLFill.h" -#include "arm_compute/core/CL/kernels/CLMemsetKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLMemsetKernel.h" + +#include 
"support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLFillBorder.cpp b/src/runtime/CL/functions/CLFillBorder.cpp index c647bb6a02..27d132b842 100644 --- a/src/runtime/CL/functions/CLFillBorder.cpp +++ b/src/runtime/CL/functions/CLFillBorder.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLFillBorder.h" -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp index a826541017..0646a0d3a0 100644 --- a/src/runtime/CL/functions/CLFlattenLayer.cpp +++ b/src/runtime/CL/functions/CLFlattenLayer.cpp @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLFlattenLayer.h" -#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFlattenLayerKernel.h" #include "support/MemorySupport.h" using namespace arm_compute; diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp index 7ed92ac3df..770e6a3781 100644 --- a/src/runtime/CL/functions/CLFloor.cpp +++ b/src/runtime/CL/functions/CLFloor.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLFloor.h" -#include "arm_compute/core/CL/kernels/CLFloorKernel.h" +#include "src/core/CL/kernels/CLFloorKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp index 4f365b6a61..1acf3c7a8b 100644 --- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp @@ -25,10 +25,23 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/Cast.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "src/core/CL/kernels/CLTransposeKernel.h" +#include "support/Cast.h" #include "support/MemorySupport.h" #include @@ -71,23 +84,7 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorIn if(activation_info.enabled()) { - switch(activation_info.activation()) - { - case ActivationLayerInfo::ActivationFunction::RELU: - type_min = PixelValue(oq_unif.offset); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - type_min = PixelValue(oq_unif.offset); - type_max = PixelValue(activation_info.a(), data_type, oq_info); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - type_min = 
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 4f365b6a61..1acf3c7a8b 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -25,10 +25,23 @@
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
+#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
+#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
+#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
+#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
+#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
+#include "src/core/CL/kernels/CLTransposeKernel.h"
+#include "support/Cast.h"
 #include "support/MemorySupport.h"
 
 #include
@@ -71,23 +84,7 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorIn
 
     if(activation_info.enabled())
     {
-        switch(activation_info.activation())
-        {
-            case ActivationLayerInfo::ActivationFunction::RELU:
-                type_min = PixelValue(oq_unif.offset);
-                break;
-            case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
-                type_min = PixelValue(oq_unif.offset);
-                type_max = PixelValue(activation_info.a(), data_type, oq_info);
-                break;
-            case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
-                type_min = PixelValue(activation_info.b(), data_type, oq_info);
-                type_max = PixelValue(activation_info.a(), data_type, oq_info);
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Activation function not supported.");
-                break;
-        }
+        std::tie(type_min, type_max) = get_quantized_activation_min_max(activation_info, data_type, output_quant_info);
     }
 
     // Set the GEMMLowp output stage info
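The removed switch in construct_gemmlowp_output_stage now lives behind get_quantized_activation_min_max(), which centralises how a fused activation becomes clamp bounds in the quantized domain. A toy version of the computation for a QASYMM8 output, where quantize() is a stand-in for the library's PixelValue machinery:

    #include <cstdint>
    #include <utility>

    enum class Act { RELU, BOUNDED_RELU, LU_BOUNDED_RELU };

    // Stand-in for quantizing a real-valued activation limit into uint8 space.
    int32_t quantize(float v, float scale, int32_t offset)
    {
        return offset + static_cast<int32_t>(v / scale + 0.5f);
    }

    std::pair<int32_t, int32_t> activation_bounds(Act act, float a, float b, float scale, int32_t offset)
    {
        switch(act)
        {
            case Act::RELU:            // max(x, 0): lower bound is the quantized zero point
                return { offset, 255 };
            case Act::BOUNDED_RELU:    // min(max(x, 0), a)
                return { offset, quantize(a, scale, offset) };
            case Act::LU_BOUNDED_RELU: // clamp(x, b, a)
                return { quantize(b, scale, offset), quantize(a, scale, offset) };
        }
        return { 0, 255 };
    }

This is a sketch of the idea only; the real helper also covers the signed and symmetric data types that the removed switch rejected.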
+#include "src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h" +#include "src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/utils/helpers/float_ops.h" +#include "src/runtime/CL/gemm/CLGEMMKernelSelection.h" +#include "support/Cast.h" + +#include "support/MemorySupport.h" namespace arm_compute { @@ -48,25 +57,72 @@ using namespace arm_compute::misc::shape_calculator; using namespace arm_compute::cl_gemm; using namespace arm_compute::utils::cast; +namespace weights_transformations +{ +CLGEMMReshapeRHSMatrixKernelManaged::CLGEMMReshapeRHSMatrixKernelManaged() + : _kernel(support::cpp14::make_unique()) +{ +} + +CLGEMMReshapeRHSMatrixKernelManaged::~CLGEMMReshapeRHSMatrixKernelManaged() = default; + +void CLGEMMReshapeRHSMatrixKernelManaged::run() +{ + _output.allocator()->allocate(); + CLScheduler::get().enqueue(*_kernel, false); + _reshape_run = true; +} + +void CLGEMMReshapeRHSMatrixKernelManaged::release() +{ + _output.allocator()->free(); +} + +ICLTensor *CLGEMMReshapeRHSMatrixKernelManaged::get_weights() +{ + return &_output; +} + +uint32_t CLGEMMReshapeRHSMatrixKernelManaged::uid() +{ + return _uid; +} + +void CLGEMMReshapeRHSMatrixKernelManaged::configure(const ICLTensor *input, GEMMRHSMatrixInfo info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, info); +} + +void CLGEMMReshapeRHSMatrixKernelManaged::configure(const CLCompileContext &compile_context, const ICLTensor *input, GEMMRHSMatrixInfo info) +{ + _kernel->configure(compile_context, input, &_output, info); +} +} // namespace weights_transformations + CLGEMM::CLGEMM(std::shared_ptr memory_manager, IWeightsManager *weights_manager) : _memory_group(std::move(memory_manager)), _weights_manager(weights_manager), - _mm_kernel(), - _reshape_lhs_kernel(), - _reshape_rhs_kernel(), - _reshape_rhs_kernel_managed(), - _mm_reshaped_kernel(), - _mm_reshaped_only_rhs_kernel(), + _mm_kernel(support::cpp14::make_unique()), + _reshape_lhs_kernel(support::cpp14::make_unique()), + _reshape_rhs_kernel(support::cpp14::make_unique()), + _reshape_rhs_kernel_managed(support::cpp14::make_unique()), + _mm_reshaped_kernel(support::cpp14::make_unique()), + _mm_reshaped_only_rhs_kernel(support::cpp14::make_unique()), + _mm_reshaped_only_rhs_fallback_kernel(support::cpp14::make_unique()), _tmp_a(), _tmp_b(), _original_b(nullptr), + _lhs(nullptr), + _dst(nullptr), _reshape_b_only_on_first_run(false), _is_prepared(false), _gemm_kernel_type(CLGEMMKernelType::NATIVE_V1) { } -CLGEMMKernelType CLGEMM::select_gemm_kernel(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run) +CLGEMM::~CLGEMM() = default; + +CLGEMMKernelType CLGEMM::select_gemm_kernel(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type, bool reshape_b_only_on_first_run) { std::unique_ptr gemm_kernel = CLGEMMKernelSelectionFactory::create(CLScheduler::get().target()); ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_kernel.get()); @@ -75,6 +131,7 @@ CLGEMMKernelType CLGEMM::select_gemm_kernel(unsigned int m, unsigned int n, unsi params.m = m; params.n = n; params.k = k; + params.b = b; 
     params.is_rhs_constant = reshape_b_only_on_first_run;
     params.data_type       = data_type;
@@ -90,15 +147,15 @@ void CLGEMM::configure_native_v1(const CLCompileContext &compile_context, const
     const GPUTarget gpu_target = CLScheduler::get().target();
 
     // Set the target for the kernels
-    _mm_kernel.set_target(gpu_target);
+    _mm_kernel->set_target(gpu_target);
 
     GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d(), gemm_info.broadcast_bias());
 
     // Configure and tune matrix multiply kernel
-    _mm_kernel.configure(compile_context, a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
+    _mm_kernel->configure(compile_context, a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
 
     // Tune kernel statically
-    CLScheduler::get().tune_kernel_static(_mm_kernel);
+    CLScheduler::get().tune_kernel_static(*_mm_kernel);
 }
 
 void CLGEMM::configure_reshaped_v1(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta,
@@ -114,8 +171,8 @@ void CLGEMM::configure_reshaped_v1(const CLCompileContext &compile_context, cons
     int mult_interleave4x4_height = 1;
 
     // Set the target for the kernels
-    _reshape_lhs_kernel.set_target(gpu_target);
-    _mm_kernel.set_target(gpu_target);
+    _reshape_lhs_kernel->set_target(gpu_target);
+    _mm_kernel->set_target(gpu_target);
 
     if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
     {
@@ -150,24 +207,24 @@ void CLGEMM::configure_reshaped_v1(const CLCompileContext &compile_context, cons
     }
 
     // Configure interleave kernel
-    _reshape_lhs_kernel.configure(compile_context, a, &_tmp_a, lhs_info, reinterpret_input_as_3d);
+    _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, reinterpret_input_as_3d);
 
     // Configure transpose kernel
     ICLTensor *reshaped_rhs = &_tmp_b;
     if(_weights_manager && _weights_manager->are_weights_managed(b))
     {
-        _reshape_rhs_kernel_managed.configure(compile_context, b, rhs_info);
-        reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed));
+        _reshape_rhs_kernel_managed->configure(compile_context, b, rhs_info);
+        reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, _reshape_rhs_kernel_managed.get()));
     }
     else
     {
-        _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info);
+        _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
     }
 
     // Configure and tune matrix multiply kernel
-    _mm_kernel.configure(compile_context, &_tmp_a, reshaped_rhs, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
+    _mm_kernel->configure(compile_context, &_tmp_a, reshaped_rhs, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
 
-    CLScheduler::get().tune_kernel_static(_mm_kernel);
+    CLScheduler::get().tune_kernel_static(*_mm_kernel);
 
     // Allocate intermediate tensors
     _tmp_a.allocator()->allocate();
@@ -201,8 +258,8 @@ void CLGEMM::configure_reshaped_v2(const CLCompileContext &compile_context, cons
     kernel_info.activation_info = gemm_info.activation_info();
 
     // Set the target for the kernels
-    _reshape_lhs_kernel.set_target(gpu_target);
-    _mm_kernel.set_target(gpu_target);
+    _reshape_lhs_kernel->set_target(gpu_target);
+    _mm_kernel->set_target(gpu_target);
 
     const bool use_mm_b = (!_weights_manager || !_weights_manager->are_weights_managed(b));
 
@@ -226,21 +283,21 @@ void CLGEMM::configure_reshaped_v2(const CLCompileContext &compile_context, cons
 
     // Configure lhs_info and rhs_info
     std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
 
-    _reshape_lhs_kernel.configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
+    _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
 
     ICLTensor *reshaped_rhs = &_tmp_b;
     if(_weights_manager && _weights_manager->are_weights_managed(b))
     {
-        _reshape_rhs_kernel_managed.configure(compile_context, b, rhs_info);
-        reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed));
+        _reshape_rhs_kernel_managed->configure(compile_context, b, rhs_info);
+        reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, _reshape_rhs_kernel_managed.get()));
     }
     else
     {
-        _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info);
+        _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
     }
 
     // Configure and tune matrix multiply kernel
-    _mm_reshaped_kernel.configure(compile_context, &_tmp_a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
+    _mm_reshaped_kernel->configure(compile_context, &_tmp_a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
 
     // Allocate intermediate tensors
     _tmp_a.allocator()->allocate();
@@ -274,7 +331,7 @@ void CLGEMM::configure_reshaped_only_rhs(const CLCompileContext &compile_context
     kernel_info.activation_info = gemm_info.activation_info();
 
     // Set the target for the kernels
-    _mm_kernel.set_target(gpu_target);
+    _mm_kernel->set_target(gpu_target);
 
     const bool use_mm_b = (!_weights_manager || !_weights_manager->are_weights_managed(b));
 
@@ -291,30 +348,31 @@ void CLGEMM::configure_reshaped_only_rhs(const CLCompileContext &compile_context
 
     std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target);
     ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
 
-    unsigned int m_internal = m;
-    unsigned int b_internal = batch_size;
-    if(reinterpret_input_as_3d)
-    {
-        m_internal = a->info()->dimension(1);
-        b_internal = a->info()->dimension(2);
-    }
-
     // Configure lhs_info and rhs_info
-    std::tie(lhs_info, rhs_info) = gemm_config->configure(m_internal, n, k, b_internal, data_type);
+    std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
 
     ICLTensor *reshaped_rhs = &_tmp_b;
     if(_weights_manager && _weights_manager->are_weights_managed(b))
     {
-        _reshape_rhs_kernel_managed.configure(compile_context, b, rhs_info);
-        reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, &_reshape_rhs_kernel_managed));
+        _reshape_rhs_kernel_managed->configure(compile_context, b, rhs_info);
+        reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, _reshape_rhs_kernel_managed.get()));
     }
     else
     {
-        _reshape_rhs_kernel.configure(compile_context, b, &_tmp_b, rhs_info);
+        _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
     }
 
-    // Configure and tune matrix multiply kernel
-    _mm_reshaped_only_rhs_kernel.configure(compile_context, a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
+    // Configure two variants of CLGEMMMatrixMultiplyReshapedOnlyRHSKernel (has_pad_y = false/true)
+    // During the prepare stage we check the padding requirement for the lhs and dst tensors. If they do not have
+    // pad y, we dispatch CLGEMMMatrixMultiplyReshapedOnlyRHSKernel with has_pad_y = false
+
+    // Configure matrix multiply kernel with no y padding support
+    kernel_info.has_pad_y = false;
+    _mm_reshaped_only_rhs_kernel->configure(compile_context, a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
+
+    // Configure matrix multiply kernel with y padding support
+    kernel_info.has_pad_y = true;
+    _mm_reshaped_only_rhs_fallback_kernel->configure(compile_context, a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
 
     if(!_reshape_b_only_on_first_run && use_mm_b)
     {
@@ -489,6 +547,10 @@ Status CLGEMM::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInf
     ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
 
     // Validate matrix multiply
+    kernel_info.has_pad_y = false;
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
+
+    kernel_info.has_pad_y = true;
     ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
 
     return Status{};
@@ -510,15 +572,18 @@ void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor
     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
     _is_prepared                 = gemm_info.retain_internal_weights();
     _original_b                  = b;
+    _lhs                         = a;
+    _dst                         = output;
 
     // Get the GPU target
     bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
     const unsigned int m          = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
     const unsigned int n          = b->info()->dimension(0);
     const unsigned int k          = a->info()->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
 
     // Select GEMMType
-    _gemm_kernel_type = select_gemm_kernel(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run);
+    _gemm_kernel_type = select_gemm_kernel(m, n, k, batch_size, a->info()->data_type(), _reshape_b_only_on_first_run);
 
     const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
 
@@ -560,9 +625,10 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
     const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
     const unsigned int n          = b->dimension(0);
     const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
 
     // Select GEMMType
-    CLGEMMKernelType gemm_kernel_type = select_gemm_kernel(m, n, k, a->data_type(), gemm_info.reshape_b_only_on_first_run());
+    CLGEMMKernelType gemm_kernel_type = select_gemm_kernel(m, n, k, batch_size, a->data_type(), gemm_info.reshape_b_only_on_first_run());
 
     const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
 
@@ -602,7 +668,6 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
 void CLGEMM::run()
 {
     prepare();
-
     MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run matrix multiply kernel
@@ -610,49 +675,49 @@ void CLGEMM::run()
     {
         case CLGEMMKernelType::NATIVE_V1:
         {
-            CLScheduler::get().enqueue(_mm_kernel, true);
+            CLScheduler::get().enqueue(*_mm_kernel, true);
             break;
         }
         case CLGEMMKernelType::RESHAPED_V1:
         {
             // Run interleave kernel
-            CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
+            CLScheduler::get().enqueue(*_reshape_lhs_kernel, false);
 
             if(!_reshape_b_only_on_first_run)
             {
                 // Run transpose kernel
                 if(_weights_manager && _weights_manager->are_weights_managed(_original_b))
                 {
-                    _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed);
+                    _weights_manager->run(_original_b, _reshape_rhs_kernel_managed.get());
                 }
                 else
                 {
-                    CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+                    CLScheduler::get().enqueue(*_reshape_rhs_kernel, false);
                 }
             }
 
-            CLScheduler::get().enqueue(_mm_kernel, true);
+            CLScheduler::get().enqueue(*_mm_kernel, true);
             break;
         }
         case CLGEMMKernelType::RESHAPED:
        {
             // Run interleave kernel
-            CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
+            CLScheduler::get().enqueue(*_reshape_lhs_kernel, false);
 
             if(!_reshape_b_only_on_first_run)
             {
                 // Run transpose kernel
                 if(_weights_manager && _weights_manager->are_weights_managed(_original_b))
                 {
-                    _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed);
+                    _weights_manager->run(_original_b, _reshape_rhs_kernel_managed.get());
                 }
                 else
                 {
-                    CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+                    CLScheduler::get().enqueue(*_reshape_rhs_kernel, false);
                 }
             }
 
-            CLScheduler::get().enqueue(_mm_reshaped_kernel, true);
+            CLScheduler::get().enqueue(*_mm_reshaped_kernel, true);
             break;
         }
         case CLGEMMKernelType::RESHAPED_ONLY_RHS:
@@ -662,15 +727,27 @@ void CLGEMM::run()
             // Run transpose kernel
             if(_weights_manager && _weights_manager->are_weights_managed(_original_b))
             {
-                _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed);
+                _weights_manager->run(_original_b, _reshape_rhs_kernel_managed.get());
            }
             else
             {
-                CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+                CLScheduler::get().enqueue(*_reshape_rhs_kernel, false);
             }
         }
+            // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement
+            // Check if the lhs or dst tensors have padding
+            const unsigned int cross_plane_pad_lhs = _lhs->info()->padding().top + _lhs->info()->padding().bottom;
+            const unsigned int cross_plane_pad_dst = _dst->info()->padding().top + _dst->info()->padding().bottom;
 
-            CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, true);
+            bool has_pad_y = (cross_plane_pad_lhs != 0) || (cross_plane_pad_dst != 0);
+            if(has_pad_y)
+            {
+                CLScheduler::get().enqueue(*_mm_reshaped_only_rhs_fallback_kernel, true);
+            }
+            else
+            {
+                CLScheduler::get().enqueue(*_mm_reshaped_only_rhs_kernel, true);
+            }
             break;
         }
         default:
@@ -688,13 +765,13 @@ void CLGEMM::prepare()
     {
         if(_weights_manager && _weights_manager->are_weights_managed(_original_b))
         {
-            _weights_manager->run(_original_b, &_reshape_rhs_kernel_managed);
+            _weights_manager->run(_original_b, _reshape_rhs_kernel_managed.get());
         }
         else
         {
             // Run transpose kernel and mark original weights tensor as unused
             _tmp_b.allocator()->allocate();
-            CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+            CLScheduler::get().enqueue(*_reshape_rhs_kernel, false);
             _original_b->mark_as_unused();
         }
     }
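Note on the RESHAPED_ONLY_RHS machinery introduced above: configure() now builds both has_pad_y variants of the same kernel because tensor padding can still change between configuration and execution (another function configured later may extend it), so run() re-reads the actual top/bottom padding of the lhs and dst tensors and dispatches the matching variant. A minimal sketch of the dispatch, with illustrative names:

    struct KernelVariant { bool has_pad_y; };

    struct GemmSketch
    {
        KernelVariant normal{ false };   // _mm_reshaped_only_rhs_kernel
        KernelVariant fallback{ true };  // _mm_reshaped_only_rhs_fallback_kernel

        const KernelVariant &pick(unsigned int lhs_pad_top, unsigned int lhs_pad_bottom,
                                  unsigned int dst_pad_top, unsigned int dst_pad_bottom) const
        {
            const bool has_pad_y = (lhs_pad_top + lhs_pad_bottom != 0) || (dst_pad_top + dst_pad_bottom != 0);
            return has_pad_y ? fallback : normal;
        }
    };

The cost is compiling one extra kernel variant up front; the gain is that the padded fallback only runs when the tensors actually carry cross-plane padding.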
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index ee90b39c2b..4d26df5e43 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -27,10 +27,26 @@
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLCol2ImKernel.h"
+#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
+#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
+#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
+#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
+#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
+#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
+#include "src/core/CL/kernels/CLIm2ColKernel.h"
+#include "src/core/CL/kernels/CLWeightsReshapeKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "support/Cast.h"
+#include "support/MemorySupport.h"
 
 #include
 #include
@@ -42,10 +58,12 @@ using namespace arm_compute::misc::shape_calculator;
 using namespace arm_compute::utils::cast;
 
 CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights()
-    : _weights_reshape_kernel()
+    : _weights_reshape_kernel(support::cpp14::make_unique<CLWeightsReshapeKernel>())
 {
 }
 
+CLConvolutionLayerReshapeWeights::~CLConvolutionLayerReshapeWeights() = default;
+
 void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups)
 {
     configure(CLKernelLibrary::get().get_compile_context(), weights, biases, output, num_groups);
@@ -63,7 +81,7 @@ void CLConvolutionLayerReshapeWeights::configure(const CLCompileContext &compile
     const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
     const ICLTensor *biases_to_use = (append_biases) ? biases : nullptr;
 
-    _weights_reshape_kernel.configure(compile_context, weights, biases_to_use, output, num_groups);
+    _weights_reshape_kernel->configure(compile_context, weights, biases_to_use, output, num_groups);
 
     output->info()->set_quantization_info(weights->info()->quantization_info());
 }
@@ -95,16 +113,18 @@ void CLConvolutionLayerReshapeWeights::run()
 {
-    CLScheduler::get().enqueue(_weights_reshape_kernel);
+    CLScheduler::get().enqueue(*_weights_reshape_kernel);
 }
 
 CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
-    : _memory_group(memory_manager), _weights_manager(weights_manager), _reshape_weights(), _reshape_weights_managed(), _im2col_kernel(), _mm_gemm(memory_manager, weights_manager),
-      _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _skip_im2col(false),
-      _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _is_prepared(false)
+    : _memory_group(memory_manager), _weights_manager(weights_manager), _reshape_weights(), _reshape_weights_managed(), _im2col_kernel(support::cpp14::make_unique<CLIm2ColKernel>()),
+      _mm_gemm(memory_manager, weights_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(support::cpp14::make_unique<CLCol2ImKernel>()), _activationlayer_function(), _original_weights(nullptr),
+      _im2col_output(), _weights_reshaped(), _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _is_prepared(false)
 {
 }
 
+CLGEMMConvolutionLayer::~CLGEMMConvolutionLayer() = default;
+
 void CLGEMMConvolutionLayer::configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
                                           const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
                                           int gemm_3d_depth, const ActivationLayerInfo &act_info)
@@ -229,8 +249,8 @@ void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context,
     _fuse_activation = true;
 
     // Set the GPU target for im2col and col2im
-    _im2col_kernel.set_target(CLScheduler::get().target());
-    _col2im_kernel.set_target(CLScheduler::get().target());
+    _im2col_kernel->set_target(CLScheduler::get().target());
+    _col2im_kernel->set_target(CLScheduler::get().target());
 
     const ICLTensor *gemm_input_to_use  = input;
     ICLTensor       *gemm_output_to_use = output;
@@ -292,11 +312,11 @@ void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context,
         _memory_group.manage(&_im2col_output);
 
         // Configure and tune im2col. im2col output shape is auto-initialized
-        _im2col_kernel.configure(compile_context, input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups);
+        _im2col_kernel->configure(compile_context, input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups);
 
         // Set quantization info
         _im2col_output.info()->set_quantization_info(input->info()->quantization_info());
-        CLScheduler::get().tune_kernel_static(_im2col_kernel);
+        CLScheduler::get().tune_kernel_static(*_im2col_kernel);
 
         // Update GEMM input
         gemm_input_to_use = &_im2col_output;
@@ -389,8 +409,8 @@
     if(!_skip_col2im)
     {
         // Configure and tune Col2Im
-        _col2im_kernel.configure(compile_context, gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups);
-        CLScheduler::get().tune_kernel_static(_col2im_kernel);
+        _col2im_kernel->configure(compile_context, gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups);
+        CLScheduler::get().tune_kernel_static(*_col2im_kernel.get());
     }
 
     if(!_skip_col2im)
@@ -610,7 +630,7 @@ void CLGEMMConvolutionLayer::run()
     // Run im2col
     if(!_skip_im2col)
     {
-        CLScheduler::get().enqueue(_im2col_kernel);
+        CLScheduler::get().enqueue(*_im2col_kernel);
     }
 
     // Runs CLGEMM or CLGEMMLowpMatrixMultiplyCore functions
@@ -628,7 +648,7 @@
     // Reshape output matrix
     if(!_skip_col2im)
     {
-        CLScheduler::get().enqueue(_col2im_kernel, false);
+        CLScheduler::get().enqueue(*_col2im_kernel.get(), false);
     }
 
     //Run Activation Layer if we cannot fuse in GEMM
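For orientation, CLGEMMConvolutionLayer lowers convolution to a GEMM: im2col unrolls each receptive field into a column of a matrix, the GEMM multiplies by the reshaped weights, and col2im folds the result back into the output plane. A rough shape check under standard assumptions (single group, unit dilation); the numbers are illustrative only:

    #include <cstdio>

    // Standard convolution output-size formula.
    unsigned int conv_out(unsigned int in, unsigned int k, unsigned int pad, unsigned int stride)
    {
        return (in + 2 * pad - k) / stride + 1;
    }

    int main()
    {
        const unsigned int w = 224, h = 224, c = 3, kw = 3, kh = 3, kernels = 64;
        const unsigned int conv_w = conv_out(w, kw, 1, 1), conv_h = conv_out(h, kh, 1, 1);
        // im2col output: [kw*kh*c x conv_w*conv_h]; reshaped weights: [kernels x kw*kh*c]
        std::printf("GEMM: [%u x %u] * [%u x %u] -> [%u x %u]\n",
                    kernels, kw * kh * c, kw * kh * c, conv_w * conv_h, kernels, conv_w * conv_h);
        return 0;
    }

The _skip_im2col/_skip_col2im flags in the function elide the two reshapes when the layout already allows feeding the GEMM directly (e.g. 1x1 kernels in NHWC).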
diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
index 5fc9c17bef..4d277f0982 100644
--- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
@@ -28,8 +28,23 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
+#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
+#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
+#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
+#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
+#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
+#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
+#include "src/core/CL/kernels/CLIm2ColKernel.h"
+#include "src/core/CL/kernels/CLWeightsReshapeKernel.h"
+#include "support/MemorySupport.h"
 
-#include
 #include
 
 namespace arm_compute
@@ -99,7 +114,7 @@ CLGEMMDeconvolutionLayer::CLGEMMDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-      _deconv_reshape(),
+      _deconv_reshape(support::cpp14::make_unique<CLDeconvolutionReshapeOutputKernel>()),
       _slice_gemm(),
       _gemmlowp_final(),
       _reshaped_weights(),
@@ -116,6 +131,8 @@ CLGEMMDeconvolutionLayer::CLGEMMDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
 {
 }
 
+CLGEMMDeconvolutionLayer::~CLGEMMDeconvolutionLayer() = default;
+
-    _deconv_reshape.configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), deconv_info);
+    _deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), deconv_info);
     _gemm_output.allocator()->allocate();
 
     if(_is_quantized)
@@ -357,7 +374,7 @@ void CLGEMMDeconvolutionLayer::run()
         _mm_gemm.run();
     }
 
-    CLScheduler::get().enqueue(_deconv_reshape, false);
+    CLScheduler::get().enqueue(*_deconv_reshape, false);
 
     if(_is_quantized)
     {
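A detail visible throughout these run() methods: the second argument to CLScheduler::get().enqueue(kernel, flush) batches command submission, so intermediate kernels pass false and only the last kernel of a function flushes the CL command queue (the parameter defaults to true). A sketch of the idea with a stand-in scheduler, not the library class:

    struct SchedulerSketch
    {
        void enqueue(int kernel_id, bool flush)
        {
            record(kernel_id);  // append the kernel to the command queue
            if(flush)
            {
                submit();       // push the whole batch to the device
            }
        }
        void record(int) {}
        void submit() {}
    };

    // Mirrors CLGaussian5x5::run() below: enqueue(hor, false); enqueue(vert /* flush defaults to true */);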
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 30dce5b8fe..d3d80a39e3 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -24,8 +24,6 @@
 #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h"
-#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/KernelDescriptors.h"
@@ -35,7 +33,18 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h"
+#include "src/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h"
+#include "src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
+#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
+#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/runtime/CL/gemm/CLGEMMKernelSelection.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
@@ -70,14 +79,14 @@ inline bool is_gemm_reshaped(unsigned int m, unsigned int n, unsigned int k, Dat
 
 CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
-      _weights_to_qasymm8(),
-      _mm_native_kernel(),
-      _mm_reshaped_only_rhs_kernel(),
-      _mtx_b_reshape_kernel(),
-      _mtx_a_reduction_kernel(),
-      _mtx_b_reduction_kernel(),
-      _offset_contribution_kernel(),
-      _offset_contribution_output_stage_kernel(),
+      _weights_to_qasymm8(support::cpp14::make_unique<CLDepthConvertLayerKernel>()),
+      _mm_native_kernel(support::cpp14::make_unique<CLGEMMLowpMatrixMultiplyNativeKernel>()),
+      _mm_reshaped_only_rhs_kernel(support::cpp14::make_unique<CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel>()),
+      _mtx_b_reshape_kernel(support::cpp14::make_unique<CLGEMMReshapeRHSMatrixKernel>()),
+      _mtx_a_reduction_kernel(support::cpp14::make_unique<CLGEMMLowpMatrixAReductionKernel>()),
+      _mtx_b_reduction_kernel(support::cpp14::make_unique<CLGEMMLowpMatrixBReductionKernel>()),
+      _offset_contribution_kernel(support::cpp14::make_unique<CLGEMMLowpOffsetContributionKernel>()),
+      _offset_contribution_output_stage_kernel(support::cpp14::make_unique<CLGEMMLowpOffsetContributionOutputStageKernel>()),
       _qasymm8_weights(),
       _vector_sum_col(),
       _vector_sum_row(),
@@ -99,6 +108,8 @@ CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
 {
 }
 
+CLGEMMLowpMatrixMultiplyCore::~CLGEMMLowpMatrixMultiplyCore() = default;
+
-    _mm_reshaped_only_rhs_kernel.set_target(gpu_target);
+    _mm_reshaped_only_rhs_kernel->set_target(gpu_target);
 
     GEMMRHSMatrixInfo rhs_info;
     GEMMLHSMatrixInfo lhs_info;
@@ -149,7 +160,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
         TensorInfo weights_info(*b->info());
         weights_info.set_data_type(DataType::QASYMM8);
         _qasymm8_weights.allocator()->init(weights_info);
-        _weights_to_qasymm8.configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP, 0);
+        _weights_to_qasymm8->configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP, 0);
     }
 
     const ICLTensor *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
@@ -167,7 +178,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
         std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
 
         // Configure reshape RHS kernel
-        _mtx_b_reshape_kernel.configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
+        _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
     }
 
     // Using default reduction info
@@ -184,7 +195,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
     }
 
     // Configure Matrix B reduction kernel
-    _mtx_b_reduction_kernel.configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info);
+    _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info);
     }
 
     // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
@@ -195,7 +206,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
         _memory_group.manage(&_vector_sum_row);
 
         // Configure matrix A reduction kernel
-        _mtx_a_reduction_kernel.configure(compile_context, a, &_vector_sum_row, reduction_info);
+        _mtx_a_reduction_kernel->configure(compile_context, a, &_vector_sum_row, reduction_info);
     }
 
     GEMMKernelInfo gemm_kernel_info;
@@ -225,8 +236,8 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
     if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
     {
         // Configure and tune matrix multiply kernel with fused output stage
-        _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
-                                               _b_offset == 0 ? nullptr : &_vector_sum_row, c, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+        _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
+                                                _b_offset == 0 ? nullptr : &_vector_sum_row, c, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
     }
     else
     {
@@ -236,7 +247,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
 
         if(_is_gemm_reshaped)
         {
-            _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, gemm_kernel_info);
+            _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, gemm_kernel_info);
         }
         else
         {
@@ -244,11 +255,11 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
             std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
 
             // Configure matrix multiply kernel
-            _mm_native_kernel.configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+            _mm_native_kernel->configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
 
-            _offset_contribution_output_stage_kernel.configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output,
-                                                               a->info()->dimension(0),
-                                                               _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+            _offset_contribution_output_stage_kernel->configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output,
+                                                                a->info()->dimension(0),
+                                                                _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
             _mm_result_s32.allocator()->allocate();
         }
     }
@@ -269,7 +280,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
         if(_is_gemm_reshaped)
         {
             // Configure and tune matrix multiply kernel
-            _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info);
+            _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info);
         }
         else
         {
@@ -277,12 +288,12 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
             std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
 
             // Configure matrix multiply kernel
-            _mm_native_kernel.configure(compile_context, _matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+            _mm_native_kernel->configure(compile_context, _matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
         }
 
         // Configure offset contribution kernel
-        _offset_contribution_kernel.configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset,
-                                              _b_offset);
+        _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset,
+                                               _b_offset);
     }
 
     // Allocate tensors
@@ -488,40 +499,40 @@ void CLGEMMLowpMatrixMultiplyCore::run()
         if(!_reshape_b_only_on_first_run)
         {
             // Run reshape matrix B
-            CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
+            CLScheduler::get().enqueue(*_mtx_b_reshape_kernel, false);
         }
     }
 
     // Run matrix B reduction kernel only if _a_offset is not equal to 0
     if(_a_offset != 0 && !_reshape_b_only_on_first_run)
     {
-        CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
+        CLScheduler::get().enqueue(*_mtx_b_reduction_kernel, false);
     }
 
     // Run matrix A reduction kernel only if _b_offset is not equal to 0
     if(_b_offset != 0)
     {
-        CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false);
+        CLScheduler::get().enqueue(*_mtx_a_reduction_kernel, false);
     }
 
     // Run matrix multiply
     if(_is_gemm_reshaped)
     {
-        CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, false);
+        CLScheduler::get().enqueue(*_mm_reshaped_only_rhs_kernel, false);
     }
     else
     {
-        CLScheduler::get().enqueue(_mm_native_kernel, false);
+        CLScheduler::get().enqueue(*_mm_native_kernel, false);
     }
 
     if(_run_output_stage)
     {
         // Run offset contribution/output stage kernel
-        CLScheduler::get().enqueue(_offset_contribution_output_stage_kernel, true);
+        CLScheduler::get().enqueue(*_offset_contribution_output_stage_kernel, true);
     }
 
     if(_run_offset_contribution)
     {
         // Run offset contribution kernel
-        CLScheduler::get().enqueue(_offset_contribution_kernel, true);
+        CLScheduler::get().enqueue(*_offset_contribution_kernel, true);
     }
 }
 
@@ -532,7 +543,7 @@ void CLGEMMLowpMatrixMultiplyCore::prepare()
     if(_convert_to_qasymm8)
     {
         _qasymm8_weights.allocator()->allocate();
-        CLScheduler::get().enqueue(_weights_to_qasymm8, false);
+        CLScheduler::get().enqueue(*_weights_to_qasymm8, false);
     }
 
     if(_is_gemm_reshaped && _reshape_b_only_on_first_run)
@@ -541,7 +552,7 @@ void CLGEMMLowpMatrixMultiplyCore::prepare()
 
         // Run reshape kernel and mark original weights tensor as unused
         _tmp_b.allocator()->allocate();
-        CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
+        CLScheduler::get().enqueue(*_mtx_b_reshape_kernel, false);
         _original_b->mark_as_unused();
     }
 
@@ -549,7 +560,7 @@ void CLGEMMLowpMatrixMultiplyCore::prepare()
     if(_a_offset != 0 && _reshape_b_only_on_first_run)
     {
         _vector_sum_col.allocator()->allocate();
-        CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
+        CLScheduler::get().enqueue(*_mtx_b_reduction_kernel, false);
     }
 
     CLScheduler::get().queue().finish();
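Why the reduction kernels above are gated on the offsets: for quantized inputs the core computes sum_k (A[i][k] - a_off) * (B[k][j] - b_off), which expands so that row sums of A are only needed when b_off != 0 and column sums of B only when a_off != 0, exactly the conditions used in configure() and run(). A toy check of the identity (self-contained, not library code):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    using Mat = std::vector<std::vector<int32_t>>;

    int32_t direct(const Mat &A, const Mat &B, size_t i, size_t j, int32_t ao, int32_t bo)
    {
        int32_t acc = 0;
        for(size_t k = 0; k < B.size(); ++k)
        {
            acc += (A[i][k] - ao) * (B[k][j] - bo);
        }
        return acc;
    }

    int32_t with_contributions(const Mat &A, const Mat &B, size_t i, size_t j, int32_t ao, int32_t bo)
    {
        const int32_t K = static_cast<int32_t>(B.size());
        int32_t raw = 0, row_sum = 0, col_sum = 0;
        for(size_t k = 0; k < B.size(); ++k)
        {
            raw += A[i][k] * B[k][j];
            row_sum += A[i][k]; // vector_sum_row: needed only when bo != 0
            col_sum += B[k][j]; // vector_sum_col: needed only when ao != 0
        }
        return raw - bo * row_sum - ao * col_sum + K * ao * bo;
    }

    int main()
    {
        const Mat A = { { 3, 7 }, { 1, 5 } }, B = { { 2, 4 }, { 6, 8 } };
        assert(direct(A, B, 0, 1, 9, 4) == with_contributions(A, B, 0, 1, 9, 4));
        return 0;
    }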
"src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h" #include "support/MemorySupport.h" +#include + namespace arm_compute { void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, @@ -44,39 +45,59 @@ void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const CLComp int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min, int max) { - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(compile_context, input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); + GEMMLowpOutputStageInfo info{}; + info.gemmlowp_multiplier = result_fixedpoint_multiplier; + info.gemmlowp_shift = result_shift; + info.gemmlowp_offset = result_offset_after_shift; + info.gemmlowp_min_bound = min; + info.gemmlowp_max_bound = max; + info.output_data_type = DataType::QASYMM8; + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(compile_context, input, bias, output, &info); _kernel = std::move(k); } Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) { - return CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, min, max); + GEMMLowpOutputStageInfo info{}; + info.gemmlowp_min_bound = min; + info.gemmlowp_max_bound = max; + info.output_data_type = DataType::QASYMM8; + return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info); } void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min, int max) { - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); - _kernel = std::move(k); + configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); } void CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min, int max) { - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(compile_context, input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); + GEMMLowpOutputStageInfo info{}; + info.gemmlowp_multiplier = result_fixedpoint_multiplier; + info.gemmlowp_shift = result_shift; + info.gemmlowp_offset = result_offset_after_shift; + info.gemmlowp_min_bound = min; + info.gemmlowp_max_bound = max; + info.output_data_type = DataType::QASYMM8_SIGNED; + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(compile_context, input, bias, output, &info); _kernel = std::move(k); } Status CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) { - return CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(input, bias, output, min, max); + GEMMLowpOutputStageInfo 
+    info.gemmlowp_min_bound = min;
+    info.gemmlowp_max_bound = max;
+    info.output_data_type   = DataType::QASYMM8_SIGNED;
+    return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info);
 }
 
 void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
@@ -90,15 +111,25 @@ void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const CLComp
                                                                     int result_fixedpoint_multiplier, int result_shift,
                                                                     int min, int max)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>();
-    k->configure(compile_context, input, bias, output, result_fixedpoint_multiplier, result_shift, min, max);
+    GEMMLowpOutputStageInfo info{};
+    info.gemmlowp_multiplier = result_fixedpoint_multiplier;
+    info.gemmlowp_shift      = result_shift;
+    info.gemmlowp_min_bound  = min;
+    info.gemmlowp_max_bound  = max;
+    info.output_data_type    = DataType::QSYMM16;
+    auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel>();
+    k->configure(compile_context, input, bias, output, &info);
     _kernel = std::move(k);
 }
 
 Status CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
 {
-    return CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(input, bias, output, min, max);
+    GEMMLowpOutputStageInfo info{};
+    info.gemmlowp_min_bound = min;
+    info.gemmlowp_max_bound = max;
+    info.output_data_type   = DataType::QSYMM16;
+    return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info);
 }
 
 void CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info)
@@ -114,32 +145,9 @@ void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, c
     {
         case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
         {
-            switch(info.output_data_type)
-            {
-                case DataType::QASYMM8:
-                {
-                    auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>();
-                    k->configure(compile_context, input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
-                    _kernel = std::move(k);
-                    break;
-                }
-                case DataType::QASYMM8_SIGNED:
-                {
-                    auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>();
-                    k->configure(compile_context, input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
-                    _kernel = std::move(k);
-                    break;
-                }
-                case DataType::QSYMM16:
-                {
-                    auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>();
-                    k->configure(input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
-                    _kernel = std::move(k);
-                    break;
-                }
-                default:
-                    ARM_COMPUTE_ERROR("Unsupported output data type.");
-            }
+            auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel>();
+            k->configure(compile_context, input, bias, output, &info);
+            _kernel = std::move(k);
             break;
         }
         case GEMMLowpOutputStageType::QUANTIZE_DOWN:
@@ -169,19 +177,7 @@ Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorIn
     switch(info.type)
     {
         case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
-        {
-            switch(output->data_type())
-            {
-                case DataType::QASYMM8:
-                    return CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
-                case DataType::QASYMM8_SIGNED:
-                    return CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
-                case DataType::QSYMM16:
-                    return CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
-                default:
-                    return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type.");
-            }
-        }
+            return CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(input, bias, output, &info);
         case GEMMLowpOutputStageType::QUANTIZE_DOWN:
             return CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info);
         case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT:
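The three per-type kernels collapse above into a single CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel driven by GEMMLowpOutputStageInfo::output_data_type. Each variant performs the same fixed-point quantize-down; a simplified scalar version of that operation, ignoring the saturating edge cases the real kernel handles:

    #include <algorithm>
    #include <cstdint>

    // Multiply by a Q0.31 fixed-point multiplier with round-to-nearest (doubling high mul).
    int32_t rounding_doubling_high_mul(int32_t a, int32_t multiplier)
    {
        const int64_t prod = static_cast<int64_t>(a) * multiplier;
        return static_cast<int32_t>((prod + (int64_t(1) << 30)) >> 31);
    }

    int32_t quantize_down(int32_t acc, int32_t multiplier, int32_t shift, int32_t offset,
                          int32_t min_bound, int32_t max_bound)
    {
        int32_t v = rounding_doubling_high_mul(acc, multiplier);
        if(shift > 0)
        {
            v = (v + (1 << (shift - 1))) >> shift; // rounding right shift
        }
        v += offset; // gemmlowp_offset (zero for the symmetric QSYMM16 case)
        return std::min(max_bound, std::max(min_bound, v)); // gemmlowp_min/max_bound
    }

Only the clamp bounds and the presence of the offset differ between the QASYMM8, QASYMM8_SIGNED and QSYMM16 paths, which is why a single kernel parameterised by the info struct suffices.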
diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp
index d9b6679ebf..de6296f6a3 100644
--- a/src/runtime/CL/functions/CLGather.cpp
+++ b/src/runtime/CL/functions/CLGather.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLGather.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
+#include "src/core/CL/kernels/CLGatherKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/CL/functions/CLGaussian3x3.cpp b/src/runtime/CL/functions/CLGaussian3x3.cpp
index c62e200315..97db9ba06d 100644
--- a/src/runtime/CL/functions/CLGaussian3x3.cpp
+++ b/src/runtime/CL/functions/CLGaussian3x3.cpp
@@ -23,8 +23,9 @@
  */
 #include "arm_compute/runtime/CL/functions/CLGaussian3x3.h"
 
-#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLGaussian3x3Kernel.h"
 #include "support/MemorySupport.h"
 
 #include
@@ -41,5 +42,5 @@ void CLGaussian3x3::configure(const CLCompileContext &compile_context, ICLTensor
     auto k = arm_compute::support::cpp14::make_unique<CLGaussian3x3Kernel>();
     k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
 }
diff --git a/src/runtime/CL/functions/CLGaussian5x5.cpp b/src/runtime/CL/functions/CLGaussian5x5.cpp
index 1fe2fddfb6..f7470d4ecf 100644
--- a/src/runtime/CL/functions/CLGaussian5x5.cpp
+++ b/src/runtime/CL/functions/CLGaussian5x5.cpp
@@ -24,22 +24,30 @@
 #include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/ITensorAllocator.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLGaussian5x5Kernel.h"
+#include "support/MemorySupport.h"
 
 #include
 
 using namespace arm_compute;
 
 CLGaussian5x5::CLGaussian5x5(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _kernel_hor(), _kernel_vert(), _border_handler(), _tmp()
+    : _memory_group(std::move(memory_manager)),
+      _kernel_hor(support::cpp14::make_unique<CLGaussian5x5HorKernel>()),
+      _kernel_vert(support::cpp14::make_unique<CLGaussian5x5VertKernel>()),
+      _border_handler(support::cpp14::make_unique<CLFillBorderKernel>()),
+      _tmp()
 {
 }
 
+CLGaussian5x5::~CLGaussian5x5() = default;
+
 void CLGaussian5x5::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, border_mode, constant_border_value);
@@ -55,9 +63,9 @@ void CLGaussian5x5::configure(const CLCompileContext &compile_context, ICLTensor
     _memory_group.manage(&_tmp);
 
     // Configure kernels
-    _kernel_hor.configure(compile_context, input, &_tmp, border_mode == BorderMode::UNDEFINED);
-    _kernel_vert.configure(compile_context, &_tmp, output, border_mode == BorderMode::UNDEFINED);
-    _border_handler.configure(compile_context, input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+    _kernel_hor->configure(compile_context, input, &_tmp, border_mode == BorderMode::UNDEFINED);
+    _kernel_vert->configure(compile_context, &_tmp, output, border_mode == BorderMode::UNDEFINED);
+    _border_handler->configure(compile_context, input, _kernel_hor->border_size(), border_mode, PixelValue(constant_border_value));
 
     // Allocate intermediate buffers
     _tmp.allocator()->allocate();
@@ -65,10 +73,10 @@ void CLGaussian5x5::configure(const CLCompileContext &compile_context, ICLTensor
 
 void CLGaussian5x5::run()
 {
-    CLScheduler::get().enqueue(_border_handler, false);
+    CLScheduler::get().enqueue(*_border_handler, false);
 
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    CLScheduler::get().enqueue(_kernel_hor, false);
-    CLScheduler::get().enqueue(_kernel_vert);
+    CLScheduler::get().enqueue(*_kernel_hor, false);
+    CLScheduler::get().enqueue(*_kernel_vert);
 }
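CLGaussian5x5 runs a horizontal and then a vertical pass through the intermediate _tmp tensor because the 5x5 Gaussian is separable: typically the binomial row [1 4 6 4 1] outer-multiplied with itself and normalised by 256. A quick check of that factorisation (illustrative; the library's exact coefficients live in the kernel):

    #include <cstdio>

    int main()
    {
        const int g[5] = { 1, 4, 6, 4, 1 }; // 1-D binomial approximation of a Gaussian
        int sum = 0;
        for(int y = 0; y < 5; ++y)
        {
            for(int x = 0; x < 5; ++x)
            {
                std::printf("%3d ", g[y] * g[x]); // outer product reproduces the 5x5 matrix
                sum += g[y] * g[x];
            }
            std::printf("\n");
        }
        std::printf("normalisation = %d\n", sum); // 256
        return 0;
    }

Two 1-D passes cost 10 multiply-accumulates per pixel instead of 25 for the direct 2-D filter, at the price of one intermediate tensor.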
diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp
index 297d535ba5..66b85352c1 100644
--- a/src/runtime/CL/functions/CLGaussianPyramid.cpp
+++ b/src/runtime/CL/functions/CLGaussianPyramid.cpp
@@ -24,19 +24,21 @@
 #include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h"
-#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-
 #include "arm_compute/runtime/CL/CLPyramid.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
 #include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLGaussian5x5Kernel.h"
+#include "src/core/CL/kernels/CLGaussianPyramidKernel.h"
+#include "src/core/CL/kernels/CLScaleKernel.h"
+#include "support/MemorySupport.h"
 
 #include
 
@@ -47,6 +49,8 @@ CLGaussianPyramid::CLGaussianPyramid()
 {
 }
 
+CLGaussianPyramid::~CLGaussianPyramid() = default;
+
 CLGaussianPyramidHalf::CLGaussianPyramidHalf() // NOLINT
     : _horizontal_border_handler(),
       _vertical_border_handler(),
@@ -55,6 +59,8 @@ CLGaussianPyramidHalf::CLGaussianPyramidHalf() // NOLINT
 {
 }
 
+CLGaussianPyramidHalf::~CLGaussianPyramidHalf() = default;
+
 void CLGaussianPyramidHalf::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, pyramid, border_mode, constant_border_value);
@@ -80,10 +86,10 @@ void CLGaussianPyramidHalf::configure(const CLCompileContext &compile_context, I
 
     if(num_levels > 1)
     {
-        _horizontal_border_handler.resize(num_levels - 1);
-        _vertical_border_handler.resize(num_levels - 1);
-        _horizontal_reduction.resize(num_levels - 1);
-        _vertical_reduction.resize(num_levels - 1);
+        _horizontal_border_handler.reserve(num_levels - 1);
+        _vertical_border_handler.reserve(num_levels - 1);
+        _horizontal_reduction.reserve(num_levels - 1);
+        _vertical_reduction.reserve(num_levels - 1);
 
         // Apply half scale to the X dimension of the tensor shape
         TensorShape tensor_shape = pyramid->info()->tensor_shape();
@@ -95,16 +101,20 @@ void CLGaussianPyramidHalf::configure(const CLCompileContext &compile_context, I
         for(size_t i = 0; i < num_levels - 1; ++i)
         {
             /* Configure horizontal kernel */
-            _horizontal_reduction[i].configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
+            _horizontal_reduction.emplace_back(support::cpp14::make_unique<CLGaussianPyramidHorKernel>());
+            _horizontal_reduction.back()->configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
 
             /* Configure vertical kernel */
-            _vertical_reduction[i].configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
+            _vertical_reduction.emplace_back(support::cpp14::make_unique<CLGaussianPyramidVertKernel>());
+            _vertical_reduction.back()->configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
 
             /* Configure border */
-            _horizontal_border_handler[i].configure(compile_context, _pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+            _horizontal_border_handler.emplace_back(support::cpp14::make_unique<CLFillBorderKernel>());
+            _horizontal_border_handler.back()->configure(compile_context, _pyramid->get_pyramid_level(i), _horizontal_reduction.back()->border_size(), border_mode, PixelValue(constant_border_value));
 
             /* Configure border */
-            _vertical_border_handler[i].configure(compile_context, _tmp.get_pyramid_level(i), _vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16));
+            _vertical_border_handler.emplace_back(support::cpp14::make_unique<CLFillBorderKernel>());
+            _vertical_border_handler.back()->configure(compile_context, _tmp.get_pyramid_level(i), _vertical_reduction.back()->border_size(), border_mode, PixelValue(pixel_value_u16));
         }
         _tmp.allocate();
     }
@@ -127,10 +137,10 @@ void CLGaussianPyramidHalf::run()
 
     for(unsigned int i = 0; i < num_levels - 1; ++i)
     {
-        CLScheduler::get().enqueue(_horizontal_border_handler[i], false);
-        CLScheduler::get().enqueue(_horizontal_reduction[i], false);
-        CLScheduler::get().enqueue(_vertical_border_handler[i], false);
-        CLScheduler::get().enqueue(_vertical_reduction[i], false);
+        CLScheduler::get().enqueue(*_horizontal_border_handler[i], false);
+        CLScheduler::get().enqueue(*_horizontal_reduction[i], false);
+        CLScheduler::get().enqueue(*_vertical_border_handler[i], false);
+        CLScheduler::get().enqueue(*_vertical_reduction[i], false);
     }
 }
 
@@ -163,7 +173,7 @@ void CLGaussianPyramidOrb::configure(const CLCompileContext &compile_context, IC
     if(num_levels > 1)
     {
         _gauss5x5.resize(num_levels - 1);
-        _scale_nearest.resize(num_levels - 1);
+        _scale_nearest.reserve(num_levels - 1);
 
         PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
 
@@ -175,7 +185,8 @@ void CLGaussianPyramidOrb::configure(const CLCompileContext &compile_context, IC
         _gauss5x5[i].configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
 
         /* Configure scale image kernel */
-        _scale_nearest[i].configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, PixelValue(), SamplingPolicy::CENTER });
+        _scale_nearest.emplace_back(support::cpp14::make_unique<CLScaleKernel>());
+        _scale_nearest.back()->configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, PixelValue(), SamplingPolicy::CENTER });
     }
 
     _tmp.allocate();
@@ -199,6 +210,6 @@ void CLGaussianPyramidOrb::run()
     for(unsigned int i = 0; i < num_levels - 1; ++i)
     {
         _gauss5x5[i].run();
-        CLScheduler::get().enqueue(_scale_nearest[i]);
+        CLScheduler::get().enqueue(*_scale_nearest[i]);
    }
 }
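The resize()-to-reserve() change above falls out of the same unique_ptr migration: the border and reduction members are now vectors of std::unique_ptr to kernels, and resize() would only fill them with null pointers, so each level's kernel is constructed and configured via emplace_back instead. A minimal sketch, with a hypothetical LevelKernel type:

    #include <memory>
    #include <vector>

    struct LevelKernel
    {
        void configure(unsigned int level) { (void)level; }
    };

    int main()
    {
        const unsigned int num_levels = 4;
        std::vector<std::unique_ptr<LevelKernel>> kernels;
        kernels.reserve(num_levels - 1); // resize() would default-construct nullptrs here
        for(unsigned int i = 0; i < num_levels - 1; ++i)
        {
            kernels.emplace_back(new LevelKernel()); // the library uses support::cpp14::make_unique
            kernels.back()->configure(i);
        }
        return 0;
    }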
@@ -163,7 +173,7 @@ void CLGaussianPyramidOrb::configure(const CLCompileContext &compile_context, IC
 if(num_levels > 1)
 {
 _gauss5x5.resize(num_levels - 1);
- _scale_nearest.resize(num_levels - 1);
+ _scale_nearest.reserve(num_levels - 1);
 PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
@@ -175,7 +185,8 @@ void CLGaussianPyramidOrb::configure(const CLCompileContext &compile_context, IC
 _gauss5x5[i].configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
 /* Configure scale image kernel */
- _scale_nearest[i].configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, PixelValue(), SamplingPolicy::CENTER });
+ _scale_nearest.emplace_back(support::cpp14::make_unique<CLScaleKernel>());
+ _scale_nearest.back()->configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, PixelValue(), SamplingPolicy::CENTER });
 }
 _tmp.allocate();
@@ -199,6 +210,6 @@ void CLGaussianPyramidOrb::run()
 for(unsigned int i = 0; i < num_levels - 1; ++i)
 {
 _gauss5x5[i].run();
- CLScheduler::get().enqueue(_scale_nearest[i]);
+ CLScheduler::get().enqueue(*_scale_nearest[i]);
 }
 }
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
index 45dc402449..87bf39030a 100644
--- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -25,21 +25,29 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
+#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"
+#include "src/core/CL/kernels/CLDequantizationLayerKernel.h"
+#include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h"
+#include "src/core/CL/kernels/CLPadLayerKernel.h"
+#include "src/core/CL/kernels/CLPermuteKernel.h"
+#include "src/core/CL/kernels/CLQuantizationLayerKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "support/MemorySupport.h"
 namespace arm_compute
 {
 CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr<IMemoryManager> memory_manager)
 : _memory_group(memory_manager),
- _permute_deltas_kernel(),
+ _permute_deltas_kernel(support::cpp14::make_unique<CLPermuteKernel>()),
 _flatten_deltas(),
- _permute_scores_kernel(),
+ _permute_scores_kernel(support::cpp14::make_unique<CLPermuteKernel>()),
 _flatten_scores(),
- _compute_anchors_kernel(),
- _bounding_box_kernel(),
- _pad_kernel(),
- _dequantize_anchors(),
- _dequantize_deltas(),
- _quantize_all_proposals(),
+ _compute_anchors_kernel(support::cpp14::make_unique<CLComputeAllAnchorsKernel>()),
+ _bounding_box_kernel(support::cpp14::make_unique<CLBoundingBoxTransformKernel>()),
+ _pad_kernel(support::cpp14::make_unique<CLPadLayerKernel>()),
+ _dequantize_anchors(support::cpp14::make_unique<CLDequantizationLayerKernel>()),
+ _dequantize_deltas(support::cpp14::make_unique<CLDequantizationLayerKernel>()),
+ _quantize_all_proposals(support::cpp14::make_unique<CLQuantizationLayerKernel>()),
 _cpp_nms(memory_manager),
 _is_nhwc(false),
 _is_qasymm8(false),
@@ -61,6 +69,8 @@ CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr<IMemoryManager>
 _compute_anchors_kernel->configure(compile_context, anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
 const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors);
 _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
@@ -101,7 +111,7 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context
 if(!_is_nhwc)
 {
 _memory_group.manage(&_deltas_permuted);
- _permute_deltas_kernel.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
+ _permute_deltas_kernel->configure(compile_context, deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
 _flatten_deltas.configure(compile_context, &_deltas_permuted, &_deltas_flattened);
 _deltas_permuted.allocator()->allocate();
 }
@@ -118,7 +128,7 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context
 if(!_is_nhwc)
 {
 _memory_group.manage(&_scores_permuted);
- _permute_scores_kernel.configure(compile_context, scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
+ _permute_scores_kernel->configure(compile_context, scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
 _flatten_scores.configure(compile_context, &_scores_permuted, &_scores_flattened);
 _scores_permuted.allocator()->allocate();
 }
@@ -136,18 +146,18 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context
 _memory_group.manage(&_all_anchors_f32);
 _memory_group.manage(&_deltas_flattened_f32);
 // Dequantize anchors to float
- _dequantize_anchors.configure(compile_context, &_all_anchors, &_all_anchors_f32);
+ _dequantize_anchors->configure(compile_context, &_all_anchors, &_all_anchors_f32);
 _all_anchors.allocator()->allocate();
 anchors_to_use = &_all_anchors_f32;
 // Dequantize deltas to float
- _dequantize_deltas.configure(compile_context, &_deltas_flattened, &_deltas_flattened_f32);
+ _dequantize_deltas->configure(compile_context, &_deltas_flattened, &_deltas_flattened_f32);
 _deltas_flattened.allocator()->allocate();
 deltas_to_use = &_deltas_flattened_f32;
 }
 // Bounding box transform
 _memory_group.manage(&_all_proposals);
 BoundingBoxTransformInfo bbox_info(info.im_width(), info.im_height(), 1.f);
- _bounding_box_kernel.configure(compile_context, anchors_to_use, &_all_proposals, deltas_to_use, bbox_info);
+ _bounding_box_kernel->configure(compile_context, anchors_to_use, &_all_proposals, deltas_to_use, bbox_info);
 deltas_to_use->allocator()->allocate();
 anchors_to_use->allocator()->allocate();
@@ -157,7 +167,7 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context
 _memory_group.manage(&_all_proposals_quantized);
 // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset
 _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
- _quantize_all_proposals.configure(compile_context, &_all_proposals, &_all_proposals_quantized);
+ _quantize_all_proposals->configure(compile_context, &_all_proposals, &_all_proposals_quantized);
 _all_proposals.allocator()->allocate();
 _all_proposals_to_use = &_all_proposals_quantized;
 }
@@ -192,7 +202,7 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context
 _scores_flattened.allocator()->allocate();
 // Add the first column that represents the batch id. This will be all zeros, as we don't support multiple images
- _pad_kernel.configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
+ _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
 _proposals_4_roi_values.allocator()->allocate();
 }
@@ -342,34 +352,34 @@ void CLGenerateProposalsLayer::run()
 MemoryGroupResourceScope scope_mg(_memory_group);
 // Compute all the anchors
- CLScheduler::get().enqueue(_compute_anchors_kernel, false);
+ CLScheduler::get().enqueue(*_compute_anchors_kernel, false);
 // Transpose and reshape the inputs
 if(!_is_nhwc)
 {
- CLScheduler::get().enqueue(_permute_deltas_kernel, false);
- CLScheduler::get().enqueue(_permute_scores_kernel, false);
+ CLScheduler::get().enqueue(*_permute_deltas_kernel, false);
+ CLScheduler::get().enqueue(*_permute_scores_kernel, false);
 }
 _flatten_deltas.run();
 _flatten_scores.run();
 if(_is_qasymm8)
 {
- CLScheduler::get().enqueue(_dequantize_anchors, false);
- CLScheduler::get().enqueue(_dequantize_deltas, false);
+ CLScheduler::get().enqueue(*_dequantize_anchors, false);
+ CLScheduler::get().enqueue(*_dequantize_deltas, false);
 }
 // Build the boxes
- CLScheduler::get().enqueue(_bounding_box_kernel, false);
+ CLScheduler::get().enqueue(*_bounding_box_kernel, false);
 if(_is_qasymm8)
 {
- CLScheduler::get().enqueue(_quantize_all_proposals, false);
+ CLScheduler::get().enqueue(*_quantize_all_proposals, false);
 }
 // Non maxima suppression
 run_cpp_nms_kernel();
 // Add dummy batch indexes
- CLScheduler::get().enqueue(_pad_kernel, true);
+ CLScheduler::get().enqueue(*_pad_kernel, true);
 }
 } // namespace arm_compute
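A detail worth noting in the run() body above: every intermediate kernel is enqueued with the flush flag set to false and only the final kernel (here the pad kernel) passes true, so the command queue is flushed once per invocation. A sketch of that batching idea with stand-in types (not ACL's scheduler):

#include <cstddef>
#include <vector>

struct Kernel
{
};

struct Queue
{
    void enqueue(Kernel &kernel, bool flush)
    {
        static_cast<void>(kernel); // record the work item
        if(flush)
        {
            // hand the whole batch to the device (clFlush in an OpenCL backend)
        }
    }
};

void run_pipeline(Queue &q, std::vector<Kernel> &kernels)
{
    for(std::size_t i = 0; i < kernels.size(); ++i)
    {
        const bool is_last = (i + 1 == kernels.size());
        q.enqueue(kernels[i], is_last); // batch intermediates, flush on the last one
    }
}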
diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp
index 21fa6690ea..80026532ab 100644
--- a/src/runtime/CL/functions/CLHOGDescriptor.cpp
+++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp
@@ -28,14 +28,26 @@
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLHOGDescriptorKernel.h"
+#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h"
+#include "support/MemorySupport.h"
 using namespace arm_compute;
 CLHOGDescriptor::CLHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
+ : _memory_group(std::move(memory_manager)),
+ _gradient(),
+ _orient_bin(support::cpp14::make_unique<CLHOGOrientationBinningKernel>()),
+ _block_norm(support::cpp14::make_unique<CLHOGBlockNormalizationKernel>()),
+ _mag(),
+ _phase(),
+ _hog_space()
 {
 }
+CLHOGDescriptor::~CLHOGDescriptor() = default;
+
 void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value)
 {
 configure(CLKernelLibrary::get().get_compile_context(), input, output, hog, border_mode, constant_border_value);
@@ -87,10 +99,10 @@ void CLHOGDescriptor::configure(const CLCompileContext &compile_context, ICLTens
 _memory_group.manage(&_hog_space);
 // Initialise orientation binning kernel
- _orient_bin.configure(compile_context, &_mag, &_phase, &_hog_space, hog->info());
+ _orient_bin->configure(compile_context, &_mag, &_phase, &_hog_space, hog->info());
 // Initialize HOG norm kernel
- _block_norm.configure(compile_context, &_hog_space, output, hog->info());
+ _block_norm->configure(compile_context, &_hog_space, output, hog->info());
 // Allocate intermediate tensors
 _mag.allocator()->allocate();
@@ -106,8 +118,8 @@ void CLHOGDescriptor::run()
 _gradient.run();
 // Run orientation binning
- CLScheduler::get().enqueue(_orient_bin, false);
+ CLScheduler::get().enqueue(*_orient_bin, false);
 // Run block normalization
- CLScheduler::get().enqueue(_block_norm);
+ CLScheduler::get().enqueue(*_block_norm);
 }
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGDetector.cpp b/src/runtime/CL/functions/CLHOGDetector.cpp
index 9188f654dc..07ae8151c0 100644
--- a/src/runtime/CL/functions/CLHOGDetector.cpp
+++ b/src/runtime/CL/functions/CLHOGDetector.cpp
@@ -23,19 +23,22 @@
 */
 #include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
-#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLHOGDetectorKernel.h"
+#include "support/MemorySupport.h"
 #include <algorithm>
 using namespace arm_compute;
 CLHOGDetector::CLHOGDetector()
- : _hog_detector_kernel(), _detection_windows(nullptr), _num_detection_windows()
+ : _hog_detector_kernel(support::cpp14::make_unique<CLHOGDetectorKernel>()), _detection_windows(nullptr), _num_detection_windows()
 {
 }
+CLHOGDetector::~CLHOGDetector() = default;
+
 void CLHOGDetector::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class)
 {
 configure(CLKernelLibrary::get().get_compile_context(), input, hog, detection_windows, detection_window_stride, threshold, idx_class);
@@ -50,7 +53,7 @@ void CLHOGDetector::configure(const CLCompileContext &compile_context, const ICL
 _num_detection_windows = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int));
 // Configure HOGDetectorKernel
- _hog_detector_kernel.configure(compile_context, input, hog, detection_windows, &_num_detection_windows, detection_window_stride, threshold, idx_class);
+ _hog_detector_kernel->configure(compile_context, input, hog, detection_windows, &_num_detection_windows, detection_window_stride, threshold, idx_class);
 }
 void CLHOGDetector::run()
@@ -62,7 +65,7 @@ void CLHOGDetector::run()
 q.enqueueWriteBuffer(_num_detection_windows, CL_FALSE, 0, sizeof(unsigned int), &init_num_detection_windows);
 // Run CLHOGDetectorKernel
- CLScheduler::get().enqueue(_hog_detector_kernel);
+ CLScheduler::get().enqueue(*_hog_detector_kernel);
 // Read number of detections
 unsigned int num_detection_windows = 0;
diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp
index 934d1f6351..5f3b9cf529 100644
--- a/src/runtime/CL/functions/CLHOGGradient.cpp
+++ b/src/runtime/CL/functions/CLHOGGradient.cpp
@@ -26,11 +26,18 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h"
+#include "support/MemorySupport.h"
 using namespace arm_compute;
 CLHOGGradient::CLHOGGradient(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _derivative(), _mag_phase(), _gx(), _gy()
+ : _memory_group(std::move(memory_manager)),
+ _derivative(),
+ _mag_phase(support::cpp14::make_unique<CLMagnitudePhaseKernel>()),
+ _gx(),
+ _gy()
 {
 }
@@ -63,11 +70,11 @@ void CLHOGGradient::configure(const CLCompileContext &compile_context, ICLTensor
 // Initialise magnitude/phase kernel
 if(PhaseType::UNSIGNED == phase_type)
 {
- _mag_phase.configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::UNSIGNED);
+ _mag_phase->configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::UNSIGNED);
 }
 else
 {
- _mag_phase.configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::SIGNED);
+ _mag_phase->configure(compile_context, &_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::SIGNED);
 }
 // Allocate intermediate tensors
@@ -83,5 +90,5 @@ void CLHOGGradient::run()
 _derivative.run();
 // Run magnitude/phase kernel
- CLScheduler::get().enqueue(_mag_phase);
+ CLScheduler::get().enqueue(*_mag_phase);
 }
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
index 51db43cd71..dfc90537cf 100644
--- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp
+++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
@@ -30,6 +30,11 @@
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/Scheduler.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLHOGDescriptorKernel.h"
+#include "src/core/CL/kernels/CLHOGDetectorKernel.h"
+#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h"
+#include "support/MemorySupport.h"
 using namespace arm_compute;
@@ -52,6 +57,8 @@ CLHOGMultiDetection::CLHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_
 {
 }
+CLHOGMultiDetection::~CLHOGMultiDetection() = default;
+
 void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, BorderMode border_mode,
 uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance)
 {
@@ -135,8 +142,8 @@ void CLHOGMultiDetection::configure(const CLCompileContext &compile_context, ICL
 _num_block_norm_kernel = input_block_norm.size(); // Number of CLHOGBlockNormalizationKernel kernels to compute
 _num_hog_detect_kernel = input_hog_detect.size(); // Number of CLHOGDetector functions to compute
- _orient_bin_kernel.resize(_num_orient_bin_kernel);
- _block_norm_kernel.resize(_num_block_norm_kernel);
+ _orient_bin_kernel.reserve(_num_orient_bin_kernel);
+ _block_norm_kernel.reserve(_num_block_norm_kernel);
 _hog_detect_kernel.resize(_num_hog_detect_kernel);
 _hog_space.resize(_num_orient_bin_kernel);
 _hog_norm_space.resize(_num_block_norm_kernel);
@@ -181,7 +188,8 @@ void CLHOGMultiDetection::configure(const CLCompileContext &compile_context, ICL
 _memory_group.manage(&_hog_space[i]);
 // Initialise orientation binning kernel
- _orient_bin_kernel[i].configure(compile_context, &_mag, &_phase, &_hog_space[i], multi_hog->model(idx_multi_hog)->info());
+ _orient_bin_kernel.emplace_back(support::cpp14::make_unique<CLHOGOrientationBinningKernel>());
+ _orient_bin_kernel.back()->configure(compile_context, &_mag, &_phase, &_hog_space[i], multi_hog->model(idx_multi_hog)->info());
 }
 // Allocate intermediate tensors
@@ -202,7 +210,8 @@ void CLHOGMultiDetection::configure(const CLCompileContext &compile_context, ICL
 _memory_group.manage(&_hog_norm_space[i]);
 // Initialize block normalization kernel
- _block_norm_kernel[i].configure(compile_context, &_hog_space[idx_orient_bin], &_hog_norm_space[i], multi_hog->model(idx_multi_hog)->info());
+ _block_norm_kernel.emplace_back(support::cpp14::make_unique<CLHOGBlockNormalizationKernel>());
+ _block_norm_kernel.back()->configure(compile_context, &_hog_space[idx_orient_bin], &_hog_norm_space[i], multi_hog->model(idx_multi_hog)->info());
 }
 // Allocate intermediate tensors
@@ -248,13 +257,13 @@ void CLHOGMultiDetection::run()
 // Run orientation binning kernel
 for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
 {
- CLScheduler::get().enqueue(_orient_bin_kernel[i], false);
+ CLScheduler::get().enqueue(*_orient_bin_kernel[i], false);
 }
 // Run block normalization kernel
 for(size_t i = 0; i < _num_block_norm_kernel; ++i)
 {
- CLScheduler::get().enqueue(_block_norm_kernel[i], false);
+ CLScheduler::get().enqueue(*_block_norm_kernel[i], false);
 }
 // Run HOG detector kernel
diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp
index 45b93a5be0..9d8ebceb30 100644
--- a/src/runtime/CL/functions/CLHarrisCorners.cpp
+++ b/src/runtime/CL/functions/CLHarrisCorners.cpp
@@ -24,8 +24,6 @@
 #include "arm_compute/runtime/CL/functions/CLHarrisCorners.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
@@ -35,6 +33,10 @@
 #include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
 #include "arm_compute/runtime/ITensorAllocator.h"
 #include "arm_compute/runtime/Scheduler.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLHarrisCornersKernel.h"
+#include "src/core/CL/kernels/CLSobel5x5Kernel.h"
+#include "src/core/CL/kernels/CLSobel7x7Kernel.h"
 #include "support/MemorySupport.h"
 #include <cmath>
@@ -45,12 +47,12 @@ using namespace arm_compute;
 CLHarrisCorners::CLHarrisCorners(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
 : _memory_group(std::move(memory_manager)),
 _sobel(nullptr),
- _harris_score(),
+ _harris_score(support::cpp14::make_unique<CLHarrisScoreKernel>()),
 _non_max_suppr(),
 _candidates(),
 _sort_euclidean(),
- _border_gx(),
- _border_gy(),
+ _border_gx(support::cpp14::make_unique<CLFillBorderKernel>()),
+ _border_gy(support::cpp14::make_unique<CLFillBorderKernel>()),
 _gx(),
 _gy(),
 _score(),
@@ -61,6 +63,8 @@ CLHarrisCorners::CLHarrisCorners(std::shared_ptr<IMemoryManager> memory_manager)
 {
 }
+CLHarrisCorners::~CLHarrisCorners() = default;
+
 void CLHarrisCorners::configure(ICLImage *input, float threshold, float min_dist, float sensitivity,
 int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners,
 BorderMode border_mode, uint8_t constant_border_value, bool use_fp16)
@@ -133,11 +137,11 @@ void CLHarrisCorners::configure(const CLCompileContext &compile_context, ICLImag
 _memory_group.manage(&_score);
 // Set/init Harris Score kernel accordingly with block_size
- _harris_score.configure(compile_context, &_gx, &_gy, &_score, block_size, pow4_normalization_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+ _harris_score->configure(compile_context, &_gx, &_gy, &_score, block_size, pow4_normalization_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
 // Configure border filling using harris score kernel's block size
- _border_gx.configure(compile_context, &_gx, _harris_score.border_size(), border_mode, PixelValue(constant_border_value));
- _border_gy.configure(compile_context, &_gy, _harris_score.border_size(), border_mode, PixelValue(constant_border_value));
+ _border_gx->configure(compile_context, &_gx, _harris_score->border_size(), border_mode, PixelValue(constant_border_value));
+ _border_gy->configure(compile_context, &_gy, _harris_score->border_size(), border_mode, PixelValue(constant_border_value));
 // Allocate intermediate buffers
 _gx.allocator()->allocate();
@@ -175,11 +179,11 @@ void CLHarrisCorners::run()
 _sobel->run();
 // Fill border before harris score kernel
- CLScheduler::get().enqueue(_border_gx, false);
- CLScheduler::get().enqueue(_border_gy, false);
+ CLScheduler::get().enqueue(*_border_gx, false);
+ CLScheduler::get().enqueue(*_border_gy, false);
 // Run harris score kernel
- CLScheduler::get().enqueue(_harris_score, false);
+ CLScheduler::get().enqueue(*_harris_score, false);
 // Run non-maxima suppression
 _non_max_suppr.run();
diff --git a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
index fce1fe43a2..bd680f448d 100644
--- a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
@@ -23,8 +23,10 @@
 */
 #include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h"
-#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h"
 #include "arm_compute/core/Types.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h"
+#include "support/MemorySupport.h"
 namespace arm_compute
 {
diff --git a/src/runtime/CL/functions/CLIntegralImage.cpp b/src/runtime/CL/functions/CLIntegralImage.cpp
index 8561494242..41e47e77c7 100644
--- a/src/runtime/CL/functions/CLIntegralImage.cpp
+++ b/src/runtime/CL/functions/CLIntegralImage.cpp
@@ -23,16 +23,20 @@
 */
 #include "arm_compute/runtime/CL/functions/CLIntegralImage.h"
-#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLIntegralImageKernel.h"
+#include "support/MemorySupport.h"
 using namespace arm_compute;
 CLIntegralImage::CLIntegralImage()
- : _integral_hor(), _integral_vert()
+ : _integral_hor(support::cpp14::make_unique<CLIntegralImageHorKernel>()),
+ _integral_vert(support::cpp14::make_unique<CLIntegralImageVertKernel>())
 {
 }
+CLIntegralImage::~CLIntegralImage() = default;
+
 void CLIntegralImage::configure(const ICLTensor *input, ICLTensor *output)
 {
 configure(CLKernelLibrary::get().get_compile_context(), input, output);
@@ -40,12 +44,12 @@ void CLIntegralImage::configure(const ICLTensor *input, ICLTensor *output)
 void CLIntegralImage::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
 {
- _integral_hor.configure(compile_context, input, output);
- _integral_vert.configure(compile_context, output);
+ _integral_hor->configure(compile_context, input, output);
+ _integral_vert->configure(compile_context, output);
 }
 void CLIntegralImage::run()
 {
- CLScheduler::get().enqueue(_integral_hor, false);
- CLScheduler::get().enqueue(_integral_vert);
+ CLScheduler::get().enqueue(*_integral_hor, false);
+ CLScheduler::get().enqueue(*_integral_vert);
 }
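The include moves in these hunks all follow one rule: kernel headers leave the public arm_compute/core/CL/kernels/ tree for the internal src/core/CL/kernels/ tree, so the public function headers see only forward declarations plus unique_ptr members. That is also why the patch keeps adding out-of-line `~CLFoo() = default;` destructors. A self-contained sketch of why the destructor must live next to the complete type (hypothetical names, single translation unit standing in for the header/source split):

#include <memory>

class MyKernel; // all a public header needs to know

class Function
{
public:
    Function();
    ~Function(); // cannot be defaulted here: ~unique_ptr needs the complete type
private:
    std::unique_ptr<MyKernel> _kernel;
};

// In the real library this part sits in the .cpp, next to the internal header.
class MyKernel
{
};

Function::Function() : _kernel(std::make_unique<MyKernel>()) {}
Function::~Function() = default; // MyKernel is complete here, so this compiles

int main()
{
    Function f;
}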
"src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h" +#include "src/core/CL/kernels/CLReductionOperationKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { @@ -39,10 +42,15 @@ constexpr int max_input_tensor_dim = 3; } // namespace CLL2NormalizeLayer::CLL2NormalizeLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq() + : _memory_group(std::move(memory_manager)), + _reduce_func(), + _normalize_kernel(support::cpp14::make_unique()), + _sumsq() { } +CLL2NormalizeLayer::~CLL2NormalizeLayer() = default; + void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, int axis, float epsilon) { configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, epsilon); @@ -59,7 +67,7 @@ void CLL2NormalizeLayer::configure(const CLCompileContext &compile_context, ICLT // Configure kernels const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim); _reduce_func.configure(compile_context, input, &_sumsq, actual_axis, ReductionOperation::SUM_SQUARE); - _normalize_kernel.configure(compile_context, input, &_sumsq, output, axis, epsilon); + _normalize_kernel->configure(compile_context, input, &_sumsq, output, axis, epsilon); // Allocate intermediate tensor _sumsq.allocator()->allocate(); @@ -91,6 +99,6 @@ void CLL2NormalizeLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); _reduce_func.run(); - CLScheduler::get().enqueue(_normalize_kernel, true); + CLScheduler::get().enqueue(*_normalize_kernel, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp index 058b6027c2..b095c06535 100644 --- a/src/runtime/CL/functions/CLLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLLSTMLayer.cpp @@ -29,6 +29,22 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLCopyKernel.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "src/core/CL/kernels/CLMemsetKernel.h" +#include "src/core/CL/kernels/CLTransposeKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { @@ -37,20 +53,23 @@ using namespace arm_compute::utils::info_helpers; CLLSTMLayer::CLLSTMLayer(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(), - _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _transpose_cell_state(), - 
_accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), - _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _projection_clip(), - _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(), - _ones_memset_kernel(), _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), _pixelwise_mul_forget_gate_coeff(), - _accum_forget_gate_bias(), _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), _pixelwise_mul_output_gate_coeff(), - _accum_output_gate_bias(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), - _forget_gate_out5(), _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), - _cell_state_activation(), _output_state1(), _ones(), _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), _cell_layer_norm_out1(), - _cell_layer_norm_out2(), _output_layer_norm_out1(), _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), - _perform_projection_clipping(false), _is_prepared(false), _is_layer_norm_lstm(false) + _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), + _transpose_cell_state(support::cpp14::make_unique()), _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), + _pixelwise_mul_cell_state2(), _fully_connected_output(), _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), + _fully_connected_output_state(), _projection_clip(), _copy_cell_state(support::cpp14::make_unique()), _copy_output(support::cpp14::make_unique()), _concat_scratch_buffer(), + _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(), _ones_memset_kernel(support::cpp14::make_unique()), + _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(), + _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(), + _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(), + _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _cell_state_activation(), _output_state1(), _ones(), + _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), _cell_layer_norm_out1(), 
_cell_layer_norm_out2(), _output_layer_norm_out1(), + _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false), + _is_layer_norm_lstm(false) { } +CLLSTMLayer::~CLLSTMLayer() = default; + void CLLSTMLayer::configure(const ICLTensor *input, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, @@ -172,7 +191,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe { _memory_group.manage(&_input_gate_out1); _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); - _ones_memset_kernel.configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type())); + _ones_memset_kernel->configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type())); _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE); _ones.allocator()->allocate(); _run_cifg_opt = true; @@ -241,7 +260,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _memory_group.manage(&_cell_state_out1); _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1); _memory_group.manage(&_cell_state_out2); - _transpose_cell_state.configure(compile_context, recurrent_to_cell_weights, &_cell_state_out2); + _transpose_cell_state->configure(compile_context, recurrent_to_cell_weights, &_cell_state_out2); _memory_group.manage(&_cell_state_out3); _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f); _cell_state_out2.allocator()->allocate(); @@ -367,8 +386,8 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe } // Copy cell state and output - _copy_cell_state.configure(compile_context, &_cell_state_out1, cell_state_out); - _copy_output.configure(compile_context, output_state_out, output); + _copy_cell_state->configure(compile_context, &_cell_state_out1, cell_state_out); + _copy_output->configure(compile_context, output_state_out, output); // Vector for holding the tensors to store in scratch buffer std::vector scratch_inputs; @@ -642,7 +661,7 @@ void CLLSTMLayer::run() if(_run_cifg_opt) { - CLScheduler::get().enqueue(_ones_memset_kernel); + CLScheduler::get().enqueue(*_ones_memset_kernel); _subtract_input_gate.run(); } else @@ -665,7 +684,7 @@ void CLLSTMLayer::run() } _fully_connected_cell_state.run(); - CLScheduler::get().enqueue(_transpose_cell_state); + CLScheduler::get().enqueue(*_transpose_cell_state); _gemm_cell_state1.run(); _accum_cell_state1.run(); if(_is_layer_norm_lstm) @@ -711,8 +730,8 @@ void CLLSTMLayer::run() } } - CLScheduler::get().enqueue(_copy_cell_state); - CLScheduler::get().enqueue(_copy_output); + CLScheduler::get().enqueue(*_copy_cell_state); + CLScheduler::get().enqueue(*_copy_output); _concat_scratch_buffer.run(); } diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp index e30b1dbb86..46062387e7 100644 --- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp +++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp @@ -27,6 +27,15 @@ #include "arm_compute/core/Utils.h" 
#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "src/core/helpers/AutoConfiguration.h" #include diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp index 81e903cde8..1ad19e56ea 100644 --- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp +++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp @@ -32,6 +32,9 @@ #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h" #include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" #include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGaussian5x5Kernel.h" +#include "src/core/CL/kernels/CLGaussianPyramidKernel.h" using namespace arm_compute; diff --git a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp index cbb952c3f6..d7fd81754b 100644 --- a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp +++ b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp @@ -23,11 +23,13 @@ */ #include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/IPyramid.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp index d501985aef..04e59ac4a6 100644 --- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp @@ -27,6 +27,11 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLCol2ImKernel.h" +#include "src/core/CL/kernels/CLIm2ColKernel.h" +#include "src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLWeightsReshapeKernel.h" +#include "support/MemorySupport.h" #include #include @@ -78,8 +83,16 @@ void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, cons } // namespace CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), - _is_prepared(false), _original_weights(nullptr) + : _memory_group(std::move(memory_manager)), + _input_im2col_kernel(support::cpp14::make_unique()), + _weights_reshape_kernel(support::cpp14::make_unique()), + _mm_kernel(support::cpp14::make_unique()), + _output_col2im_kernel(support::cpp14::make_unique()), + _input_im2col_reshaped(), + _weights_reshaped(), + _gemm_output(), + _is_prepared(false), + _original_weights(nullptr) { } 
@@ -127,10 +140,13 @@ Status CLLocallyConnectedLayer::validate(const ITensorInfo *input, const ITensor return Status{}; } +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info) { configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info); } +#pragma GCC diagnostic pop void CLLocallyConnectedLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info) @@ -166,16 +182,16 @@ void CLLocallyConnectedLayer::configure(const CLCompileContext &compile_context, _memory_group.manage(&_gemm_output); // Configure kernels - _input_im2col_kernel.configure(compile_context, input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias); - _weights_reshape_kernel.configure(compile_context, weights, biases, &_weights_reshaped); - _mm_kernel.configure(compile_context, &_input_im2col_reshaped, &_weights_reshaped, &_gemm_output); - _output_col2im_kernel.configure(compile_context, &_gemm_output, output, Size2D(conv_w, conv_h)); + _input_im2col_kernel->configure(compile_context, input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias); + _weights_reshape_kernel->configure(compile_context, weights, biases, &_weights_reshaped); + _mm_kernel->configure(compile_context, &_input_im2col_reshaped, &_weights_reshaped, &_gemm_output); + _output_col2im_kernel->configure(compile_context, &_gemm_output, output, Size2D(conv_w, conv_h)); // Allocate intermediate tensors _input_im2col_reshaped.allocator()->allocate(); _gemm_output.allocator()->allocate(); - CLScheduler::get().tune_kernel_static(_input_im2col_kernel); + CLScheduler::get().tune_kernel_static(*_input_im2col_kernel); } void CLLocallyConnectedLayer::run() @@ -185,13 +201,13 @@ void CLLocallyConnectedLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Run input reshaping - CLScheduler::get().enqueue(_input_im2col_kernel); + CLScheduler::get().enqueue(*_input_im2col_kernel); // Runs vector matrix multiply on reshaped matrices - CLScheduler::get().enqueue(_mm_kernel); + CLScheduler::get().enqueue(*_mm_kernel); // Reshape output matrix - CLScheduler::get().enqueue(_output_col2im_kernel, false); + CLScheduler::get().enqueue(*_output_col2im_kernel.get(), false); } void CLLocallyConnectedLayer::prepare() @@ -202,7 +218,7 @@ void CLLocallyConnectedLayer::prepare() // Run weights reshaping and mark original weights tensor as unused _weights_reshaped.allocator()->allocate(); - CLScheduler::get().enqueue(_weights_reshape_kernel); + CLScheduler::get().enqueue(*_weights_reshape_kernel); _original_weights->mark_as_unused(); CLScheduler::get().queue().finish(); diff --git a/src/runtime/CL/functions/CLLogicalAnd.cpp b/src/runtime/CL/functions/CLLogicalAnd.cpp new file mode 100644 index 0000000000..55d3dc523b --- /dev/null +++ b/src/runtime/CL/functions/CLLogicalAnd.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLLogicalAnd.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "src/core/CL/kernels/CLElementwiseOperationKernel.h" +#include "support/MemorySupport.h" + +#include + +namespace arm_compute +{ +namespace experimental +{ +void CLLogicalAnd::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output) +{ + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(compile_context, kernels::LogicalOperation::And, input1, input2, output); + _kernel = std::move(k); +} + +Status CLLogicalAnd::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +{ + return CLLogicalBinaryKernel::validate(kernels::LogicalOperation::And, input1, input2, output); +} + +void CLLogicalAnd::run(ITensorPack &tensors) +{ + ICLOperator::run(tensors); +} +} // namespace experimental + +struct CLLogicalAnd::Impl +{ + const ICLTensor *src0{ nullptr }; + const ICLTensor *src1{ nullptr }; + ICLTensor *dst{ nullptr }; + std::unique_ptr op{ nullptr }; +}; + +CLLogicalAnd::CLLogicalAnd() + : _impl(support::cpp14::make_unique()) +{ +} +CLLogicalAnd::CLLogicalAnd(CLLogicalAnd &&) = default; +CLLogicalAnd &CLLogicalAnd::operator=(CLLogicalAnd &&) = default; +CLLogicalAnd::~CLLogicalAnd() = default; + +void CLLogicalAnd::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +{ + configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); +} + +void CLLogicalAnd::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +{ + _impl->src0 = input1; + _impl->src1 = input2; + _impl->dst = output; + _impl->op = arm_compute::support::cpp14::make_unique(); + _impl->op->configure(compile_context, input1->info(), input2->info(), output->info()); +} + +Status CLLogicalAnd::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +{ + return experimental::CLLogicalAnd::validate(input1, input2, output); +} + +void CLLogicalAnd::run() +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _impl->src0); + pack.add_tensor(TensorType::ACL_SRC_1, _impl->src1); + pack.add_tensor(TensorType::ACL_DST, _impl->dst); + + _impl->op->run(pack); +} +} // namespace arm_compute diff --git a/src/runtime/CL/functions/CLLogicalNot.cpp b/src/runtime/CL/functions/CLLogicalNot.cpp new 
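The three new CLLogical* sources share one structure: a lightweight experimental operator that is configured against ITensorInfo only, wrapped by a public function whose Impl binds the concrete tensors and feeds them to the operator at run time through a tensor pack. A reduced sketch of that mechanism with stand-in types (not the ACL classes):

#include <memory>

struct Tensor
{
    // device buffer handle, shape, etc.
};

struct TensorPack // tensors are bound to slots only at run time
{
    Tensor *src0;
    Tensor *src1;
    Tensor *dst;
};

struct Operator // configured once against shapes/types only
{
    void run(TensorPack &pack) { static_cast<void>(pack); /* launch kernel on the pack's tensors */ }
};

class Function // stable public facade; all state lives behind Impl
{
public:
    void configure(Tensor *a, Tensor *b, Tensor *out)
    {
        _impl = std::make_unique<Impl>();
        _impl->src0 = a;
        _impl->src1 = b;
        _impl->dst  = out;
        _impl->op   = std::make_unique<Operator>();
    }
    void run()
    {
        TensorPack pack{ _impl->src0, _impl->src1, _impl->dst };
        _impl->op->run(pack);
    }
private:
    struct Impl
    {
        Tensor *src0{ nullptr };
        Tensor *src1{ nullptr };
        Tensor *dst{ nullptr };
        std::unique_ptr<Operator> op;
    };
    std::unique_ptr<Impl> _impl;
};

Because Impl sits behind a unique_ptr, the public class is move-only, which is why the new files default the move constructor/assignment and the destructor rather than the copy operations.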
diff --git a/src/runtime/CL/functions/CLLogicalNot.cpp b/src/runtime/CL/functions/CLLogicalNot.cpp
new file mode 100644
index 0000000000..67aa3192f8
--- /dev/null
+++ b/src/runtime/CL/functions/CLLogicalNot.cpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLLogicalNot.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "src/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"
+#include "support/MemorySupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace experimental
+{
+void CLLogicalNot::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+ k->configure(compile_context, input, output, ElementWiseUnary::LOGICAL_NOT);
+ _kernel = std::move(k);
+}
+
+Status CLLogicalNot::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::LOGICAL_NOT);
+}
+
+void CLLogicalNot::run(ITensorPack &tensors)
+{
+ ICLOperator::run(tensors);
+}
+} // namespace experimental
+
+struct CLLogicalNot::Impl
+{
+ const ICLTensor *src{ nullptr };
+ ICLTensor *dst{ nullptr };
+ std::unique_ptr<experimental::CLLogicalNot> op{ nullptr };
+};
+
+CLLogicalNot::CLLogicalNot()
+ : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+CLLogicalNot::CLLogicalNot(CLLogicalNot &&) = default;
+CLLogicalNot &CLLogicalNot::operator=(CLLogicalNot &&) = default;
+CLLogicalNot::~CLLogicalNot() = default;
+
+void CLLogicalNot::configure(const ICLTensor *input, ICLTensor *output)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output);
+}
+
+void CLLogicalNot::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+{
+ _impl->src = input;
+ _impl->dst = output;
+ _impl->op = arm_compute::support::cpp14::make_unique<experimental::CLLogicalNot>();
+ _impl->op->configure(compile_context, input->info(), output->info());
+}
+
+Status CLLogicalNot::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return experimental::CLLogicalNot::validate(input, output);
+}
+
+void CLLogicalNot::run()
+{
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+ pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+
+ _impl->op->run(pack);
+}
+
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLLogicalOr.cpp b/src/runtime/CL/functions/CLLogicalOr.cpp
new file mode 100644
index 0000000000..4681083fd5
--- /dev/null
+++ b/src/runtime/CL/functions/CLLogicalOr.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLLogicalOr.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "src/core/CL/kernels/CLElementwiseOperationKernel.h"
+#include "support/MemorySupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace experimental
+{
+void CLLogicalOr::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<kernels::CLLogicalBinaryKernel>();
+ k->configure(compile_context, kernels::LogicalOperation::Or, input1, input2, output);
+ _kernel = std::move(k);
+}
+
+Status CLLogicalOr::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return kernels::CLLogicalBinaryKernel::validate(kernels::LogicalOperation::Or, input1, input2, output);
+}
+
+void CLLogicalOr::run(ITensorPack &tensors)
+{
+ ICLOperator::run(tensors);
+}
+} /* namespace experimental */
+
+struct CLLogicalOr::Impl
+{
+ const ICLTensor *src0{ nullptr };
+ const ICLTensor *src1{ nullptr };
+ ICLTensor *dst{ nullptr };
+ std::unique_ptr<experimental::CLLogicalOr> op{ nullptr };
+};
+
+CLLogicalOr::CLLogicalOr()
+ : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+CLLogicalOr::CLLogicalOr(CLLogicalOr &&) = default;
+CLLogicalOr &CLLogicalOr::operator=(CLLogicalOr &&) = default;
+CLLogicalOr::~CLLogicalOr() = default;
+
+void CLLogicalOr::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
+}
+
+void CLLogicalOr::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ _impl->src0 = input1;
+ _impl->src1 = input2;
+ _impl->dst = output;
+ _impl->op = arm_compute::support::cpp14::make_unique<experimental::CLLogicalOr>();
+ _impl->op->configure(compile_context, input1->info(), input2->info(), output->info());
+}
+
+Status CLLogicalOr::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+ return experimental::CLLogicalOr::validate(input1, input2, output);
+}
+
+void CLLogicalOr::run()
+{
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_0, _impl->src0);
+ pack.add_tensor(TensorType::ACL_SRC_1, _impl->src1);
+ pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+
+ _impl->op->run(pack);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLMagnitude.cpp b/src/runtime/CL/functions/CLMagnitude.cpp
index 962adadbb2..fb3ebdaa96 100644
--- a/src/runtime/CL/functions/CLMagnitude.cpp
+++ b/src/runtime/CL/functions/CLMagnitude.cpp
@@ -23,7 +23,7 @@
 */
 #include "arm_compute/runtime/CL/functions/CLMagnitude.h"
-#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
+#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h"
 #include "support/MemorySupport.h"
 #include <utility>
diff --git a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp
index 3e32c55067..392bff2b4e 100644
--- a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp
@@ -24,18 +24,23 @@
 #include "arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
+#include "src/core/CL/kernels/CLMemsetKernel.h"
+#include "support/MemorySupport.h"
 namespace arm_compute
 {
 CLMaxUnpoolingLayer::CLMaxUnpoolingLayer()
- : _memset_kernel(), _unpooling_layer_kernel()
+ : _memset_kernel(support::cpp14::make_unique<CLMemsetKernel>()),
+ _unpooling_layer_kernel(support::cpp14::make_unique<CLMaxUnpoolingLayerKernel>())
 {
 }
+CLMaxUnpoolingLayer::~CLMaxUnpoolingLayer() = default;
+
 void CLMaxUnpoolingLayer::configure(ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info)
 {
 configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, pool_info);
@@ -44,9 +49,9 @@ void CLMaxUnpoolingLayer::configure(ICLTensor *input, ICLTensor *indices, ICLTen
 void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info)
 {
 const PixelValue zero_value(0.f);
- _memset_kernel.configure(output, zero_value);
+ _memset_kernel->configure(output, zero_value);
- _unpooling_layer_kernel.configure(compile_context, input, indices, output, pool_info);
+ _unpooling_layer_kernel->configure(compile_context, input, indices, output, pool_info);
 }
 Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
@@ -57,9 +62,9 @@ Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo
 void CLMaxUnpoolingLayer::run()
 {
 // Run memset
- CLScheduler::get().enqueue(_memset_kernel, false);
+ CLScheduler::get().enqueue(*_memset_kernel, false);
 // Run max unpooling layer
- CLScheduler::get().enqueue(_unpooling_layer_kernel);
+ CLScheduler::get().enqueue(*_unpooling_layer_kernel);
 }
 } /* namespace arm_compute */
diff --git a/src/runtime/CL/functions/CLMeanStdDev.cpp b/src/runtime/CL/functions/CLMeanStdDev.cpp
index 2517fdc4ef..c91bc954b8 100644
--- a/src/runtime/CL/functions/CLMeanStdDev.cpp
+++ b/src/runtime/CL/functions/CLMeanStdDev.cpp
@@ -25,6 +25,10 @@
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/functions/CLMeanStdDev.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLMeanStdDevKernel.h"
+#include "src/core/CL/kernels/CLReductionOperationKernel.h"
+#include "support/MemorySupport.h"
 using namespace arm_compute;
@@ -39,13 +43,15 @@ CLMeanStdDev::CLMeanStdDev(std::shared_ptr<IMemoryManager> memory_manager) // NO
 _reduction_output_stddev(),
 _mean(nullptr),
 _stddev(nullptr),
- _mean_stddev_kernel(),
- _fill_border_kernel(),
+ _mean_stddev_kernel(support::cpp14::make_unique<CLMeanStdDevKernel>()),
+ _fill_border_kernel(support::cpp14::make_unique<CLFillBorderKernel>()),
 _global_sum(),
 _global_sum_squared()
 {
 }
+CLMeanStdDev::~CLMeanStdDev() = default;
+
 Status CLMeanStdDev::validate(ITensorInfo *input, float *mean, float *stddev)
 {
 ARM_COMPUTE_RETURN_ERROR_ON_TENSOR_NOT_2D(input);
@@ -101,8 +107,8 @@ void CLMeanStdDev::configure(const CLCompileContext &compile_context, ICLImage *
 _global_sum_squared = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong));
 }
- _mean_stddev_kernel.configure(compile_context, input, mean, &_global_sum, stddev, &_global_sum_squared);
- _fill_border_kernel.configure(compile_context, input, _mean_stddev_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0)));
+ _mean_stddev_kernel->configure(compile_context, input, mean, &_global_sum, stddev, &_global_sum_squared);
+ _fill_border_kernel->configure(compile_context, input, _mean_stddev_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0)));
 }
 }
@@ -149,8 +155,8 @@ void CLMeanStdDev::run_float()
 void CLMeanStdDev::run_int()
 {
- CLScheduler::get().enqueue(_fill_border_kernel);
- CLScheduler::get().enqueue(_mean_stddev_kernel);
+ CLScheduler::get().enqueue(*_fill_border_kernel);
+ CLScheduler::get().enqueue(*_mean_stddev_kernel);
 }
 void CLMeanStdDev::run()
diff --git a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
index 07ab669fde..5b5ff49ecb 100644
--- a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
@@ -23,8 +23,8 @@
 */
 #include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h"
-#include "arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
 #include "arm_compute/core/Types.h"
+#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
 #include "support/MemorySupport.h"
 namespace arm_compute
diff --git a/src/runtime/CL/functions/CLMedian3x3.cpp b/src/runtime/CL/functions/CLMedian3x3.cpp
index 92153128f9..2040ebd4f5 100644
--- a/src/runtime/CL/functions/CLMedian3x3.cpp
+++ b/src/runtime/CL/functions/CLMedian3x3.cpp
@@ -23,8 +23,9 @@
 */
 #include "arm_compute/runtime/CL/functions/CLMedian3x3.h"
-#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLMedian3x3Kernel.h"
 #include "support/MemorySupport.h"
 #include <utility>
@@ -41,5 +42,5 @@ void CLMedian3x3::configure(const CLCompileContext &compile_context, ICLTensor *
 auto k = arm_compute::support::cpp14::make_unique<CLMedian3x3Kernel>();
 k->configure(compile_context, input, output, border_mode == BorderMode::UNDEFINED);
 _kernel = std::move(k);
- _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+ _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
 }
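CLMedian3x3 above shows the border-handling idiom used by all the simple filter functions in this patch: the wrapped kernel reports how many border pixels it reads through border_size(), and the separately owned fill-border kernel is configured from that value. A stand-in sketch of the contract (hypothetical types, not ACL's classes):

struct BorderSize
{
    unsigned int top, right, bottom, left;
};

enum class BorderMode
{
    UNDEFINED, // caller guarantees the border is never read, so filling is skipped
    CONSTANT,
    REPLICATE
};

struct Median3x3Kernel
{
    // a 3x3 filter reads one pixel beyond each image edge
    BorderSize border_size() const { return BorderSize{ 1, 1, 1, 1 }; }
};

struct FillBorderKernel
{
    void configure(BorderSize size, BorderMode mode, unsigned char constant_value)
    {
        // fill a frame `size` pixels wide around the image before the filter runs
        static_cast<void>(size);
        static_cast<void>(mode);
        static_cast<void>(constant_value);
    }
};

int main()
{
    Median3x3Kernel  filter;
    FillBorderKernel border;
    // the function object wires the two together, as in the hunk above
    border.configure(filter.border_size(), BorderMode::CONSTANT, 0);
}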
*/
 #include "arm_compute/runtime/CL/functions/CLMinMaxLocation.h"
-
 #include "arm_compute/core/CL/CLHelpers.h"
+#include "src/core/CL/kernels/CLMinMaxLocationKernel.h"
+#include "support/MemorySupport.h"
 namespace arm_compute
 {
 CLMinMaxLocation::CLMinMaxLocation()
-    : _min_max_kernel(),
-      _min_max_loc_kernel(),
+    : _min_max_kernel(support::cpp14::make_unique<CLMinMaxKernel>()),
+      _min_max_loc_kernel(support::cpp14::make_unique<CLMinMaxLocationKernel>()),
       _min_max_vals(),
       _min_max_count_vals(),
       _min(nullptr),
@@ -41,6 +42,8 @@ CLMinMaxLocation::CLMinMaxLocation()
 {
 }
+CLMinMaxLocation::~CLMinMaxLocation() = default;
+
 void CLMinMaxLocation::configure(const ICLImage *input, void *min, void *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, min, max, min_loc, max_loc, min_count, max_count);
@@ -62,16 +65,16 @@ void CLMinMaxLocation::configure(const CLCompileContext &compile_context, const
     _min_loc = min_loc;
     _max_loc = max_loc;
-    _min_max_kernel.configure(compile_context, input, &_min_max_vals);
-    _min_max_loc_kernel.configure(compile_context, input, &_min_max_vals, &_min_max_count_vals, _min_loc, _max_loc);
+    _min_max_kernel->configure(compile_context, input, &_min_max_vals);
+    _min_max_loc_kernel->configure(compile_context, input, &_min_max_vals, &_min_max_count_vals, _min_loc, _max_loc);
 }
 void CLMinMaxLocation::run()
 {
     cl::CommandQueue q = CLScheduler::get().queue();
-    CLScheduler::get().enqueue(_min_max_kernel, false);
-    CLScheduler::get().enqueue(_min_max_loc_kernel, false);
+    CLScheduler::get().enqueue(*_min_max_kernel, false);
+    CLScheduler::get().enqueue(*_min_max_loc_kernel, false);
     // Update min and max
     q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 0 * sizeof(int32_t), sizeof(int32_t), static_cast<int32_t *>(_min));
diff --git a/src/runtime/CL/functions/CLNonLinearFilter.cpp b/src/runtime/CL/functions/CLNonLinearFilter.cpp
index 71f08e8072..3312f6f9a7 100644
--- a/src/runtime/CL/functions/CLNonLinearFilter.cpp
+++ b/src/runtime/CL/functions/CLNonLinearFilter.cpp
@@ -23,7 +23,8 @@
  */
 #include "arm_compute/runtime/CL/functions/CLNonLinearFilter.h"
-#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLNonLinearFilterKernel.h"
 #include "support/MemorySupport.h"
 #include
@@ -42,5 +43,5 @@ void CLNonLinearFilter::configure(const CLCompileContext &compile_context, ICLTe
     auto k = arm_compute::support::cpp14::make_unique<CLNonLinearFilterKernel>();
     k->configure(compile_context, input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
 }
diff --git a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
index a79bb0c5a3..22ca176a71 100644
--- a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
+++ b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
@@ -23,7 +23,8 @@
  */
 #include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h"
-#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
 #include "support/MemorySupport.h"
 #include
@@ -43,10
+44,10 @@ void CLNonMaximaSuppression3x3::configure(const CLCompileContext &compile_contex if(border_mode != BorderMode::UNDEFINED) { - _border_handler.configure(compile_context, input, _kernel->border_size(), BorderMode::CONSTANT); + _border_handler->configure(compile_context, input, _kernel->border_size(), BorderMode::CONSTANT); } else { - _border_handler.configure(compile_context, input, _kernel->border_size(), BorderMode::UNDEFINED); + _border_handler->configure(compile_context, input, _kernel->border_size(), BorderMode::UNDEFINED); } } diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp index 4be6257bbf..40a6cdd2f4 100644 --- a/src/runtime/CL/functions/CLNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp @@ -25,18 +25,25 @@ #include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h" #include "arm_compute/core/Error.h" +#include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLNormalizationLayerKernel.h" +#include "support/MemorySupport.h" using namespace arm_compute; CLNormalizationLayer::CLNormalizationLayer() - : _norm_kernel(), _border_handler() + : _norm_kernel(support::cpp14::make_unique()), + _border_handler(support::cpp14::make_unique()) { } +CLNormalizationLayer::~CLNormalizationLayer() = default; + void CLNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info) { configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info); @@ -47,10 +54,10 @@ void CLNormalizationLayer::configure(const CLCompileContext &compile_context, IC ARM_COMPUTE_ERROR_ON(input == nullptr); // Configure normalization kernel - _norm_kernel.configure(compile_context, input, output, norm_info); + _norm_kernel->configure(compile_context, input, output, norm_info); // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel - _border_handler.configure(compile_context, input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue()); + _border_handler->configure(compile_context, input, _norm_kernel->border_size(), BorderMode::CONSTANT, PixelValue()); } Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) @@ -61,8 +68,8 @@ Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInf void CLNormalizationLayer::run() { // Run border handler - CLScheduler::get().enqueue(_border_handler, false); + CLScheduler::get().enqueue(*_border_handler, false); // Run normalization kernel - CLScheduler::get().enqueue(_norm_kernel); + CLScheduler::get().enqueue(*_norm_kernel); } diff --git a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp index 806e6489a2..9576486db0 100644 --- a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp +++ b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp @@ -24,7 +24,7 @@ #include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h" -#include "arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" +#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp 
b/src/runtime/CL/functions/CLOpticalFlow.cpp index 0b5547eaab..fca6192296 100644 --- a/src/runtime/CL/functions/CLOpticalFlow.cpp +++ b/src/runtime/CL/functions/CLOpticalFlow.cpp @@ -24,7 +24,6 @@ #include "arm_compute/runtime/CL/functions/CLOpticalFlow.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Window.h" @@ -33,6 +32,8 @@ #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" #include "arm_compute/runtime/CL/functions/CLScharr3x3.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLLKTrackerKernel.h" #include "support/MemorySupport.h" using namespace arm_compute; @@ -42,7 +43,7 @@ CLOpticalFlow::CLOpticalFlow(std::shared_ptr memory_manager) // _tracker_init_kernel(), _tracker_stage0_kernel(), _tracker_stage1_kernel(), - _tracker_finalize_kernel(), + _tracker_finalize_kernel(support::cpp14::make_unique()), _func_scharr(), _scharr_gx(), _scharr_gy(), @@ -57,6 +58,8 @@ CLOpticalFlow::CLOpticalFlow(std::shared_ptr memory_manager) // { } +CLOpticalFlow::~CLOpticalFlow() = default; + void CLOpticalFlow::configure(const CLPyramid *old_pyramid, const CLPyramid *new_pyramid, const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points, Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate, @@ -93,9 +96,9 @@ void CLOpticalFlow::configure(const CLCompileContext &compile_context, const CLP const int old_values_list_length = list_length * window_dimension * window_dimension; // Create kernels and tensors - _tracker_init_kernel.resize(_num_levels); - _tracker_stage0_kernel.resize(_num_levels); - _tracker_stage1_kernel.resize(_num_levels); + _tracker_init_kernel.reserve(_num_levels); + _tracker_stage0_kernel.reserve(_num_levels); + _tracker_stage1_kernel.reserve(_num_levels); _func_scharr.resize(_num_levels); _scharr_gx.resize(_num_levels); _scharr_gy.resize(_num_levels); @@ -134,16 +137,19 @@ void CLOpticalFlow::configure(const CLCompileContext &compile_context, const CLP _func_scharr[i].configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value); // Init Lucas-Kanade init kernel - _tracker_init_kernel[i].configure(compile_context, old_points, new_points_estimates, _old_points_internal.get(), _new_points_internal.get(), use_initial_estimate, i, _num_levels, pyr_scale); + _tracker_init_kernel.emplace_back(support::cpp14::make_unique()); + _tracker_init_kernel.back()->configure(compile_context, old_points, new_points_estimates, _old_points_internal.get(), _new_points_internal.get(), use_initial_estimate, i, _num_levels, pyr_scale); // Init Lucas-Kanade stage0 kernel - _tracker_stage0_kernel[i].configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i], - _old_points_internal.get(), _new_points_internal.get(), _coefficient_table.get(), _old_values.get(), - window_dimension, i); + _tracker_stage0_kernel.emplace_back(support::cpp14::make_unique()); + _tracker_stage0_kernel.back()->configure(compile_context, old_ith_input, &_scharr_gx[i], &_scharr_gy[i], + _old_points_internal.get(), _new_points_internal.get(), _coefficient_table.get(), _old_values.get(), + window_dimension, i); // Init Lucas-Kanade stage1 kernel - _tracker_stage1_kernel[i].configure(compile_context, new_ith_input, 
_new_points_internal.get(), _coefficient_table.get(), _old_values.get(), - termination, epsilon, num_iterations, window_dimension, i); + _tracker_stage1_kernel.emplace_back(support::cpp14::make_unique()); + _tracker_stage1_kernel.back()->configure(compile_context, new_ith_input, _new_points_internal.get(), _coefficient_table.get(), _old_values.get(), + termination, epsilon, num_iterations, window_dimension, i); // Allocate intermediate buffers _scharr_gx[i].allocator()->allocate(); @@ -151,7 +157,7 @@ void CLOpticalFlow::configure(const CLCompileContext &compile_context, const CLP } // Finalize Lucas-Kanade - _tracker_finalize_kernel.configure(compile_context, _new_points_internal.get(), new_points); + _tracker_finalize_kernel->configure(compile_context, _new_points_internal.get(), new_points); } void CLOpticalFlow::run() @@ -166,14 +172,14 @@ void CLOpticalFlow::run() _func_scharr[level - 1].run(); // Run Lucas-Kanade init kernel - CLScheduler::get().enqueue(_tracker_init_kernel[level - 1]); + CLScheduler::get().enqueue(*_tracker_init_kernel[level - 1]); // Run Lucas-Kanade stage0 kernel - CLScheduler::get().enqueue(_tracker_stage0_kernel[level - 1]); + CLScheduler::get().enqueue(*_tracker_stage0_kernel[level - 1]); // Run Lucas-Kanade stage1 kernel - CLScheduler::get().enqueue(_tracker_stage1_kernel[level - 1]); + CLScheduler::get().enqueue(*_tracker_stage1_kernel[level - 1]); } - CLScheduler::get().enqueue(_tracker_finalize_kernel, true); + CLScheduler::get().enqueue(*_tracker_finalize_kernel, true); } diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp index e03bd13284..60cf4d1a2d 100644 --- a/src/runtime/CL/functions/CLPReluLayer.cpp +++ b/src/runtime/CL/functions/CLPReluLayer.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h" +#include "src/core/CL/kernels/CLElementwiseOperationKernel.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/runtime/CL/CLScheduler.h" @@ -30,43 +30,9 @@ namespace arm_compute { -namespace -{ -void configure_border_handler(const CLCompileContext &compile_context, CLFillBorderKernel &border_handler, BorderSize border_size, ITensorInfo *input1, ITensorInfo *input2, const ITensorInfo *output) -{ - if(output->dimension(0) > 1) - { - ITensorInfo *broadcasted_info = (input1->dimension(0) == 1) ? 
input1 : input2; - - if(broadcasted_info->dimension(0) == 1) - { - border_handler.configure(compile_context, broadcasted_info, border_size, BorderMode::REPLICATE); - } - } -} - -ITensorPack select_border_input(ITensorPack &tensors) -{ - ITensorPack pack; - if(tensors.get_tensor(TensorType::ACL_DST)->info()->dimension(0) > 1) - { - if(tensors.get_const_tensor(TensorType::ACL_SRC_1)->info()->dimension(0) == 1) - { - pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_1)); - } - else - { - pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_0)); - } - } - return pack; -} -} // namespace - namespace experimental { CLPReluLayer::CLPReluLayer() - : _border_handler() { } @@ -75,7 +41,6 @@ void CLPReluLayer::configure(const CLCompileContext &compile_context, ITensorInf auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, ArithmeticOperation::PRELU, input, alpha, output); _kernel = std::move(k); - configure_border_handler(compile_context, _border_handler, _kernel->border_size(), input, alpha, output); } Status CLPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) @@ -85,8 +50,6 @@ Status CLPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha void CLPReluLayer::run(ITensorPack &tensors) { - auto border_pack = select_border_input(tensors); - CLScheduler::get().enqueue_op(_border_handler, border_pack); ICLOperator::run(tensors); } } // namespace experimental diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp index 12a51f11f5..388b07b76e 100644 --- a/src/runtime/CL/functions/CLPadLayer.cpp +++ b/src/runtime/CL/functions/CLPadLayer.cpp @@ -22,14 +22,21 @@ * SOFTWARE. 
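With the border-handling helpers deleted in the CLPReluLayer hunks just above, that function reduces to a thin wrapper over the element-wise PRELU kernel (out = x >= 0 ? x : alpha * x), driven through the experimental operator interface. As a hedged sketch of how a caller exercises that interface, following the tensor-pack conventions visible in the deleted select_border_input() helper (tensor creation, allocation and error handling omitted, so this is illustrative rather than complete):

    // Assumes src, alpha and dst are CLTensor objects that have already been
    // initialised and allocated; only the operator plumbing is shown.
    experimental::CLPReluLayer prelu;
    prelu.configure(CLKernelLibrary::get().get_compile_context(),
                    src.info(), alpha.info(), dst.info());

    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_0, &src);   // x
    pack.add_tensor(TensorType::ACL_SRC_1, &alpha); // learnable slope
    pack.add_tensor(TensorType::ACL_DST, &dst);     // output

    prelu.run(pack); // after this patch, no border kernel is enqueued first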
*/
 #include "arm_compute/runtime/CL/functions/CLPadLayer.h"
+#include "src/core/CL/kernels/CLCopyKernel.h"
+#include "src/core/CL/kernels/CLPadLayerKernel.h"
+#include "support/MemorySupport.h"
 namespace arm_compute
 {
 CLPadLayer::CLPadLayer()
-    : _pad_kernel(), _copy_kernel(), _perform_pad(false)
+    : _pad_kernel(support::cpp14::make_unique<CLPadLayerKernel>()),
+      _copy_kernel(support::cpp14::make_unique<CLCopyKernel>()),
+      _perform_pad(false)
 {
 }
+CLPadLayer::~CLPadLayer() = default;
+
 void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode);
@@ -46,12 +53,12 @@ void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *i
     if(_perform_pad)
     {
-        _pad_kernel.configure(compile_context, input, output, padding, constant_value, mode);
+        _pad_kernel->configure(compile_context, input, output, padding, constant_value, mode);
     }
     else
     {
         // Copy the input to the whole output if no padding is applied
-        _copy_kernel.configure(compile_context, input, output);
+        _copy_kernel->configure(compile_context, input, output);
     }
 }
 Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
@@ -67,9 +74,7 @@ Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
     }
     else
     {
-        Window copy_window = Window();
-        copy_window.use_tensor_dimensions(output->tensor_shape());
-        ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, PaddingList(), &copy_window));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output));
     }
     return Status{};
 }
@@ -77,11 +82,11 @@ void CLPadLayer::run()
 {
     if(_perform_pad)
     {
-        CLScheduler::get().enqueue(_pad_kernel);
+        CLScheduler::get().enqueue(*_pad_kernel);
     }
     else
     {
-        CLScheduler::get().enqueue(_copy_kernel);
+        CLScheduler::get().enqueue(*_copy_kernel);
     }
 }
 } // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp
index e13046bd46..f7f0bc4f5d 100644
--- a/src/runtime/CL/functions/CLPermute.cpp
+++ b/src/runtime/CL/functions/CLPermute.cpp
@@ -24,8 +24,8 @@
 #include "arm_compute/runtime/CL/functions/CLPermute.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLPermuteKernel.h"
 #include "arm_compute/core/Error.h"
+#include "src/core/CL/kernels/CLPermuteKernel.h"
 #include "support/MemorySupport.h"
 namespace arm_compute
diff --git a/src/runtime/CL/functions/CLPhase.cpp b/src/runtime/CL/functions/CLPhase.cpp
index 64d2e0fdff..6594cd5bac 100644
--- a/src/runtime/CL/functions/CLPhase.cpp
+++ b/src/runtime/CL/functions/CLPhase.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/CL/functions/CLPhase.h"
-#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
+#include "src/core/CL/kernels/CLMagnitudePhaseKernel.h"
 #include "support/MemorySupport.h"
 #include
diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
index 883ce68536..12cc5d60af 100644
--- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
+++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
@@ -24,8 +24,9 @@
 #include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
 #include
"arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLPixelWiseMultiplicationKernel.h" #include "support/MemorySupport.h" #include @@ -55,7 +56,7 @@ ITensorPack select_border_input(ITensorPack &tensors) namespace experimental { CLPixelWiseMultiplication::CLPixelWiseMultiplication() - : _border_handler() + : _border_handler(support::cpp14::make_unique()) { } @@ -72,7 +73,7 @@ void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_contex if(broadcasted_info->dimension(0) == 1) { - _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); } } } @@ -86,12 +87,12 @@ Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITen void CLPixelWiseMultiplication::run(ITensorPack &tensors) { auto border_pack = select_border_input(tensors); - CLScheduler::get().enqueue_op(_border_handler, border_pack); + CLScheduler::get().enqueue_op(*_border_handler, border_pack); ICLOperator::run(tensors); } CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication() - : _border_handler() + : _border_handler(support::cpp14::make_unique()) { } @@ -107,7 +108,7 @@ void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile if(broadcasted_info->dimension(0) == 1) { - _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); } } } @@ -120,7 +121,7 @@ Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, con void CLComplexPixelWiseMultiplication::run(ITensorPack &tensors) { auto border_pack = select_border_input(tensors); - CLScheduler::get().enqueue_op(_border_handler, border_pack); + CLScheduler::get().enqueue_op(*_border_handler, border_pack); ICLOperator::run(tensors); } } // namespace experimental diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp index a14818fffe..7f99aee9ba 100644 --- a/src/runtime/CL/functions/CLPoolingLayer.cpp +++ b/src/runtime/CL/functions/CLPoolingLayer.cpp @@ -24,8 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLPoolingLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLPoolingLayerKernel.h" #include "support/MemorySupport.h" namespace arm_compute @@ -79,7 +80,7 @@ void CLPoolingLayer::configure(const CLCompileContext &compile_context, ICLTenso default: ARM_COMPUTE_ERROR("Data layout not supported"); } - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, pixel_value); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, pixel_value); // Tune kernels CLScheduler::get().tune_kernel_static(*_kernel); diff --git a/src/runtime/CL/functions/CLPriorBoxLayer.cpp b/src/runtime/CL/functions/CLPriorBoxLayer.cpp index 1907c7cc08..8cb971793e 100644 --- a/src/runtime/CL/functions/CLPriorBoxLayer.cpp +++ b/src/runtime/CL/functions/CLPriorBoxLayer.cpp @@ -24,12 +24,14 @@ #include "arm_compute/runtime/CL/functions/CLPriorBoxLayer.h" -#include "arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h" #include 
"arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h" +#include "support/MemorySupport.h" using namespace arm_compute; diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp index a40a5d068d..54df5a0a5e 100644 --- a/src/runtime/CL/functions/CLQLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp @@ -30,6 +30,18 @@ #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLCopyKernel.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/MemorySupport.h" namespace arm_compute { @@ -85,10 +97,50 @@ void CLQLSTMLayer::TensorCopyKernel::run() } CLQLSTMLayer::CLQLSTMLayer(std::shared_ptr memory_manager) + : _input_to_input_reduction(support::cpp14::make_unique()), + _recurrent_to_input_reduction(support::cpp14::make_unique()), + _input_to_forget_reduction(support::cpp14::make_unique()), + _recurrent_to_forget_reduction(support::cpp14::make_unique()), + _input_to_cell_reduction(support::cpp14::make_unique()), + _recurrent_to_cell_reduction(support::cpp14::make_unique()), + _input_to_output_reduction(support::cpp14::make_unique()), + _recurrent_to_output_reduction(support::cpp14::make_unique()), + _projection_reduction(support::cpp14::make_unique()), + _layer_norms(), + _copy_output(support::cpp14::make_unique()) { + for(auto &norm : _layer_norms) + { + norm = support::cpp14::make_unique(); + } + _memory_group = MemoryGroup(std::move(memory_manager)); } +CLQLSTMLayer::~CLQLSTMLayer() = default; + +void CLQLSTMLayer::configure_layer_norm(LayerNormGate g, const ICLTensor *in) +{ + ARM_COMPUTE_ERROR_ON(!_has_layer_norm); + + CLTensor *out = &get_layer_norm_output(g); + _memory_group.manage(out); + out->allocator()->init(*(in->info())); + + get_layer_norm(g).configure(in, out, get_layer_norm_weight(g), get_layer_norm_bias(g)); +} + +Status CLQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias) +{ + // Output quantization scale will be different, but ignored here + // since it will be configured at configure() stage. 
+ const TensorInfo out + { + in + }; + return CLQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias); +} + void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias, CLTensor *mm_res, CLTensor *outstage_res, float gemmlowp_scale, @@ -113,7 +165,7 @@ void CLQLSTMLayer::configure(const ICLTensor *input, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, const ICLTensor *output_state_in, + ICLTensor *cell_state_in, ICLTensor *output_state_in, ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output, const LSTMParams &lstm_params) { @@ -126,7 +178,7 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias, - ICLTensor *cell_state_in, const ICLTensor *output_state_in, + ICLTensor *cell_state_in, ICLTensor *output_state_in, ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output, const LSTMParams &lstm_params) { @@ -199,18 +251,18 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT _input_to_input_weights = lstm_params.input_to_input_weights(); _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights(); - _input_to_input_reduction.configure(compile_context, _input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_input_reduction.configure(compile_context, _recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_input_reduction->configure(compile_context, _input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_input_reduction->configure(compile_context, _recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); } - _input_to_forget_reduction.configure(compile_context, input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_forget_reduction.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_cell_reduction.configure(compile_context, input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_cell_reduction.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - 
_input_to_output_reduction.configure(compile_context, input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_output_reduction.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_forget_reduction->configure(compile_context, input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_forget_reduction->configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_cell_reduction->configure(compile_context, input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_cell_reduction->configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_output_reduction->configure(compile_context, input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_output_reduction->configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); if(_has_projection) { - _projection_reduction.configure(compile_context, _projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); + _projection_reduction->configure(compile_context, _projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); if(_projection_bias != nullptr) { _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE); @@ -504,9 +556,9 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT if(_projection_tensor_copy_required) { _hidden_gate.allocator()->allocate(); - _projection_accumulate_res.allocator()->init(*output_state_out->info()); + _projection_accumulate_res.allocator()->init(*output_state_in->info()); _projection_accumulate_res.info()->set_tensor_shape(_projection_outstage_res.info()->tensor_shape()); - _projection_output_to_accumulate_copy.configure(*output_state_out, _projection_accumulate_res); + _projection_output_to_accumulate_copy.configure(*output_state_in, _projection_accumulate_res); accumulate_destination = &_projection_accumulate_res; } @@ -542,7 +594,7 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT } // Copy output_state_out to output - _copy_output.configure(compile_context, output_state_out, output); + _copy_output->configure(compile_context, output_state_out, output); } Status CLQLSTMLayer::validate(const ITensorInfo *input, @@ -833,7 +885,8 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, 
&gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); - gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); + gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); + gemmlowp_info.output_data_type = hidden_out_info.data_type(); ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); const bool projection_tensor_copy_required = num_units != output_size; @@ -863,7 +916,7 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input, if(projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(*output_state_out, projection_outstage_info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info)); } ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE)); @@ -1047,7 +1100,7 @@ void CLQLSTMLayer::run() } // Copy output_state_out to output - CLScheduler::get().enqueue(_copy_output); + CLScheduler::get().enqueue(*_copy_output); } void CLQLSTMLayer::prepare() @@ -1079,8 +1132,8 @@ void CLQLSTMLayer::prepare() { _input_to_input_eff_bias.allocator()->allocate(); _recurrent_to_input_eff_bias.allocator()->allocate(); - CLScheduler::get().enqueue(_input_to_input_reduction); - CLScheduler::get().enqueue(_recurrent_to_input_reduction); + CLScheduler::get().enqueue(*_input_to_input_reduction); + CLScheduler::get().enqueue(*_recurrent_to_input_reduction); _input_to_input_weights_transposed.allocator()->allocate(); _recurrent_to_input_weights_transposed.allocator()->allocate(); @@ -1095,17 +1148,17 @@ void CLQLSTMLayer::prepare() _recurrent_to_cell_eff_bias.allocator()->allocate(); _input_to_output_eff_bias.allocator()->allocate(); _recurrent_to_output_eff_bias.allocator()->allocate(); - CLScheduler::get().enqueue(_input_to_forget_reduction); - CLScheduler::get().enqueue(_recurrent_to_forget_reduction); - CLScheduler::get().enqueue(_input_to_cell_reduction); - CLScheduler::get().enqueue(_recurrent_to_cell_reduction); - CLScheduler::get().enqueue(_input_to_output_reduction); - CLScheduler::get().enqueue(_recurrent_to_output_reduction); + CLScheduler::get().enqueue(*_input_to_forget_reduction); + CLScheduler::get().enqueue(*_recurrent_to_forget_reduction); + CLScheduler::get().enqueue(*_input_to_cell_reduction); + CLScheduler::get().enqueue(*_recurrent_to_cell_reduction); + CLScheduler::get().enqueue(*_input_to_output_reduction); + CLScheduler::get().enqueue(*_recurrent_to_output_reduction); if(_has_projection) { _projection_eff_bias.allocator()->allocate(); - CLScheduler::get().enqueue(_projection_reduction); + CLScheduler::get().enqueue(*_projection_reduction); if(_projection_bias != nullptr) { _projection_bias_add.run(); diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp index f0a446acab..f132547eb9 100644 --- a/src/runtime/CL/functions/CLQuantizationLayer.cpp +++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h" -#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h" +#include "src/core/CL/kernels/CLQuantizationLayerKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp index 94e7f9440c..be3e539f98 100644 --- 
a/src/runtime/CL/functions/CLRNNLayer.cpp +++ b/src/runtime/CL/functions/CLRNNLayer.cpp @@ -28,17 +28,33 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLCopyKernel.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { using namespace arm_compute::misc::shape_calculator; CLRNNLayer::CLRNNLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(), - _is_prepared(false) + : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation(), _fully_connected_kernel(), _copy_kernel(support::cpp14::make_unique()), _fully_connected_out(), + _gemm_output(), _add_output(), _is_prepared(false) { } +CLRNNLayer::~CLRNNLayer() = default; + Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, const ITensorInfo *output, const ActivationLayerInfo &info) { @@ -107,7 +123,7 @@ void CLRNNLayer::configure(const CLCompileContext &compile_context, const ICLTen _activation.configure(compile_context, &_add_output, hidden_state, info); _add_output.allocator()->allocate(); - _copy_kernel.configure(compile_context, hidden_state, output); + _copy_kernel->configure(compile_context, hidden_state, output); } void CLRNNLayer::run() @@ -122,7 +138,7 @@ void CLRNNLayer::run() _activation.run(); // copy hidden out to output - CLScheduler::get().enqueue(_copy_kernel); + CLScheduler::get().enqueue(*_copy_kernel); } void CLRNNLayer::prepare() diff --git a/src/runtime/CL/functions/CLROIAlignLayer.cpp b/src/runtime/CL/functions/CLROIAlignLayer.cpp index 2337cee33f..cf28a1a0fb 100644 --- a/src/runtime/CL/functions/CLROIAlignLayer.cpp +++ b/src/runtime/CL/functions/CLROIAlignLayer.cpp @@ -24,7 +24,8 @@ #include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h" #include "arm_compute/core/CL/ICLArray.h" -#include "arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h" +#include "src/core/CL/kernels/CLROIAlignLayerKernel.h" +#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp index cdf60ce04f..b0e6716cce 100644 --- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp +++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp @@ -22,10 +22,8 @@ * SOFTWARE. 
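For orientation, the CLRNNLayer touched above implements the plain Elman update h' = act(W x + R h + b): a fully connected layer for W x, a GEMM against the previous hidden state for R h, an element-wise add, the activation, and finally the (now unique_ptr-held) CLCopyKernel copying the new hidden state to the output. A scalar sketch of that data flow, assuming tanh as the configured activation (the real code of course operates on CLTensor objects, not vectors):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // One vanilla-RNN step; mirrors the kernel sequence in CLRNNLayer::run().
    std::vector<float> rnn_step(const std::vector<std::vector<float>> &W, // input weights
                                const std::vector<std::vector<float>> &R, // recurrent weights
                                const std::vector<float> &b,              // bias
                                const std::vector<float> &x,              // input
                                std::vector<float> &h)                    // hidden state (in/out)
    {
        std::vector<float> acc(h.size());
        for (std::size_t i = 0; i < h.size(); ++i)
        {
            acc[i] = b[i];
            for (std::size_t j = 0; j < x.size(); ++j)
                acc[i] += W[i][j] * x[j];             // _fully_connected_kernel
            for (std::size_t j = 0; j < h.size(); ++j)
                acc[i] += R[i][j] * h[j];             // _gemm_state_f, then _add_kernel
            acc[i] = std::tanh(acc[i]);               // _activation
        }
        h = acc;  // hidden_state updated in place
        return h; // _copy_kernel: hidden state copied to the output tensor
    }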
*/ #include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h" - #include "arm_compute/core/CL/ICLArray.h" - -#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h" +#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h" #include "support/MemorySupport.h" using namespace arm_compute; diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp index 8bf2a0c43e..57b57bd305 100644 --- a/src/runtime/CL/functions/CLRange.cpp +++ b/src/runtime/CL/functions/CLRange.cpp @@ -24,10 +24,10 @@ #include "arm_compute/runtime/CL/functions/CLRange.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLRangeKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLRangeKernel.h" #include "support/MemorySupport.h" using namespace arm_compute; diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp index c8eb542c69..b761dc2f99 100644 --- a/src/runtime/CL/functions/CLReduceMean.cpp +++ b/src/runtime/CL/functions/CLReduceMean.cpp @@ -23,12 +23,14 @@ */ #include "arm_compute/runtime/CL/functions/CLReduceMean.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLReductionOperationKernel.h" +#include "src/core/helpers/AutoConfiguration.h" namespace arm_compute { @@ -83,15 +85,25 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax } const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + const bool requant = is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info(); + if(requant) + { + TensorInfo input_no_quant(input->clone()->set_data_type(DataType::F32)); + CLDequantizationLayer::validate(input, &input_no_quant); + TensorInfo output_no_quant(output->clone()->set_data_type(DataType::F32)); + CLQuantizationLayer::validate(&output_no_quant, output); + } } return Status{}; } } + CLReduceMean::CLReduceMean(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims() + : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _dequant(), _requant(), _reduction_ops(), _keep_dims(), _do_requant(), _input_no_quant(), + _output_no_quant() { } + void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, reduction_axis, keep_dims, output); @@ -102,33 +114,49 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(CLReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info())); // Output auto inizialitation if not yet initialized - const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input, reduction_axis, keep_dims); + 
const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); + _do_requant = is_data_type_quantized(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info(); _reduction_ops = reduction_axis.num_dimensions(); _reduction_kernels.resize(_reduction_ops); _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0)); _keep_dims = keep_dims; + ICLTensor *tmp_input = input; + ICLTensor *tmp_output = output; + if(_do_requant) + { + _memory_group.manage(&_input_no_quant); + _memory_group.manage(&_output_no_quant); + TensorInfo output_no_quant_info = input->info()->clone()->set_tensor_shape(output_shape); + output_no_quant_info.set_data_type(DataType::F32); + auto_init_if_empty(*_output_no_quant.info(), output_no_quant_info); + auto_init_if_empty(*_input_no_quant.info(), input->info()->clone()->set_data_type(DataType::F32)); + _dequant.configure(compile_context, input, &_input_no_quant); + tmp_input = &_input_no_quant; + tmp_output = &_output_no_quant; + } + Coordinates axis_local = reduction_axis; - const int input_dims = input->info()->num_dimensions(); + const int input_dims = tmp_input->info()->num_dimensions(); convert_negative_axis(axis_local, input_dims); // Perform reduction for every axis for(int i = 0; i < _reduction_ops; ++i) { - TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + TensorShape out_shape = i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); out_shape.set(axis_local[i], 1); - auto in = (i == 0) ? input : (&_reduced_outs[i - 1]); + auto in = (i == 0) ? 
tmp_input : (&_reduced_outs[i - 1]); if(i == _reduction_ops - 1 && keep_dims) { - _reduction_kernels[i].configure(compile_context, in, output, axis_local[i], ReductionOperation::MEAN_SUM); + _reduction_kernels[i].configure(compile_context, in, tmp_output, axis_local[i], ReductionOperation::MEAN_SUM); } else { - _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info())); + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(), tmp_input->info()->data_type(), tmp_input->info()->quantization_info())); _memory_group.manage(&_reduced_outs[i]); _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM); } @@ -141,9 +169,9 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor } // Configure reshape layer if we want to drop the dimensions - if(!keep_dims) + if(!_keep_dims) { - TensorShape out_shape = input->info()->tensor_shape(); + TensorShape out_shape = tmp_input->info()->tensor_shape(); // We have to sort the reduction axis vectors in order for remove_dimension // to work properly @@ -152,8 +180,14 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor { out_shape.remove_dimension(axis_local[i] - i); } - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); - _reshape.configure(compile_context, &_reduced_outs[_reduction_ops - 1], output); + auto_init_if_empty(*tmp_output->info(), tmp_input->info()->clone()->set_tensor_shape(out_shape)); + _reshape.configure(compile_context, &_reduced_outs[_reduction_ops - 1], tmp_output); + } + if(_do_requant) + { + _requant.configure(compile_context, &_output_no_quant, output); + _input_no_quant.allocator()->allocate(); + _output_no_quant.allocator()->allocate(); } } @@ -166,14 +200,21 @@ void CLReduceMean::run() { MemoryGroupResourceScope scope_mg(_memory_group); + if(_do_requant) + { + _dequant.run(); + } for(auto &kernel : _reduction_kernels) { kernel.run(); } - if(!_keep_dims) { _reshape.run(); } + if(_do_requant) + { + _requant.run(); + } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp index 54e91fb8d8..7423f4bc87 100644 --- a/src/runtime/CL/functions/CLReductionOperation.cpp +++ b/src/runtime/CL/functions/CLReductionOperation.cpp @@ -30,7 +30,10 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/Utils.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLReductionOperationKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/runtime/Utils.h" #include "support/MemorySupport.h" namespace arm_compute @@ -41,13 +44,15 @@ CLReductionOperation::CLReductionOperation(std::shared_ptr memor { } +CLReductionOperation::~CLReductionOperation() = default; + Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); - const unsigned int num_of_stages = 
calculate_number_of_stages_only_x_axis(input->dimension(0), axis);
+    const unsigned int num_of_stages = utils::calculate_number_of_stages_only_x_axis(input->dimension(0), axis);
     const bool is_serial = needs_serialized_reduction(op, input->data_type(), axis);
     const bool is_reshape_required = !keep_dims;
@@ -194,7 +199,7 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign
 void CLReductionOperation::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    _num_of_stages = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
+    _num_of_stages = utils::calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
     _reduction_axis = axis;
     _is_serial = needs_serialized_reduction(op, input->info()->data_type(), axis);
     _is_reshape_required = !keep_dims;
@@ -209,7 +214,7 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC
     }
     // Configure reduction operation kernels
-    _reduction_kernels_vector.resize(_num_of_stages);
+    _reduction_kernels_vector.reserve(_num_of_stages);
     // Create temporary tensors
     if(_is_serial)
@@ -219,11 +224,12 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC
         _memory_group.manage(&_results_vector.back());
     }
-        _reduction_kernels_vector[0].configure(compile_context, input, output_internal, axis, op, 0);
+        _reduction_kernels_vector.emplace_back(support::cpp14::make_unique<CLReductionOperationKernel>());
+        _reduction_kernels_vector[0]->configure(compile_context, input, output_internal, axis, op, 0);
     }
     else
     {
-        _border_handlers_vector.resize(_num_of_stages);
+        _border_handlers_vector.reserve(_num_of_stages);
         _memory_group.manage(&_results_vector[0]);
         ReductionOperation first_kernel_op;
@@ -267,15 +273,23 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC
                 ARM_COMPUTE_ERROR("Not supported");
         }
-        _reduction_kernels_vector[0].configure(compile_context, input, &_results_vector[0], axis, first_kernel_op);
-        _border_handlers_vector[0].configure(compile_context, input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue);
+        _reduction_kernels_vector.emplace_back(support::cpp14::make_unique<CLReductionOperationKernel>());
+        _reduction_kernels_vector[0]->configure(compile_context, input, &_results_vector[0], axis, first_kernel_op);
+
+        _border_handlers_vector.emplace_back(support::cpp14::make_unique<CLFillBorderKernel>());
+        _border_handlers_vector[0]->configure(compile_context, input, _reduction_kernels_vector[0]->border_size(), BorderMode::CONSTANT, pixelValue);
         // Apply ReductionOperation on intermediate stages
         for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
         {
             _memory_group.manage(&_results_vector[i]);
-            _reduction_kernels_vector[i].configure(compile_context, &_results_vector[i - 1], &_results_vector[i], axis, intermediate_kernel_op);
-            _border_handlers_vector[i].configure(compile_context, &_results_vector[i - 1], _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue);
+
+            _reduction_kernels_vector.emplace_back(support::cpp14::make_unique<CLReductionOperationKernel>());
+            _reduction_kernels_vector[i]->configure(compile_context, &_results_vector[i - 1], &_results_vector[i], axis, intermediate_kernel_op);
+
+            _border_handlers_vector.emplace_back(support::cpp14::make_unique<CLFillBorderKernel>());
+            _border_handlers_vector[i]->configure(compile_context, &_results_vector[i - 1], _reduction_kernels_vector[i]->border_size(), BorderMode::CONSTANT, pixelValue);
+
_results_vector[i - 1].allocator()->allocate(); } @@ -288,8 +302,12 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC _memory_group.manage(&_results_vector.back()); } - _reduction_kernels_vector[last_stage].configure(compile_context, &_results_vector[last_stage - 1], output_internal, axis, last_kernel_op, input_width); - _border_handlers_vector[last_stage].configure(compile_context, &_results_vector[last_stage - 1], _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue); + _reduction_kernels_vector.emplace_back(support::cpp14::make_unique()); + _reduction_kernels_vector[last_stage]->configure(compile_context, &_results_vector[last_stage - 1], output_internal, axis, last_kernel_op, input_width); + + _border_handlers_vector.emplace_back(support::cpp14::make_unique()); + _border_handlers_vector[last_stage]->configure(compile_context, &_results_vector[last_stage - 1], _reduction_kernels_vector[last_stage]->border_size(), BorderMode::CONSTANT, pixelValue); + _results_vector[last_stage - 1].allocator()->allocate(); } @@ -306,14 +324,14 @@ void CLReductionOperation::run() if(_is_serial) { - CLScheduler::get().enqueue(_reduction_kernels_vector[0], false); + CLScheduler::get().enqueue(*_reduction_kernels_vector[0], false); } else { for(unsigned int i = 0; i < _num_of_stages; ++i) { - CLScheduler::get().enqueue(_border_handlers_vector[i], false); - CLScheduler::get().enqueue(_reduction_kernels_vector[i], false); + CLScheduler::get().enqueue(*_border_handlers_vector[i], false); + CLScheduler::get().enqueue(*_reduction_kernels_vector[i], false); } } diff --git a/src/runtime/CL/functions/CLRemap.cpp b/src/runtime/CL/functions/CLRemap.cpp index 60b72c5f87..6466c2843b 100644 --- a/src/runtime/CL/functions/CLRemap.cpp +++ b/src/runtime/CL/functions/CLRemap.cpp @@ -24,11 +24,12 @@ #include "arm_compute/runtime/CL/functions/CLRemap.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLRemapKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLRemapKernel.h" #include "support/MemorySupport.h" #include @@ -42,7 +43,7 @@ void CLRemap::configure(ICLTensor *input, const ICLTensor *map_x, const ICLTenso void CLRemap::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, - uint8_t constant_border_value) + uint8_t constant_border_value) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); @@ -53,5 +54,5 @@ void CLRemap::configure(const CLCompileContext &compile_context, ICLTensor *inpu auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, map_x, map_y, output, policy, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLReorgLayer.cpp b/src/runtime/CL/functions/CLReorgLayer.cpp index 1dc41aefb5..4b2f70334f 100644 --- a/src/runtime/CL/functions/CLReorgLayer.cpp +++ 
b/src/runtime/CL/functions/CLReorgLayer.cpp @@ -24,10 +24,10 @@ #include "arm_compute/runtime/CL/functions/CLReorgLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLReorgLayerKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/CL/kernels/CLReorgLayerKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp index 273a761a0a..5112064b23 100644 --- a/src/runtime/CL/functions/CLReshapeLayer.cpp +++ b/src/runtime/CL/functions/CLReshapeLayer.cpp @@ -24,7 +24,7 @@ #include "arm_compute/runtime/CL/functions/CLReshapeLayer.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h" +#include "src/core/CL/kernels/CLReshapeLayerKernel.h" #include "support/MemorySupport.h" /** [CLReshapeLayer snippet] **/ diff --git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp index 213fbc8f32..b73d8de62e 100644 --- a/src/runtime/CL/functions/CLReverse.cpp +++ b/src/runtime/CL/functions/CLReverse.cpp @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLReverse.h" -#include "arm_compute/core/CL/kernels/CLReverseKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLReverseKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp index e111c6d1f7..383b0cc305 100644 --- a/src/runtime/CL/functions/CLScale.cpp +++ b/src/runtime/CL/functions/CLScale.cpp @@ -24,10 +24,11 @@ #include "arm_compute/runtime/CL/functions/CLScale.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLScaleKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLScaleKernel.h" #include "support/MemorySupport.h" namespace arm_compute @@ -60,7 +61,7 @@ void CLScale::configure(const CLCompileContext &compile_context, ICLTensor *inpu { border_mode_to_use = BorderMode::CONSTANT; } - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode_to_use, info.constant_border_value); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode_to_use, info.constant_border_value); } void CLScale::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, diff --git a/src/runtime/CL/functions/CLScharr3x3.cpp b/src/runtime/CL/functions/CLScharr3x3.cpp index b121ee7b99..e5d0d2d630 100644 --- a/src/runtime/CL/functions/CLScharr3x3.cpp +++ b/src/runtime/CL/functions/CLScharr3x3.cpp @@ -23,8 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLScharr3x3.h" -#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLScharr3x3Kernel.h" #include "support/MemorySupport.h" #include @@ -41,5 +42,5 @@ void CLScharr3x3::configure(const CLCompileContext &compile_context, ICLTensor * auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED); _kernel = 
std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp index c7d7df75d2..374da91b78 100644 --- a/src/runtime/CL/functions/CLSelect.cpp +++ b/src/runtime/CL/functions/CLSelect.cpp @@ -23,9 +23,11 @@ */ #include "arm_compute/runtime/CL/functions/CLSelect.h" -#include "arm_compute/core/CL/kernels/CLSelectKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLSelectKernel.h" + +#include "support/MemorySupport.h" using namespace arm_compute; diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp index f36550ba91..940540563a 100644 --- a/src/runtime/CL/functions/CLSlice.cpp +++ b/src/runtime/CL/functions/CLSlice.cpp @@ -24,9 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLSlice.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" +#include "src/core/CL/kernels/CLStridedSliceKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLSobel3x3.cpp b/src/runtime/CL/functions/CLSobel3x3.cpp index 566a4a1534..78376f935a 100644 --- a/src/runtime/CL/functions/CLSobel3x3.cpp +++ b/src/runtime/CL/functions/CLSobel3x3.cpp @@ -23,14 +23,17 @@ */ #include "arm_compute/runtime/CL/functions/CLSobel3x3.h" -#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLSobel3x3Kernel.h" #include "support/MemorySupport.h" #include using namespace arm_compute; +CLSobel3x3::~CLSobel3x3() = default; + void CLSobel3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) { configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); @@ -41,5 +44,5 @@ void CLSobel3x3::configure(const CLCompileContext &compile_context, ICLTensor *i auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output_x, output_y, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLSobel5x5.cpp b/src/runtime/CL/functions/CLSobel5x5.cpp index f70e4f36f5..fa5d8945fb 100644 --- a/src/runtime/CL/functions/CLSobel5x5.cpp +++ b/src/runtime/CL/functions/CLSobel5x5.cpp @@ -24,20 +24,29 @@ #include "arm_compute/runtime/CL/functions/CLSobel5x5.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/ITensorAllocator.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLSobel5x5Kernel.h" 
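Aside: every function touched in these hunks follows the same mechanical recipe — kernel members become std::unique_ptr, construction moves into the constructor via make_unique, call sites switch from "." to "->", enqueue() takes a dereference, and a defaulted destructor lands in the .cpp. The out-of-line destructor matters because the public headers now only forward-declare the kernel types. A minimal self-contained sketch of that idiom (MyFunction and MyKernel are illustrative stand-ins, not ACL types, and std::make_unique stands in for support::cpp14::make_unique):

#include <memory>

class MyKernel; // a forward declaration is all the header needs for a unique_ptr member

class MyFunction
{
public:
    MyFunction();
    ~MyFunction(); // declared here, defined below where MyKernel is complete
    void run();

private:
    std::unique_ptr<MyKernel> _kernel;
};

// --- normally in the .cpp, after including the kernel header ---
class MyKernel
{
public:
    void run() {}
};

MyFunction::MyFunction() : _kernel(std::make_unique<MyKernel>()) {}
MyFunction::~MyFunction() = default; // ~unique_ptr<MyKernel> is instantiated here, where the type is complete
void MyFunction::run() { _kernel->run(); }

int main()
{
    MyFunction f;
    f.run(); // constructs and destroys cleanly
}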
+#include "support/MemorySupport.h" using namespace arm_compute; CLSobel5x5::CLSobel5x5(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y() + : _memory_group(std::move(memory_manager)), + _sobel_hor(support::cpp14::make_unique()), + _sobel_vert(support::cpp14::make_unique()), + _border_handler(support::cpp14::make_unique()), + _tmp_x(), + _tmp_y() { } +CLSobel5x5::~CLSobel5x5() = default; + void CLSobel5x5::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) { configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); @@ -58,8 +67,8 @@ void CLSobel5x5::configure(const CLCompileContext &compile_context, ICLTensor *i _tmp_y.allocator()->init(tensor_info); _memory_group.manage(&_tmp_x); _memory_group.manage(&_tmp_y); - _sobel_hor.configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); + _sobel_hor->configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert->configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); _tmp_y.allocator()->allocate(); } @@ -67,27 +76,27 @@ void CLSobel5x5::configure(const CLCompileContext &compile_context, ICLTensor *i { _tmp_x.allocator()->init(tensor_info); _memory_group.manage(&_tmp_x); - _sobel_hor.configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_hor->configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_vert->configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); } else if(run_sobel_y) { _tmp_y.allocator()->init(tensor_info); _memory_group.manage(&_tmp_y); - _sobel_hor.configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); + _sobel_hor->configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert->configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); _tmp_y.allocator()->allocate(); } - _border_handler.configure(compile_context, input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _sobel_hor->border_size(), border_mode, PixelValue(constant_border_value)); } void CLSobel5x5::run() { - CLScheduler::get().enqueue(_border_handler, false); + CLScheduler::get().enqueue(*_border_handler, false); MemoryGroupResourceScope scope_mg(_memory_group); - CLScheduler::get().enqueue(_sobel_hor, false); - CLScheduler::get().enqueue(_sobel_vert); + CLScheduler::get().enqueue(*_sobel_hor, false); + CLScheduler::get().enqueue(*_sobel_vert); } diff --git a/src/runtime/CL/functions/CLSobel7x7.cpp b/src/runtime/CL/functions/CLSobel7x7.cpp index 792432e841..f462adb0ed 100644 --- a/src/runtime/CL/functions/CLSobel7x7.cpp +++ b/src/runtime/CL/functions/CLSobel7x7.cpp @@ -24,20 +24,29 
@@ #include "arm_compute/runtime/CL/functions/CLSobel7x7.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/ITensorAllocator.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLSobel7x7Kernel.h" +#include "support/MemorySupport.h" using namespace arm_compute; CLSobel7x7::CLSobel7x7(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y() + : _memory_group(std::move(memory_manager)), + _sobel_hor(support::cpp14::make_unique()), + _sobel_vert(support::cpp14::make_unique()), + _border_handler(support::cpp14::make_unique()), + _tmp_x(), + _tmp_y() { } +CLSobel7x7::~CLSobel7x7() = default; + void CLSobel7x7::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value) { configure(CLKernelLibrary::get().get_compile_context(), input, output_x, output_y, border_mode, constant_border_value); @@ -58,8 +67,8 @@ void CLSobel7x7::configure(const CLCompileContext &compile_context, ICLTensor *i _tmp_y.allocator()->init(tensor_info); _memory_group.manage(&_tmp_x); _memory_group.manage(&_tmp_y); - _sobel_hor.configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); + _sobel_hor->configure(compile_context, input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert->configure(compile_context, &_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); _tmp_y.allocator()->allocate(); } @@ -67,27 +76,27 @@ void CLSobel7x7::configure(const CLCompileContext &compile_context, ICLTensor *i { _tmp_x.allocator()->init(tensor_info); _memory_group.manage(&_tmp_x); - _sobel_hor.configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_hor->configure(compile_context, input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_vert->configure(compile_context, &_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); } else if(run_sobel_y) { _tmp_y.allocator()->init(tensor_info); _memory_group.manage(&_tmp_y); - _sobel_hor.configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); + _sobel_hor->configure(compile_context, input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert->configure(compile_context, nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); _tmp_y.allocator()->allocate(); } - _border_handler.configure(compile_context, input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _sobel_hor->border_size(), border_mode, PixelValue(constant_border_value)); } void CLSobel7x7::run() { - CLScheduler::get().enqueue(_border_handler, false); + CLScheduler::get().enqueue(*_border_handler, 
false); MemoryGroupResourceScope scope_mg(_memory_group); - CLScheduler::get().enqueue(_sobel_hor, false); - CLScheduler::get().enqueue(_sobel_vert); + CLScheduler::get().enqueue(*_sobel_hor, false); + CLScheduler::get().enqueue(*_sobel_vert); } diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp index f7b2935622..4caf91488e 100644 --- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp +++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp @@ -24,111 +24,75 @@ #include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/ICLKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLSoftmaxLayerKernel.h" +#include "src/core/helpers/SoftmaxHelpers.h" +#include "support/MemorySupport.h" namespace arm_compute { template <bool IS_LOG> CLSoftmaxLayerGeneric<IS_LOG>::CLSoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _max_shift_exp_sum_kernel(), _norm_kernel(), _flatten_ptr(), _reshape(), _max(), _sum(), _tmp(), _input_flattened(), _output_flattened(), - _needs_flattening(false) + : _memory_group(std::move(memory_manager)), + _permute_input(), + _permute_output(), + _max_shift_exp_sum_kernel(support::cpp14::make_unique<CLLogits1DMaxShiftExpSumKernel>()), + _norm_kernel(support::cpp14::make_unique<CLLogits1DNormKernel>()), + _max(), + _sum(), + _tmp(), + _input_permuted(), + _output_permuted(), + _needs_permute() { } template <bool IS_LOG> -void CLSoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const ICLTensor *input, const ICLTensor *output, size_t first_n_reduce_axes) -{ - configure_reshape_input_kernel(CLKernelLibrary::get().get_compile_context(), input, output, first_n_reduce_axes); -} +CLSoftmaxLayerGeneric<IS_LOG>::~CLSoftmaxLayerGeneric() = default; template <bool IS_LOG> -void CLSoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *output, size_t first_n_reduce_axes) -{ - // Flatten the input - const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), first_n_reduce_axes); - - // Initialize the flat input - _input_flattened.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten)); - - // If we need to flatten the input, we can use CLFlattenKernel or CLReshapeKernel - // If the number of reduced axes is 3 (max dimension), which means collapsing all axes except the batch axis, we use CLFlattenKernel. - // In all other cases we have to use CLReshapeKernel - // Note that the "other cases" include both: - // 1. first_n_reduce_axes < 3: Reduce the first 1 (no need to reduce) or 2 dimensions (inclusive) - // 2. first_n_reduce_axes == 4: Reduce all 4 dimensions. This can only be handled by CLReshapeKernel instead of CLFlattenKernel. 
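Aside: the new arbitrary-axis softmax below reduces the axis handling to one normalisation step — wrap a possibly negative axis into [0, num_dimensions) and permute whenever the result is non-zero, so the kernels always reduce along dimension 0. A standalone sketch of that arithmetic (this wrap_around is a local stand-in mimicking the ACL helper of the same name, not its actual implementation):

#include <cstdio>

// Wrap x into [0, m): -1 maps to m - 1, -m maps to 0, non-negative x is unchanged.
int wrap_around(int x, int m)
{
    return x >= 0 ? x % m : (x % m + m) % m;
}

int main()
{
    const int num_dimensions = 4;
    for(int axis : { 0, 1, -1, -4 })
    {
        const int  actual_axis   = wrap_around(axis, num_dimensions);
        const bool needs_permute = actual_axis != 0; // non-zero axis: permute so dim 0 is reduced
        std::printf("axis %2d -> actual axis %d, needs permute: %s\n", axis, actual_axis, needs_permute ? "yes" : "no");
    }
    return 0;
}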
- if(first_n_reduce_axes == 3) - { - auto flatten = support::cpp14::make_unique(); - flatten->configure(compile_context, input, &_input_flattened); - _flatten_ptr = std::move(flatten); - } - else - { - auto reshape_ptr = support::cpp14::make_unique(); - reshape_ptr->configure(compile_context, input, &_input_flattened); - _flatten_ptr = std::move(reshape_ptr); - } - - // We need to init the output tensor here. Indeed, the reshape kernel expects - // both tensors to be already initialized - auto_init_if_empty(*output->info(), *input->info()->clone()); -} - -template -void CLSoftmaxLayerGeneric::configure(const ICLTensor *input, ICLTensor *output, float beta, size_t axis) +void CLSoftmaxLayerGeneric::configure(const ICLTensor *input, ICLTensor *output, float beta, int32_t axis) { configure(CLKernelLibrary::get().get_compile_context(), input, output, beta, axis); } template -void CLSoftmaxLayerGeneric::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, size_t axis) +void CLSoftmaxLayerGeneric::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, int32_t axis) { // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(CLSoftmaxLayerGeneric::validate(input->info(), output->info(), beta, axis)); - // Convert reduce-before axis (inclusive) to first n axes to reduce - size_t first_n_reduce_axes = dim_index_2_num_dims(axis, input->info()->num_dimensions()); + const size_t actual_axis = static_cast(wrap_around(axis, static_cast(input->info()->num_dimensions()))); - // We only need flattening when the number of axes to reduce is greater than 1 - _needs_flattening = first_n_reduce_axes > 1; - - // If we are dealing with a 4D tensor, we will: - // - Flatten the input, so that we end up with a [width*height*depth] * batches 2D tensor - // - Execute all the pipeline (reduction + normalization) on the flattened tensor - // - Reshape the flattened output into the real output - if(_needs_flattening) + _needs_permute = actual_axis != 0; + ICLTensor *tmp_output = output; + const ICLTensor *tmp_input = _needs_permute ? &_input_permuted : input; + if(_needs_permute) { - // Add to the memory manager _input_flattened - _memory_group.manage(&_input_flattened); - - // Cofigure _flatten_kernel and _input_flattened - configure_reshape_input_kernel(input, output, first_n_reduce_axes); + _memory_group.manage(&_input_permuted); + _memory_group.manage(&_output_permuted); + _permute_input.configure(compile_context, input, &_input_permuted, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); + tmp_output = &_output_permuted; } - // We want to deal with a 2D input. Either it is the flattened version of the original input (4D case) - // or it is the original input case (2D case) - const ICLTensor *input_2D = (_needs_flattening ? &_input_flattened : input); - - // Create intermediate tensors shapes - TensorInfo input_info = input_2D->info()->clone()->reset_padding().set_is_resizable(true); - DataType tmp_data_type = is_data_type_quantized_asymmetric(input_2D->info()->data_type()) ? DataType::S32 : input_2D->info()->data_type(); - TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type)); + // Create intermediate tensors + DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input->info()->data_type()) ? 
DataType::S32 : tmp_input->info()->data_type(); + TensorInfo tensor_info_tmp(tmp_input->info()->clone()->set_data_type(tmp_data_type)); _tmp.allocator()->init(tensor_info_tmp); - - TensorShape max_sum_shape = input_2D->info()->tensor_shape(); + TensorShape max_sum_shape = tmp_input->info()->tensor_shape(); max_sum_shape.set(0, 1); - _max.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape)); - _sum.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type)); + _max.allocator()->init(tmp_input->info()->clone()->set_tensor_shape(max_sum_shape)); + _sum.allocator()->init(tmp_input->info()->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type)); // Set GPU target to kernels - _max_shift_exp_sum_kernel.set_target(CLScheduler::get().target()); + _max_shift_exp_sum_kernel->set_target(CLScheduler::get().target()); // Manage intermediate buffers _memory_group.manage(&_tmp); @@ -138,49 +102,43 @@ void CLSoftmaxLayerGeneric::configure(const CLCompileContext &compile_co SoftmaxKernelInfo softmax_info; softmax_info.beta = beta; softmax_info.is_log = IS_LOG; - softmax_info.input_data_type = input_2D->info()->data_type(); + softmax_info.input_data_type = tmp_input->info()->data_type(); // Configure kernels - _max_shift_exp_sum_kernel.configure(compile_context, input_2D, &_max, &_tmp, &_sum, softmax_info); - - if(_needs_flattening) - { - // Add to the memory manager _output_flattened - _memory_group.manage(&_output_flattened); - - // The normalization kernel stores the result in a flat output tensor - _norm_kernel.configure(compile_context, &_tmp, &_sum, &_output_flattened, softmax_info); - - // Reshape the flat output into a the requested (4D) output - _reshape.configure(compile_context, &_output_flattened, output); - - // Allocate the intermediate flat tensors - _input_flattened.allocator()->allocate(); - _output_flattened.allocator()->allocate(); - } - else - { - // Softmax 2D case - _norm_kernel.configure(compile_context, &_tmp, &_sum, output, softmax_info); - } + _max_shift_exp_sum_kernel->configure(compile_context, tmp_input, &_max, &_tmp, &_sum, softmax_info); + _norm_kernel->configure(compile_context, &_tmp, &_sum, tmp_output, softmax_info); // Allocate intermediate buffers _tmp.allocator()->allocate(); _max.allocator()->allocate(); _sum.allocator()->allocate(); + if(_needs_permute) + { + _permute_output.configure(compile_context, &_output_permuted, output, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); + _input_permuted.allocator()->allocate(); + _output_permuted.allocator()->allocate(); + } } template -Status CLSoftmaxLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis) +Status CLSoftmaxLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Only up to 4 dimensions are supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 0, "Only axis 0 supported in tensors"); ARM_COMPUTE_UNUSED(beta); - ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() <= axis); + ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast(-input->num_dimensions()) || static_cast(input->num_dimensions()) <= axis); - // Convert reduce-before axis (inclusive) to first n axes to reduce - size_t first_n_reduce_axes = dim_index_2_num_dims(axis, input->num_dimensions()); + const size_t actual_axis = static_cast(wrap_around(axis, 
static_cast(input->num_dimensions()))); + const bool needs_permute = actual_axis != 0; + if(needs_permute) + { + const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); + const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(*input, permutation_vector); + TensorInfo input_permuted(input->clone()->set_tensor_shape(permuted_shape)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(input, &input_permuted, permutation_vector)); + TensorInfo output_permuted(output->clone()->set_tensor_shape(permuted_shape)); + ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(&output_permuted, output, permutation_vector)); + } // Create intermediate tensor info DataType tmp_data_type = is_data_type_quantized_asymmetric(input->data_type()) ? DataType::S32 : input->data_type(); @@ -191,23 +149,6 @@ Status CLSoftmaxLayerGeneric::validate(const ITensorInfo *input, const I TensorInfo tensor_info_max(input->clone()->set_tensor_shape(max_sum_shape).set_is_resizable(true)); TensorInfo tensor_info_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()).set_is_resizable(true)); - const bool needs_flattening = (first_n_reduce_axes > 1); - - if(needs_flattening) - { - const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input, first_n_reduce_axes); - TensorInfo tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true)); - - if(first_n_reduce_axes == 3) - { - ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayer::validate(input, &tensor_info_flat)); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(input, &tensor_info_flat)); - } - } - SoftmaxKernelInfo softmax_info; softmax_info.beta = beta; softmax_info.is_log = IS_LOG; @@ -216,12 +157,6 @@ Status CLSoftmaxLayerGeneric::validate(const ITensorInfo *input, const I ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DMaxShiftExpSumKernel::validate(input, &tensor_info_max, &tensor_info_tmp, &tensor_info_sum)); ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DNormKernel::validate(&tensor_info_tmp, &tensor_info_sum, output, softmax_info)); - if(needs_flattening) - { - const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input); - TensorInfo tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true)); - } - return Status{}; } @@ -230,17 +165,17 @@ void CLSoftmaxLayerGeneric::run() { MemoryGroupResourceScope scope_mg(_memory_group); - if(_needs_flattening) + if(_needs_permute) { - _flatten_ptr->run(); + _permute_input.run(); } - CLScheduler::get().enqueue(_max_shift_exp_sum_kernel, false); - CLScheduler::get().enqueue(_norm_kernel, !_needs_flattening); + CLScheduler::get().enqueue(*_max_shift_exp_sum_kernel, false); + CLScheduler::get().enqueue(*_norm_kernel, !_needs_permute); - if(_needs_flattening) + if(_needs_permute) { - _reshape.run(); + _permute_output.run(); } } diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp index eea3cb535f..e83def5677 100644 --- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp +++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp @@ -29,14 +29,21 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLMemsetKernel.h" +#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h" +#include "support/MemorySupport.h" namespace 
arm_compute { CLSpaceToBatchLayer::CLSpaceToBatchLayer() - : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false) + : _space_to_batch_kernel(support::cpp14::make_unique<CLSpaceToBatchLayerKernel>()), + _memset_kernel(support::cpp14::make_unique<CLMemsetKernel>()), + _has_padding(false) { } +CLSpaceToBatchLayer::~CLSpaceToBatchLayer() = default; + void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output); @@ -49,9 +56,9 @@ void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, con if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _memset_kernel.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _memset_kernel->configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); } - _space_to_batch_kernel.configure(compile_context, input, block_shape, paddings, output); + _space_to_batch_kernel->configure(compile_context, input, block_shape, paddings, output); } void CLSpaceToBatchLayer::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output) @@ -67,9 +74,9 @@ void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, con if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; - _memset_kernel.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); + _memset_kernel->configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info())); } - _space_to_batch_kernel.configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, output); + _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, output); } Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output) @@ -94,8 +101,8 @@ void CLSpaceToBatchLayer::run() // Zero out output only if we have paddings if(_has_padding) { - CLScheduler::get().enqueue(_memset_kernel, true); + CLScheduler::get().enqueue(*_memset_kernel, true); } - CLScheduler::get().enqueue(_space_to_batch_kernel, true); + CLScheduler::get().enqueue(*_space_to_batch_kernel, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp index 06aa92d6fa..db8c4953cc 100644 --- a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp +++ b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp @@ -29,14 +29,18 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { CLSpaceToDepthLayer::CLSpaceToDepthLayer() - : _space_to_depth_kernel() + : _space_to_depth_kernel(support::cpp14::make_unique<CLSpaceToDepthLayerKernel>()) { } +CLSpaceToDepthLayer::~CLSpaceToDepthLayer() = default; + void CLSpaceToDepthLayer::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape) { configure(CLKernelLibrary::get().get_compile_context(), input, output, 
block_shape); @@ -44,7 +48,7 @@ void CLSpaceToDepthLayer::configure(const ICLTensor *input, ICLTensor *output, i void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape) { - _space_to_depth_kernel.configure(compile_context, input, output, block_shape); + _space_to_depth_kernel->configure(compile_context, input, output, block_shape); } Status CLSpaceToDepthLayer::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) @@ -54,6 +58,6 @@ Status CLSpaceToDepthLayer::validate(const ITensorInfo *input, const ITensorInfo void CLSpaceToDepthLayer::run() { - CLScheduler::get().enqueue(_space_to_depth_kernel, true); + CLScheduler::get().enqueue(*_space_to_depth_kernel, true); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLSplit.cpp b/src/runtime/CL/functions/CLSplit.cpp index db0b14b9a2..0b27371e3f 100644 --- a/src/runtime/CL/functions/CLSplit.cpp +++ b/src/runtime/CL/functions/CLSplit.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/helpers/AutoConfiguration.h" namespace arm_compute { diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp index 39f0ab4779..f4aa78a72d 100644 --- a/src/runtime/CL/functions/CLStackLayer.cpp +++ b/src/runtime/CL/functions/CLStackLayer.cpp @@ -32,6 +32,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLStackLayerKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { @@ -42,6 +44,8 @@ CLStackLayer::CLStackLayer() // NOLINT { } +CLStackLayer::~CLStackLayer() = default; + void CLStackLayer::configure(const std::vector &input, int axis, ICLTensor *output) { configure(CLKernelLibrary::get().get_compile_context(), input, axis, output); @@ -50,14 +54,15 @@ void CLStackLayer::configure(const std::vector &input, int axis, IC void CLStackLayer::configure(const CLCompileContext &compile_context, const std::vector &input, int axis, ICLTensor *output) { _num_inputs = input.size(); - _stack_kernels.resize(_num_inputs); + _stack_kernels.reserve(_num_inputs); // Wrap around negative values const unsigned int axis_u = wrap_around(axis, static_cast(input[0]->info()->num_dimensions() + 1)); for(unsigned int i = 0; i < _num_inputs; i++) { - _stack_kernels[i].configure(compile_context, input[i], axis_u, i, _num_inputs, output); + _stack_kernels.emplace_back(support::cpp14::make_unique()); + _stack_kernels.back()->configure(compile_context, input[i], axis_u, i, _num_inputs, output); } } @@ -87,7 +92,7 @@ void CLStackLayer::run() { for(unsigned i = 0; i < _num_inputs; i++) { - CLScheduler::get().enqueue(_stack_kernels[i], false); + CLScheduler::get().enqueue(*_stack_kernels[i], false); } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLStridedSlice.cpp b/src/runtime/CL/functions/CLStridedSlice.cpp index b78073dd67..3f6814f5ce 100644 --- a/src/runtime/CL/functions/CLStridedSlice.cpp +++ b/src/runtime/CL/functions/CLStridedSlice.cpp @@ -24,8 +24,8 @@ #include "arm_compute/runtime/CL/functions/CLStridedSlice.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLStridedSliceKernel.h" #include 
"support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLTableLookup.cpp b/src/runtime/CL/functions/CLTableLookup.cpp index 3d2d1853ca..8282f37e4b 100644 --- a/src/runtime/CL/functions/CLTableLookup.cpp +++ b/src/runtime/CL/functions/CLTableLookup.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLTableLookup.h" -#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h" +#include "src/core/CL/kernels/CLTableLookupKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLThreshold.cpp b/src/runtime/CL/functions/CLThreshold.cpp index bdbf37e841..250f6f034f 100644 --- a/src/runtime/CL/functions/CLThreshold.cpp +++ b/src/runtime/CL/functions/CLThreshold.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLThreshold.h" -#include "arm_compute/core/CL/kernels/CLThresholdKernel.h" +#include "src/core/CL/kernels/CLThresholdKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp index 68efad0125..8384e48baf 100644 --- a/src/runtime/CL/functions/CLTile.cpp +++ b/src/runtime/CL/functions/CLTile.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLTile.h" -#include "arm_compute/core/CL/kernels/CLTileKernel.h" +#include "src/core/CL/kernels/CLTileKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp index 8cade66a90..43fa7a012a 100644 --- a/src/runtime/CL/functions/CLTranspose.cpp +++ b/src/runtime/CL/functions/CLTranspose.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLTranspose.h" -#include "arm_compute/core/CL/kernels/CLTransposeKernel.h" +#include "src/core/CL/kernels/CLTransposeKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/CL/functions/CLUpsampleLayer.cpp b/src/runtime/CL/functions/CLUpsampleLayer.cpp index e9456c100b..10b4b76a5e 100644 --- a/src/runtime/CL/functions/CLUpsampleLayer.cpp +++ b/src/runtime/CL/functions/CLUpsampleLayer.cpp @@ -26,15 +26,19 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Utils.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLUpsampleLayerKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { CLUpsampleLayer::CLUpsampleLayer() // NOLINT - : _upsample(), + : _upsample(support::cpp14::make_unique()), _output(nullptr) { } +CLUpsampleLayer::~CLUpsampleLayer() = default; + Status CLUpsampleLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &info, const InterpolationPolicy upsampling_policy) { @@ -53,11 +57,11 @@ void CLUpsampleLayer::configure(const CLCompileContext &compile_context, ICLTens ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); _output = output; - _upsample.configure(compile_context, input, _output, info, upsampling_policy); + _upsample->configure(compile_context, input, _output, info, upsampling_policy); } void CLUpsampleLayer::run() { - CLScheduler::get().enqueue(_upsample, false); + CLScheduler::get().enqueue(*_upsample, false); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLWarpAffine.cpp b/src/runtime/CL/functions/CLWarpAffine.cpp index fffc58c8d0..86e5a7bd86 100644 --- a/src/runtime/CL/functions/CLWarpAffine.cpp +++ b/src/runtime/CL/functions/CLWarpAffine.cpp @@ -23,8 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLWarpAffine.h" -#include 
"arm_compute/core/CL/kernels/CLWarpAffineKernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLWarpAffineKernel.h" #include "support/MemorySupport.h" #include @@ -42,5 +43,5 @@ void CLWarpAffine::configure(const CLCompileContext &compile_context, ICLTensor auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output, matrix, policy); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLWarpPerspective.cpp b/src/runtime/CL/functions/CLWarpPerspective.cpp index 2b4b187e38..7e8bc5cdff 100644 --- a/src/runtime/CL/functions/CLWarpPerspective.cpp +++ b/src/runtime/CL/functions/CLWarpPerspective.cpp @@ -23,8 +23,9 @@ */ #include "arm_compute/runtime/CL/functions/CLWarpPerspective.h" -#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLWarpPerspectiveKernel.h" #include "support/MemorySupport.h" #include @@ -42,5 +43,5 @@ void CLWarpPerspective::configure(const CLCompileContext &compile_context, ICLTe auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output, matrix, policy); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(compile_context, input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp index 09a35a6f27..7af42904e8 100644 --- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp @@ -28,6 +28,15 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "src/core/CL/kernels/CLWinogradFilterTransformKernel.h" +#include "src/core/CL/kernels/CLWinogradOutputTransformKernel.h" +#include "support/MemorySupport.h" using namespace arm_compute; @@ -90,11 +99,13 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz } // namespace CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr memory_manager) - : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _input0(), _input1(), _batched_mm_output(), _original_weights(nullptr), - _is_prepared(false) + : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(support::cpp14::make_unique()), + _output_transform(support::cpp14::make_unique()), _input0(), _input1(), _batched_mm_output(), _original_weights(nullptr), _is_prepared(false) 
{ } +CLWinogradConvolutionLayer::~CLWinogradConvolutionLayer() = default; + void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math) { @@ -102,7 +113,7 @@ void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *we } void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &conv_info, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math) { // Get indices for the width and height @@ -139,7 +150,7 @@ void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_conte _input_transform.configure(compile_context, input, &_input0, winograd_info); // Configure filter transform - _filter_transform.configure(compile_context, weights, &_input1, winograd_info); + _filter_transform->configure(compile_context, weights, &_input1, winograd_info); // Configure batched matrix multiply _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, @@ -147,7 +158,7 @@ void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_conte (input->info()->data_type() == DataType::F16))); // Configure output transform - _output_transform.configure(compile_context, &_batched_mm_output, biases, output, winograd_info, act_info); + _output_transform->configure(compile_context, &_batched_mm_output, biases, output, winograd_info, act_info); // Allocate temporary tensors _input0.allocator()->allocate(); @@ -218,7 +229,7 @@ void CLWinogradConvolutionLayer::run() _batched_mm.run(); // Run output transform - CLScheduler::get().enqueue(_output_transform); + CLScheduler::get().enqueue(*_output_transform); } void CLWinogradConvolutionLayer::prepare() @@ -227,7 +238,7 @@ void CLWinogradConvolutionLayer::prepare() { // Run filter transform and mark original weights as unused _input1.allocator()->allocate(); - CLScheduler::get().enqueue(_filter_transform, false); + CLScheduler::get().enqueue(*_filter_transform, false); _original_weights->mark_as_unused(); // Prepare GEMM and release reshaped weights if marked unused by CLGEMM diff --git a/src/runtime/CL/functions/CLWinogradInputTransform.cpp b/src/runtime/CL/functions/CLWinogradInputTransform.cpp index 9498206549..308c41f714 100644 --- a/src/runtime/CL/functions/CLWinogradInputTransform.cpp +++ b/src/runtime/CL/functions/CLWinogradInputTransform.cpp @@ -24,8 +24,9 @@ #include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h" #include "arm_compute/core/Error.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLWinogradInputTransformKernel.h" #include "support/MemorySupport.h" using namespace arm_compute; @@ -40,7 +41,7 @@ void CLWinogradInputTransform::configure(const CLCompileContext &compile_context auto k = arm_compute::support::cpp14::make_unique(); k->configure(compile_context, input, output, winograd_info); _kernel = std::move(k); - _border_handler.configure(compile_context, input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue()); + _border_handler->configure(compile_context, input, 
_kernel->border_size(), BorderMode::CONSTANT, PixelValue()); } Status CLWinogradInputTransform::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) diff --git a/src/runtime/CL/functions/CLYOLOLayer.cpp b/src/runtime/CL/functions/CLYOLOLayer.cpp index d553f97009..46bf220b0c 100644 --- a/src/runtime/CL/functions/CLYOLOLayer.cpp +++ b/src/runtime/CL/functions/CLYOLOLayer.cpp @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/CL/functions/CLYOLOLayer.h" -#include "arm_compute/core/CL/kernels/CLYOLOLayerKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLYOLOLayerKernel.h" #include "support/MemorySupport.h" using namespace arm_compute; diff --git a/arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h b/src/runtime/CL/gemm/CLGEMMKernelSelection.h similarity index 85% rename from arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h rename to src/runtime/CL/gemm/CLGEMMKernelSelection.h index a6bc008103..f6fad7e4ff 100644 --- a/arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h +++ b/src/runtime/CL/gemm/CLGEMMKernelSelection.h @@ -21,15 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CLGEMMKERNELSELECTION_H -#define ARM_COMPUTE_CLGEMMKERNELSELECTION_H +#ifndef SRC_CLGEMMKERNELSELECTION_H +#define SRC_CLGEMMKERNELSELECTION_H #include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" -#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h" -#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h" -#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h" +#include "src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h" +#include "src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h" +#include "src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h" -#include +#include "support/MemorySupport.h" namespace arm_compute { @@ -62,4 +62,4 @@ class CLGEMMKernelSelectionFactory final }; } // namespace cl_gemm } // namespace arm_compute -#endif /*ARM_COMPUTE_CLGEMMKERNELSELECTION_H */ +#endif /* SRC_CLGEMMKERNELSELECTION_H */ diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp b/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp index 8b1c9a5622..0bda38e5e9 100644 --- a/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp +++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp @@ -21,11 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h" +#include "src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h" +#include "src/core/CL/gemm/CLGEMMHelpers.h" #include <map> #include <utility> @@ -44,7 +44,7 @@ CLGEMMKernelType CLGEMMKernelSelectionBifrost::select_kernel(const CLGEMMKernelS // _target could be used in the future to have a dedicated heuristic for each GPU IP ARM_COMPUTE_UNUSED(_target); - using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMKernelSelectionBifrost::*)(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant); + using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMKernelSelectionBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); // Default configurations for Bifrost architectures static std::map<DataType, FunctionExecutorPtr> gemm_default_configs = @@ -68,11 +68,22 @@ CLGEMMKernelType CLGEMMKernelSelectionBifrost::select_kernel(const CLGEMMKernelS { DataType::QSYMM8_PER_CHANNEL, &CLGEMMKernelSelectionBifrost::default_q8 } }; + // Mali-G52 configurations + static std::map<DataType, FunctionExecutorPtr> gemm_g52_configs = + { + { DataType::F32, &CLGEMMKernelSelectionBifrost::g52_f32 }, + { DataType::F16, &CLGEMMKernelSelectionBifrost::g52_f16 }, + { DataType::QASYMM8, &CLGEMMKernelSelectionBifrost::default_q8 }, + { DataType::QASYMM8_SIGNED, &CLGEMMKernelSelectionBifrost::default_q8 }, + { DataType::QSYMM8, &CLGEMMKernelSelectionBifrost::default_q8 }, + { DataType::QSYMM8_PER_CHANNEL, &CLGEMMKernelSelectionBifrost::default_q8 } + }; + // Mali-G76 configurations static std::map<DataType, FunctionExecutorPtr> gemm_g76_configs = { { DataType::F32, &CLGEMMKernelSelectionBifrost::g76_f32 }, - { DataType::F16, &CLGEMMKernelSelectionBifrost::default_f16 }, + { DataType::F16, &CLGEMMKernelSelectionBifrost::g76_f16 }, { DataType::QASYMM8, &CLGEMMKernelSelectionBifrost::default_q8 }, { DataType::QASYMM8_SIGNED, &CLGEMMKernelSelectionBifrost::default_q8 }, { DataType::QSYMM8, &CLGEMMKernelSelectionBifrost::default_q8 }, @@ -86,26 +97,34 @@ CLGEMMKernelType CLGEMMKernelSelectionBifrost::select_kernel(const CLGEMMKernelS case GPUTarget::G71: if(gemm_g71_configs.find(data_type) != gemm_g71_configs.end()) { - return (this->*gemm_g71_configs[data_type])(params.m, params.n, params.k, params.is_rhs_constant); + return (this->*gemm_g71_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); case GPUTarget::G76: if(gemm_g76_configs.find(data_type) != gemm_g76_configs.end()) { - return (this->*gemm_g76_configs[data_type])(params.m, params.n, params.k, params.is_rhs_constant); + return (this->*gemm_g76_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + } + ARM_COMPUTE_ERROR("Not supported data type"); + case GPUTarget::G52: + if(gemm_g52_configs.find(data_type) != gemm_g52_configs.end()) + { + return (this->*gemm_g52_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); default: if(gemm_default_configs.find(data_type) != gemm_default_configs.end()) { - return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.is_rhs_constant); + return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); } } -CLGEMMKernelType 
CLGEMMKernelSelectionBifrost::default_f32(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) +CLGEMMKernelType CLGEMMKernelSelectionBifrost::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { + ARM_COMPUTE_UNUSED(b); + CLGEMMKernelType gemm_type = CLGEMMKernelType::NATIVE_V1; if(is_rhs_constant) @@ -143,9 +162,10 @@ CLGEMMKernelType CLGEMMKernelSelectionBifrost::default_f32(unsigned int m, unsig return gemm_type; } -CLGEMMKernelType CLGEMMKernelSelectionBifrost::default_f16(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) +CLGEMMKernelType CLGEMMKernelSelectionBifrost::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - ARM_COMPUTE_UNUSED(n, k); + ARM_COMPUTE_UNUSED(n, k, b); + if(is_rhs_constant) { if(m == 1) @@ -163,9 +183,9 @@ CLGEMMKernelType CLGEMMKernelSelectionBifrost::default_f16(unsigned int m, unsig } } -CLGEMMKernelType CLGEMMKernelSelectionBifrost::default_q8(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) +CLGEMMKernelType CLGEMMKernelSelectionBifrost::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - ARM_COMPUTE_UNUSED(m, n, k); + ARM_COMPUTE_UNUSED(m, n, k, b); if(is_rhs_constant) { @@ -177,38 +197,369 @@ CLGEMMKernelType CLGEMMKernelSelectionBifrost::default_q8(unsigned int m, unsign } } -CLGEMMKernelType CLGEMMKernelSelectionBifrost::g76_f32(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) +CLGEMMKernelType CLGEMMKernelSelectionBifrost::g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - CLGEMMKernelType gemm_type = CLGEMMKernelType::NATIVE_V1; + ARM_COMPUTE_UNUSED(b); - if(is_rhs_constant) + if(!is_rhs_constant) { - if((m > 1) && (n < 16)) + return CLGEMMKernelType::NATIVE_V1; + } + if(m == 1) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + if(k <= 496) + { + if(n <= 544) { - gemm_type = CLGEMMKernelType::RESHAPED; + return CLGEMMKernelType::RESHAPED_ONLY_RHS; } - else if(m == 1) + else { - gemm_type = CLGEMMKernelType::RESHAPED_ONLY_RHS; + return CLGEMMKernelType::RESHAPED; + } + } + else + { + if(k <= 588) + { + if(k <= 552) + { + if(m <= 148) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if(m <= 278) + { + return CLGEMMKernelType::RESHAPED; + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } } else { - if((k > 256) && (m > 4)) + return CLGEMMKernelType::RESHAPED; + } + } +} + +CLGEMMKernelType CLGEMMKernelSelectionBifrost::g52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(b); + + if (!is_rhs_constant) + { + return CLGEMMKernelType::NATIVE_V1; + } + + if (m == 1) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + + const float r_mn = static_cast(m) / static_cast(n); + const float r_mk = static_cast(m) / static_cast(k); + const float r_nk = static_cast(n) / static_cast(k); + const float r_mnk = static_cast(m) / (static_cast(n) * static_cast(k)); + + if(r_mn <= 1.5469f) + { + if(r_mk <= 0.8766f) + { + if(r_mk <= 0.0211f) + { + if(r_mnk <= 77.5833f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + if(r_nk <= 0.0832f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + } + else + { + 
if(r_mnk <= 193.0000f) + { + if(r_mn <= 0.9948f) + { + if(r_mk <= 2.5453f) + { + return CLGEMMKernelType::RESHAPED; + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + } + else + { + if(r_mn <= 17.7370f) + { + if(r_mnk <= 1391.2875f) + { + if(r_mk <= 2.9724f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if(r_mnk <= 470.0000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + } + else { - gemm_type = CLGEMMKernelType::RESHAPED; + if(r_nk <= 0.1381f) + { + if(r_mnk <= 9040.5000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + if(r_mn <= 5.6790f) + { + return CLGEMMKernelType::RESHAPED; + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } + } + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } +} + +CLGEMMKernelType CLGEMMKernelSelectionBifrost::g76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + ARM_COMPUTE_UNUSED(b); + + if (!is_rhs_constant) + { + return CLGEMMKernelType::NATIVE_V1; + } + + if (m == 1) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + + const float r_mn = static_cast(m) / static_cast(n); + const float r_nk = static_cast(n) / static_cast(k); + + if(k <= 212) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if(r_nk <= 0.4990234375f) + { + if(k <= 1392) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; } else { - gemm_type = CLGEMMKernelType::RESHAPED_ONLY_RHS; + if(m <= 325) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + } + else + { + if(k <= 471) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if(r_mn <= 0.04475911520421505f) + { + return CLGEMMKernelType::RESHAPED; + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } } } } +} - return gemm_type; +CLGEMMKernelType CLGEMMKernelSelectionBifrost::g52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + if (!is_rhs_constant) + { + return CLGEMMKernelType::NATIVE_V1; + } + + if (m == 1) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + + if(n <= 127.0000f) + { + if(n <= 63.5000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if(m <= 3616.0000f) + { + if(b <= 18.5000f) + { + if(m <= 2970.5000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if(k <= 104.0000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + } + else + { + if(m <= 12.5000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if(k <= 104.0000f) + { + if(b <= 18.5000f) + { + if(m <= 490.0000f) + { + if(n <= 272.0000f) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + if(m <= 226.0000f) + { + if(n <= 140.0000f) + { + if(m <= 179.5000f) + { + return CLGEMMKernelType::RESHAPED; + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + else + { + 
return CLGEMMKernelType::RESHAPED; + } + } + } + } } -CLGEMMKernelType CLGEMMKernelSelectionBifrost::g71_f16(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) +CLGEMMKernelType CLGEMMKernelSelectionBifrost::g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { + ARM_COMPUTE_UNUSED(b); + if(is_rhs_constant) { if(m == 1) diff --git a/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h b/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h similarity index 72% rename from arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h rename to src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h index 815c2c8cef..6831a12aec 100644 --- a/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h +++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CLGEMMKERNELSELECTIONBIFROST_H -#define ARM_COMPUTE_CLGEMMKERNELSELECTIONBIFROST_H +#ifndef SRC_CLGEMMKERNELSELECTIONBIFROST_H +#define SRC_CLGEMMKERNELSELECTIONBIFROST_H #include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" @@ -44,12 +44,15 @@ class CLGEMMKernelSelectionBifrost final : public ICLGEMMKernelSelection CLGEMMKernelType select_kernel(const CLGEMMKernelSelectionParams ¶ms) override; private: - CLGEMMKernelType g76_f32(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant); - CLGEMMKernelType g71_f16(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant); - CLGEMMKernelType default_f32(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant); - CLGEMMKernelType default_f16(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant); - CLGEMMKernelType default_q8(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant); + CLGEMMKernelType g52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); }; } // namespace cl_gemm } // namespace arm_compute -#endif /*ARM_COMPUTE_CLGEMMKERNELSELECTIONBIFROST_H */ +#endif /* SRC_CLGEMMKERNELSELECTIONBIFROST_H */ diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp b/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp index 44700ad4f4..d172a827b5 100644 --- a/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp +++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp @@ -21,12 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp b/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp index 44700ad4f4..d172a827b5 100644 --- a/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp +++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp @@ -21,12 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h" +#include "src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h" #include "arm_compute/core/GPUTarget.h" +#include "src/core/CL/gemm/CLGEMMHelpers.h" #include <map> #include <utility> @@ -45,7 +45,7 @@ CLGEMMKernelType CLGEMMKernelSelectionMidgard::select_kernel(const CLGEMMKernelS // _target could be used in the future to have a dedicated heuristic for each GPU IP ARM_COMPUTE_UNUSED(_target); - using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMKernelSelectionMidgard::*)(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant); + using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMKernelSelectionMidgard::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); // Configurations for Midgard architectures static std::map<DataType, FunctionExecutorPtr> gemm_configs = @@ -62,31 +62,31 @@ CLGEMMKernelType CLGEMMKernelSelectionMidgard::select_kernel(const CLGEMMKernelS if(gemm_configs.find(data_type) != gemm_configs.end()) { - return (this->*gemm_configs[data_type])(params.m, params.n, params.k, params.is_rhs_constant); + return (this->*gemm_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); } ARM_COMPUTE_ERROR("Not supported data type"); } -CLGEMMKernelType CLGEMMKernelSelectionMidgard::default_f32(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) +CLGEMMKernelType CLGEMMKernelSelectionMidgard::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - ARM_COMPUTE_UNUSED(n, k); + ARM_COMPUTE_UNUSED(n, k, b); // We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED_V1 : CLGEMMKernelType::NATIVE_V1; } -CLGEMMKernelType CLGEMMKernelSelectionMidgard::default_f16(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) +CLGEMMKernelType CLGEMMKernelSelectionMidgard::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - ARM_COMPUTE_UNUSED(n, k); + ARM_COMPUTE_UNUSED(n, k, b); // We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED_V1 : CLGEMMKernelType::NATIVE_V1; } -CLGEMMKernelType CLGEMMKernelSelectionMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) +CLGEMMKernelType CLGEMMKernelSelectionMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - ARM_COMPUTE_UNUSED(m, n, k, is_rhs_constant); + ARM_COMPUTE_UNUSED(m, n, k, b, is_rhs_constant); return CLGEMMKernelType::NATIVE; } diff --git a/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h b/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h similarity index 86% rename from arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h rename to src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h index 4689f0c041..3f6003f7dc 100644 --- a/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h +++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/ -#ifndef ARM_COMPUTE_CLGEMMKERNELSELECTIONMIDGARD_H -#define ARM_COMPUTE_CLGEMMKERNELSELECTIONMIDGARD_H +#ifndef SRC_CLGEMMKERNELSELECTIONMIDGARD_H +#define SRC_CLGEMMKERNELSELECTIONMIDGARD_H #include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" @@ -44,10 +44,10 @@ class CLGEMMKernelSelectionMidgard final : public ICLGEMMKernelSelection CLGEMMKernelType select_kernel(const CLGEMMKernelSelectionParams &params) override; private: - CLGEMMKernelType default_f32(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant); - CLGEMMKernelType default_f16(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant); - CLGEMMKernelType default_q8(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant); + CLGEMMKernelType default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); }; } // namespace cl_gemm } // namespace arm_compute -#endif /*ARM_COMPUTE_CLGEMMKERNELSELECTIONMIDGARD_H */ +#endif /* SRC_CLGEMMKERNELSELECTIONMIDGARD_H */ diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp b/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp index 8b4c9e75e8..da41859b87 100644 --- a/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp +++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp @@ -21,11 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h" +#include "src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h" +#include "src/core/CL/gemm/CLGEMMHelpers.h" #include <map> #include <utility> @@ -44,10 +44,10 @@ CLGEMMKernelType CLGEMMKernelSelectionValhall::select_kernel(const CLGEMMKernelS // _target could be used in the future to have a dedicated heuristic for each GPU IP ARM_COMPUTE_UNUSED(_target); - using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMKernelSelectionValhall::*)(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant); + using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMKernelSelectionValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); - // Configurations for Valhall architectures - static std::map<DataType, FunctionExecutorPtr> gemm_configs = + // Default configurations for Valhall architectures + static std::map<DataType, FunctionExecutorPtr> gemm_default_configs = { { DataType::F32, &CLGEMMKernelSelectionValhall::default_f32 }, { DataType::F16, &CLGEMMKernelSelectionValhall::default_f16 }, @@ -57,33 +57,157 @@ CLGEMMKernelType CLGEMMKernelSelectionValhall::select_kernel(const CLGEMMKernelS { DataType::QSYMM8_PER_CHANNEL, &CLGEMMKernelSelectionValhall::default_q8 } }; + // Mali-G77 configurations + static std::map<DataType, FunctionExecutorPtr> gemm_g77_configs = + { + { DataType::F32, &CLGEMMKernelSelectionValhall::default_f32 }, + { DataType::F16, &CLGEMMKernelSelectionValhall::g77_f16 }, + { DataType::QASYMM8, &CLGEMMKernelSelectionValhall::default_q8 }, + { DataType::QASYMM8_SIGNED, &CLGEMMKernelSelectionValhall::default_q8 }, + { DataType::QSYMM8, &CLGEMMKernelSelectionValhall::default_q8 }, + { DataType::QSYMM8_PER_CHANNEL, &CLGEMMKernelSelectionValhall::default_q8 } + }; + const DataType data_type = params.data_type; -
if(gemm_configs.find(data_type) != gemm_configs.end()) + switch(_target) { - return (this->*gemm_configs[data_type])(params.m, params.n, params.k, params.is_rhs_constant); + case GPUTarget::G77: + if(gemm_g77_configs.find(data_type) != gemm_g77_configs.end()) + { + return (this->*gemm_g77_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + } + ARM_COMPUTE_ERROR("Not supported data type"); + default: + if(gemm_default_configs.find(data_type) != gemm_default_configs.end()) + { + return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant); + } + ARM_COMPUTE_ERROR("Not supported data type"); } - - ARM_COMPUTE_ERROR("Not supported data type"); } -CLGEMMKernelType CLGEMMKernelSelectionValhall::default_f32(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) +CLGEMMKernelType CLGEMMKernelSelectionValhall::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - ARM_COMPUTE_UNUSED(m, n, k); + ARM_COMPUTE_UNUSED(m, n, k, b); return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE_V1; } -CLGEMMKernelType CLGEMMKernelSelectionValhall::default_f16(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) +CLGEMMKernelType CLGEMMKernelSelectionValhall::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - ARM_COMPUTE_UNUSED(m, n, k); + ARM_COMPUTE_UNUSED(m, n, k, b); return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE_V1; } -CLGEMMKernelType CLGEMMKernelSelectionValhall::default_q8(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant) +CLGEMMKernelType CLGEMMKernelSelectionValhall::g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) +{ + if (!is_rhs_constant) + { + return CLGEMMKernelType::NATIVE_V1; + } + + if (m == 1) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + + const float r_mn = static_cast<float>(m) / static_cast<float>(n); + const float r_mk = static_cast<float>(m) / static_cast<float>(k); + const float r_nk = static_cast<float>(n) / static_cast<float>(k); + const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; + + if(r_mk <= 0.6817956566810608) + { + if(workload <= 801.6000061035156) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if(r_mn <= 0.0839829258620739) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if(r_mk <= 0.24917218834161758) + { + return CLGEMMKernelType::RESHAPED; + } + else + { + if(workload <= 2551.75) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if(workload <= 5061.574951171875) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + } + } + } + } + else + { + if(r_mk <= 4.849947690963745) + { + if(workload <= 17618.4501953125) + { + if(workload <= 5224.699951171875) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + if(r_nk <= 0.7933054566383362) + { + return CLGEMMKernelType::RESHAPED; + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } + } + else + { + if(workload <= 20275.2001953125) + { + return CLGEMMKernelType::RESHAPED; + } + else + { + if(r_mk <= 3.07421875) + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + else + { + return CLGEMMKernelType::RESHAPED; + } + } + } + } + else + { + return CLGEMMKernelType::RESHAPED_ONLY_RHS; + } + } +} +
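[Editor's note] g77_f16 above adds a batch-aware workload metric, (m * n * b) / 20, alongside the shape ratios. A worked example of the metric with hypothetical values (not ACL code):

    #include <cstdio>
    int main()
    {
        const unsigned m = 64, n = 64, b = 8; // hypothetical batched GEMM shape
        const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
        // 64*64*8 / 20 = 1638.4: above the 801.6 split but below 2551.75, so for an
        // r_mk in the mid range the tree above lands on RESHAPED_ONLY_RHS.
        std::printf("workload = %.1f\n", workload);
    }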
+CLGEMMKernelType CLGEMMKernelSelectionValhall::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant) { - ARM_COMPUTE_UNUSED(m, n, k); + ARM_COMPUTE_UNUSED(m, n, k, b); if(is_rhs_constant) { diff --git a/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h b/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h similarity index 81% rename from arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h rename to src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h index 8712be7531..82e46f694e 100644 --- a/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h +++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CLGEMMKERNELSELECTIONVALHALL_H -#define ARM_COMPUTE_CLGEMMKERNELSELECTIONVALHALL_H +#ifndef SRC_CLGEMMKERNELSELECTIONVALHALL_H +#define SRC_CLGEMMKERNELSELECTIONVALHALL_H #include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h" @@ -44,10 +44,11 @@ class CLGEMMKernelSelectionValhall final : public ICLGEMMKernelSelection CLGEMMKernelType select_kernel(const CLGEMMKernelSelectionParams &params) override; private: - CLGEMMKernelType default_f32(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant); - CLGEMMKernelType default_f16(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant); - CLGEMMKernelType default_q8(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant); + CLGEMMKernelType default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); + CLGEMMKernelType g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant); }; } // namespace cl_gemm } // namespace arm_compute -#endif /*ARM_COMPUTE_CLGEMMKERNELSELECTIONVALHALL_H */ +#endif /* SRC_CLGEMMKERNELSELECTIONVALHALL_H */ diff --git a/src/runtime/CL/tuners/BifrostTuner.cpp b/src/runtime/CL/tuners/BifrostTuner.cpp index 52644bf192..9490e0b219 100644 --- a/src/runtime/CL/tuners/BifrostTuner.cpp +++ b/src/runtime/CL/tuners/BifrostTuner.cpp @@ -24,8 +24,8 @@ #include "arm_compute/runtime/CL/tuners/BifrostTuner.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernels.h" -#include "arm_compute/core/utils/misc/Cast.h" +#include "src/core/CL/CLKernels.h" +#include "support/Cast.h" namespace arm_compute { diff --git a/src/runtime/CL/tuners/MidgardTuner.cpp b/src/runtime/CL/tuners/MidgardTuner.cpp index e49e15508b..72734f2207 100644 --- a/src/runtime/CL/tuners/MidgardTuner.cpp +++ b/src/runtime/CL/tuners/MidgardTuner.cpp @@ -24,8 +24,8 @@ #include "arm_compute/runtime/CL/tuners/MidgardTuner.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernels.h" -#include "arm_compute/core/utils/misc/Cast.h" +#include "src/core/CL/CLKernels.h" +#include "support/Cast.h" namespace arm_compute { diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp index 55f62c1387..e6b0ec20b8 100644 --- a/src/runtime/CPP/CPPScheduler.cpp +++ b/src/runtime/CPP/CPPScheduler.cpp @@ -27,7 +27,8 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/runtime/CPUUtils.h" +#include "src/runtime/CPUUtils.h"
"src/runtime/CPUUtils.h" +#include "support/MemorySupport.h" #include "support/Mutex.h" #include @@ -71,61 +72,6 @@ class ThreadFeeder const unsigned int _end; }; -/** Given two dimensions and a maxium number of threads to utilise, calcualte the best - * combination of threads that fit in (mutliplied together) max_threads. - * - * This algorithm assumes that work in either of the dimensions is equally difficult - * to compute - * - * @returns [m_nthreads, n_nthreads] A pair of the threads that should be used in each dimension - */ -std::pair split_2d(unsigned max_threads, std::size_t m, std::size_t n) -{ - /* - * We want the same ratio of threads in M & N to the ratio of m and n problem size - * - * Therefore: mt/nt == m/n where mt*nt == max_threads - * - * max_threads/nt = mt & (max_threads/nt) * (m/n) = nt - * nt^2 = max_threads * (m/n) - * nt = sqrt( max_threads * (m/n) ) - */ - //ratio of m to n in problem dimensions - double ratio = m / static_cast(n); - - // nt = sqrt(max_threads * (m / n) ) - const unsigned adjusted = std::round( - std::sqrt(max_threads * ratio)); - - //find the nearest factor of max_threads - for(unsigned i = 0; i != adjusted; ++i) - { - //try down - const unsigned adj_down = adjusted - i; - if(max_threads % adj_down == 0) - { - return { adj_down, max_threads / adj_down }; - } - - //try up - const unsigned adj_up = adjusted + i; - if(max_threads % adj_up == 0) - { - return { adj_up, max_threads / adj_up }; - } - } - - //we didn't find anything so lets bail out with maxes biased to the largest dimension - if(m > n) - { - return { std::min(m, max_threads), 1 }; - } - else - { - return { 1, std::min(n, max_threads) }; - } -} - /** Execute workloads[info.thread_id] first, then call the feeder to get the index of the next workload to run. * * Will run workloads until the feeder reaches the end of its range. @@ -405,116 +351,6 @@ void CPPScheduler::run_workloads(std::vector &workloads) } #endif /* DOXYGEN_SKIP_THIS */ -void CPPScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel"); - - const Window &max_window = kernel->window(); - - if(hints.split_dimension() == IScheduler::split_dimensions_all) - { - /* - * if the split dim is size_t max then this signals we should parallelise over - * all dimensions - */ - const std::size_t m = max_window.num_iterations(Window::DimX); - const std::size_t n = max_window.num_iterations(Window::DimY); - - //in c++17 this can be swapped for auto [ m_threads, n_threads ] = split_2d(... 
- unsigned m_threads, n_threads; - std::tie(m_threads, n_threads) = split_2d(_impl->_num_threads, m, n); - - std::vector<IScheduler::Workload> workloads; - for(unsigned int ni = 0; ni != n_threads; ++ni) - { - for(unsigned int mi = 0; mi != m_threads; ++mi) - { - workloads.push_back( - [ni, mi, m_threads, n_threads, &max_window, &kernel](const ThreadInfo & info) - { - //narrow the window to our mi-ni workload - Window win = max_window.split_window(Window::DimX, mi, m_threads) - .split_window(Window::DimY, ni, n_threads); - - win.validate(); - - Window thread_locator; - thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads)); - thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads)); - - thread_locator.validate(); - - kernel->run_nd(win, info, thread_locator); - }); - } - } - run_workloads(workloads); - } - else - { - const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); - const unsigned int num_threads = std::min(num_iterations, _impl->_num_threads); - - if(num_iterations == 0) - { - return; - } - - if(!kernel->is_parallelisable() || num_threads == 1) - { - ThreadInfo info; - info.cpu_info = &_cpu_info; - if(tensors.empty()) - { - kernel->run(max_window, info); - } - else - { - kernel->run_op(tensors, max_window, info); - } - } - else - { - unsigned int num_windows = 0; - switch(hints.strategy()) - { - case StrategyHint::STATIC: - num_windows = num_threads; - break; - case StrategyHint::DYNAMIC: - { - const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold()); - // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder - num_windows = num_iterations > granule_threshold ? granule_threshold : num_iterations; - break; - } - default: - ARM_COMPUTE_ERROR("Unknown strategy"); - } - std::vector<IScheduler::Workload> workloads(num_windows); - for(unsigned int t = 0; t < num_windows; t++) - { - //Capture 't' by copy, all the other variables by reference: - workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo & info) - { - Window win = max_window.split_window(hints.split_dimension(), t, num_windows); - win.validate(); - - if(tensors.empty()) - { - kernel->run(win, info); - } - else - { - kernel->run_op(tensors, win, info); - } - }; - } - run_workloads(workloads); - } - } -} - void CPPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, ITensorPack &tensors) { schedule_common(kernel, hints, tensors); diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp index 6f67bc005f..96265ac757 100644 --- a/src/runtime/CPP/SingleThreadScheduler.cpp +++ b/src/runtime/CPP/SingleThreadScheduler.cpp @@ -37,11 +37,15 @@ void SingleThreadScheduler::set_num_threads(unsigned int num_threads) void SingleThreadScheduler::schedule(ICPPKernel *kernel, const Hints &hints) { - const Window &max_window = kernel->window(); - const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); - if(num_iterations < 1) + const Window &max_window = kernel->window(); + + if(hints.split_dimension() != IScheduler::split_dimensions_all) { - return; + const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); + if(num_iterations < 1) + { + return; + } } ThreadInfo info;
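[Editor's note] split_2d is removed from CPPScheduler here and re-homed in src/runtime/SchedulerUtils.cpp (see the scheduler_utils::split_2d call and SchedulerUtils include added to IScheduler.cpp below, and the SchedulerUtils.cpp entry added to Android.bp). A standalone, compilable restatement of the factor search it performs, for experimentation:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdio>
    #include <utility>

    // Same idea as the removed function: pick (mt, nt) with mt*nt == max_threads
    // and mt/nt as close as possible to m/n, via nt = sqrt(max_threads * m/n).
    std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std::size_t n)
    {
        const double ratio      = m / static_cast<double>(n);
        const unsigned adjusted = static_cast<unsigned>(std::round(std::sqrt(max_threads * ratio)));
        for(unsigned i = 0; i != adjusted; ++i)
        {
            const unsigned adj_down = adjusted - i; // walk outwards to the nearest factor
            if(max_threads % adj_down == 0)
                return { adj_down, max_threads / adj_down };
            const unsigned adj_up = adjusted + i;
            if(max_threads % adj_up == 0)
                return { adj_up, max_threads / adj_up };
        }
        // Nothing found: fall back to a 1-D split biased to the larger dimension
        return m > n ? std::make_pair(std::min<unsigned>(m, max_threads), 1u)
                     : std::make_pair(1u, std::min<unsigned>(n, max_threads));
    }

    int main()
    {
        auto mn = split_2d(8, 1024, 256); // ideal nt = sqrt(32) ~ 5.66 -> nearest factor 4
        std::printf("%u x %u\n", mn.first, mn.second); // prints "4 x 2"
    }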
diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp index 9d62733384..fdb4c9f0f6 100644 --- a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp +++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" #include diff --git a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp index 3507a3ac45..31f1fafd69 100644 --- a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp +++ b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" #include #include diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp index 4d6caaee01..a7dd464540 100644 --- a/src/runtime/CPUUtils.cpp +++ b/src/runtime/CPUUtils.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/runtime/CPUUtils.h" +#include "src/runtime/CPUUtils.h" #include "arm_compute/core/CPP/CPPTypes.h" #include "arm_compute/core/Error.h" @@ -352,6 +352,10 @@ int get_max_cpus() namespace arm_compute { +namespace utils +{ +namespace cpu +{ void get_cpu_configuration(CPUInfo &cpuinfo) { #if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) @@ -460,5 +464,6 @@ unsigned int get_threads_hint() return num_threads_hint; } - +} // namespace cpu +} // namespace utils } // namespace arm_compute diff --git a/arm_compute/runtime/CPUUtils.h b/src/runtime/CPUUtils.h similarity index 93% rename from arm_compute/runtime/CPUUtils.h rename to src/runtime/CPUUtils.h index bcc2f666ea..452d3d58ca 100644 --- a/arm_compute/runtime/CPUUtils.h +++ b/src/runtime/CPUUtils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,6 +27,11 @@ namespace arm_compute { class CPUInfo; + +namespace utils +{ +namespace cpu +{ /** This function will try to detect the CPU configuration on the system and will fill * the cpuinfo object accordingly to reflect this. * @@ -40,5 +45,7 @@ void get_cpu_configuration(CPUInfo &cpuinfo); * @return The minimum number of common cores. */ unsigned int get_threads_hint(); -} +} // namespace cpu +} // namespace utils +} // namespace arm_compute #endif /* ARM_COMPUTE_RUNTIME_CPU_UTILS_H */ diff --git a/src/runtime/DeviceProperties.cpp b/src/runtime/DeviceProperties.cpp index 5d7ae020d7..ec9f4a16ed 100644 --- a/src/runtime/DeviceProperties.cpp +++ b/src/runtime/DeviceProperties.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,12 +23,12 @@ */ #include "arm_compute/runtime/DeviceProperties.h" -#include "arm_compute/runtime/CPUUtils.h" +#include "src/runtime/CPUUtils.h" namespace arm_compute { DeviceProperties::DeviceProperties() { - get_cpu_configuration(cpu_info); + utils::cpu::get_cpu_configuration(cpu_info); } } // namespace arm_compute diff --git a/src/runtime/GLES_COMPUTE/GCMemory.cpp b/src/runtime/GLES_COMPUTE/GCMemory.cpp index 998f8a5cc4..4d74555f4e 100644 --- a/src/runtime/GLES_COMPUTE/GCMemory.cpp +++ b/src/runtime/GLES_COMPUTE/GCMemory.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/GLES_COMPUTE/GCMemory.h" -#include "arm_compute/core/utils/misc/Cast.h" #include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h" +#include "support/Cast.h" namespace arm_compute { diff --git a/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp index 9e23974b8d..807412eb17 100644 --- a/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp +++ b/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp @@ -29,6 +29,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" +#include "src/core/helpers/AutoConfiguration.h" + #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp index 48d8cb576b..fdb9a42f13 100644 --- a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp +++ b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp @@ -34,13 +34,13 @@ GCSoftmaxLayer::GCSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager) { } -void GCSoftmaxLayer::configure(const IGCTensor *input, IGCTensor *output, float beta, size_t reduce_end_axis) +void GCSoftmaxLayer::configure(const IGCTensor *input, IGCTensor *output, float beta, int32_t axis) { - ARM_COMPUTE_UNUSED(beta, reduce_end_axis); + ARM_COMPUTE_UNUSED(beta, axis); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON(beta != 1.0f); - ARM_COMPUTE_ERROR_ON_MSG(reduce_end_axis != 0, "Reduce_end_axis must be 0 for GLES"); + ARM_COMPUTE_ERROR_ON_MSG(axis != 0, "axis must be 0 for GLES"); // Create intermediate tensors shapes _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type())); diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp index 6b961d7dfc..43df3d5e23 100644 --- a/src/runtime/IScheduler.cpp +++ b/src/runtime/IScheduler.cpp @@ -23,17 +23,20 @@ */ #include "arm_compute/runtime/IScheduler.h" +#include "arm_compute/core/CPP/ICPPKernel.h" #include "arm_compute/core/Error.h" -#include "arm_compute/runtime/CPUUtils.h" +#include "arm_compute/core/Window.h" +#include "src/runtime/CPUUtils.h" +#include "src/runtime/SchedulerUtils.h" namespace arm_compute { IScheduler::IScheduler() : _cpu_info() { - get_cpu_configuration(_cpu_info); + utils::cpu::get_cpu_configuration(_cpu_info); // Work out the best possible number of execution threads - _num_threads_hint = get_threads_hint(); + _num_threads_hint = utils::cpu::get_threads_hint(); } CPUInfo &IScheduler::cpu_info() @@ -51,6 +54,120 @@ unsigned int IScheduler::num_threads_hint() const { return _num_threads_hint; } + +void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel"); + ARM_COMPUTE_UNUSED(kernel); + ARM_COMPUTE_UNUSED(hints); + ARM_COMPUTE_UNUSED(tensors); +#ifndef BARE_METAL + const Window &max_window = kernel->window(); + if(hints.split_dimension() == IScheduler::split_dimensions_all) + { + /* + * if the split dim is size_t max then this signals we should parallelise over + * all dimensions + */ + const std::size_t m = max_window.num_iterations(Window::DimX); + const std::size_t n = max_window.num_iterations(Window::DimY); + + //in c++17 this can be swapped for auto [
m_threads, n_threads ] = split_2d(... + unsigned m_threads, n_threads; + std::tie(m_threads, n_threads) = scheduler_utils::split_2d(this->num_threads(), m, n); + + std::vector<IScheduler::Workload> workloads; + for(unsigned int ni = 0; ni != n_threads; ++ni) + { + for(unsigned int mi = 0; mi != m_threads; ++mi) + { + workloads.push_back( + [ni, mi, m_threads, n_threads, &max_window, &kernel](const ThreadInfo & info) + { + //narrow the window to our mi-ni workload + Window win = max_window.split_window(Window::DimX, mi, m_threads) + .split_window(Window::DimY, ni, n_threads); + + win.validate(); + + Window thread_locator; + thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads)); + thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads)); + + thread_locator.validate(); + + kernel->run_nd(win, info, thread_locator); + }); + } + } + run_workloads(workloads); + } + else + { + const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); + const unsigned int num_threads = std::min(num_iterations, this->num_threads()); + + if(num_iterations == 0) + { + return; + } + + if(!kernel->is_parallelisable() || num_threads == 1) + { + ThreadInfo info; + info.cpu_info = &_cpu_info; + if(tensors.empty()) + { + kernel->run(max_window, info); + } + else + { + kernel->run_op(tensors, max_window, info); + } + } + else + { + unsigned int num_windows = 0; + switch(hints.strategy()) + { + case StrategyHint::STATIC: + num_windows = num_threads; + break; + case StrategyHint::DYNAMIC: + { + const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold()); + // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder + num_windows = num_iterations > granule_threshold ? granule_threshold : num_iterations; + break; + } + default: + ARM_COMPUTE_ERROR("Unknown strategy"); + } + std::vector<IScheduler::Workload> workloads(num_windows); + for(unsigned int t = 0; t < num_windows; ++t) + { + //Capture 't' by copy, all the other variables by reference: + workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo & info) + { + Window win = max_window.split_window(hints.split_dimension(), t, num_windows); + win.validate(); + + if(tensors.empty()) + { + kernel->run(win, info); + } + else + { + kernel->run_op(tensors, win, info); + } + }; + } + run_workloads(workloads); + } + } +#endif /* !BARE_METAL */ +} + void IScheduler::run_tagged_workloads(std::vector<Workload> &workloads, const char *tag) { ARM_COMPUTE_UNUSED(tag);
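[Editor's note] schedule_common, moved here from CPPScheduler, splits the kernel's window either along one dimension or, for split_dimensions_all, across a 2-D grid of m_threads x n_threads workers. A toy model of the slicing arithmetic with Window mocked as plain ranges (not the ACL API; ACL's Window::split_window may distribute remainders differently):

    #include <algorithm>
    #include <cstdio>

    struct Range { unsigned start, end; };

    // One common near-equal split of [0, total) into `num` slices.
    Range split(unsigned total, unsigned id, unsigned num)
    {
        const unsigned base = total / num, rem = total % num;
        const unsigned start = id * base + std::min(id, rem);
        return { start, start + base + (id < rem ? 1u : 0u) };
    }

    int main()
    {
        const unsigned m = 100, n = 60, m_threads = 2, n_threads = 3;
        for(unsigned ni = 0; ni != n_threads; ++ni)
            for(unsigned mi = 0; mi != m_threads; ++mi)
            {
                const Range rx = split(m, mi, m_threads); // x[0,50) or x[50,100)
                const Range ry = split(n, ni, n_threads); // y[0,20), y[20,40) or y[40,60)
                std::printf("worker (%u,%u): x[%u,%u) y[%u,%u)\n", mi, ni, rx.start, rx.end, ry.start, ry.end);
            }
    }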
diff --git a/src/runtime/NEON/INEOperator.cpp b/src/runtime/NEON/INEOperator.cpp index 75068b15c9..a13b29b572 100644 --- a/src/runtime/NEON/INEOperator.cpp +++ b/src/runtime/NEON/INEOperator.cpp @@ -22,12 +22,16 @@ * SOFTWARE. */ #include "arm_compute/runtime/NEON/INEOperator.h" +#include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { namespace experimental { +INEOperator::~INEOperator() = default; + INEOperator::INEOperator(IRuntimeContext *ctx) : _kernel(), _ctx(ctx), _workspace() { diff --git a/src/runtime/NEON/INESimpleFunction.cpp b/src/runtime/NEON/INESimpleFunction.cpp index cef2762e37..5438bce62a 100644 --- a/src/runtime/NEON/INESimpleFunction.cpp +++ b/src/runtime/NEON/INESimpleFunction.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,9 +23,14 @@ */ #include "arm_compute/runtime/NEON/INESimpleFunction.h" +#include "arm_compute/core/CPP/ICPPKernel.h" +#include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" -using namespace arm_compute; +namespace arm_compute +{ +INESimpleFunction::~INESimpleFunction() = default; INESimpleFunction::INESimpleFunction() // NOLINT : _kernel(), @@ -35,6 +40,7 @@ INESimpleFunction::INESimpleFunction() // NOLINT void INESimpleFunction::run() { - NEScheduler::get().schedule(&_border_handler, Window::DimZ); + NEScheduler::get().schedule(_border_handler.get(), Window::DimZ); NEScheduler::get().schedule(_kernel.get(), Window::DimY); } +} //namespace arm_compute diff --git a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp index 82316c49c6..21dd58e378 100644 --- a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp +++ b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,11 +23,15 @@ */ #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/Utils.h" +#include "src/core/NEON/INEKernel.h" +#include "src/runtime/Utils.h" namespace arm_compute { +INESimpleFunctionNoBorder::~INESimpleFunctionNoBorder() = default; + INESimpleFunctionNoBorder::INESimpleFunctionNoBorder(IRuntimeContext *ctx) : _kernel(), _ctx(ctx) @@ -36,6 +40,6 @@ INESimpleFunctionNoBorder::INESimpleFunctionNoBorder(IRuntimeContext *ctx) void INESimpleFunctionNoBorder::run() { - schedule_kernel_on_ctx(_ctx, _kernel.get(), Window::DimY); + utils::schedule_kernel_on_ctx(_ctx, _kernel.get(), Window::DimY); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEAbsoluteDifference.cpp b/src/runtime/NEON/functions/NEAbsoluteDifference.cpp index ec27820126..df2bc7d72e 100644 --- a/src/runtime/NEON/functions/NEAbsoluteDifference.cpp +++ b/src/runtime/NEON/functions/NEAbsoluteDifference.cpp @@ -23,12 +23,14 @@ */ #include "arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h" -#include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h" +#include "src/core/NEON/kernels/NEAbsoluteDifferenceKernel.h" #include "support/MemorySupport.h" #include -using namespace arm_compute; +namespace arm_compute +{ +NEAbsoluteDifference::~NEAbsoluteDifference() = default; void NEAbsoluteDifference::configure(const ITensor *input1, const ITensor *input2, ITensor *output) { @@ -36,3 +38,4 @@ void NEAbsoluteDifference::configure(const ITensor *input1, const ITensor *input k->configure(input1, input2, output); _kernel = std::move(k); } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEAccumulate.cpp b/src/runtime/NEON/functions/NEAccumulate.cpp index 662f8ccb5b..20eefd9d2d 100644 --- a/src/runtime/NEON/functions/NEAccumulate.cpp +++ b/src/runtime/NEON/functions/NEAccumulate.cpp @@ -23,12 +23,14 @@ */ #include "arm_compute/runtime/NEON/functions/NEAccumulate.h" -#include "arm_compute/core/NEON/kernels/NEAccumulateKernel.h" +#include "src/core/NEON/kernels/NEAccumulateKernel.h" #include "support/MemorySupport.h" #include -using namespace arm_compute; +namespace arm_compute +{ +NEAccumulate::~NEAccumulate() = default; void NEAccumulate::configure(const ITensor *input, ITensor *output) { @@ -37,6
+39,8 @@ void NEAccumulate::configure(const ITensor *input, ITensor *output) _kernel = std::move(k); } +NEAccumulateWeighted::~NEAccumulateWeighted() = default; + void NEAccumulateWeighted::configure(const ITensor *input, float alpha, ITensor *output, bool use_fp16) { if(use_fp16) @@ -53,9 +57,12 @@ void NEAccumulateWeighted::configure(const ITensor *input, float alpha, ITensor } } +NEAccumulateSquared::~NEAccumulateSquared() = default; + void NEAccumulateSquared::configure(const ITensor *input, uint32_t shift, ITensor *output) { auto k = arm_compute::support::cpp14::make_unique<NEAccumulateSquaredKernel>(); k->configure(input, shift, output); _kernel = std::move(k); } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp index 7f55edbf70..f9ad298e4d 100644 --- a/src/runtime/NEON/functions/NEActivationLayer.cpp +++ b/src/runtime/NEON/functions/NEActivationLayer.cpp @@ -24,16 +24,18 @@ #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h" #include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/IRuntimeContext.h" #include "arm_compute/runtime/Tensor.h" +#include "src/core/NEON/kernels/NEActivationLayerKernel.h" #include "support/MemorySupport.h" namespace arm_compute { namespace experimental { +NEActivationLayer::~NEActivationLayer() = default; + void NEActivationLayer::configure(const ITensorInfo *input, ITensorInfo *output, const ActivationLayerInfo &activation_info) { auto k = arm_compute::support::cpp14::make_unique<NEActivationLayerKernel>();
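[Editor's note] The recurring change in the NEON function files here (an out-of-line "~F() = default" plus kernel members held through unique_ptr and configured via make_unique) is the standard incomplete-type pattern: the public header only forward-declares the kernel, and the destructor is defined in the .cpp where the kernel type is complete, which is what lets the kernel headers move from arm_compute/ into src/. A self-contained sketch of the pattern, with illustrative names and a single file standing in for the header/source pair:

    #include <cstdio>
    #include <memory>

    class Kernel; // "header" side: a forward declaration is enough for unique_ptr
    class Function
    {
    public:
        Function();
        ~Function(); // must be defined where Kernel is a complete type
        void run();
    private:
        std::unique_ptr<Kernel> _kernel;
    };

    // "source" side: Kernel is complete from here on
    class Kernel
    {
    public:
        void run() { std::puts("kernel"); }
    };

    Function::Function() : _kernel(new Kernel()) {}
    Function::~Function() = default; // unique_ptr<Kernel>'s deleter is instantiated here
    void Function::run() { _kernel->run(); }

    int main() { Function f; f.run(); }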
diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp index 0664d3c9d5..2a9bb76c7f 100644 --- a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp +++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,9 +29,14 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" + +#include "support/MemorySupport.h" namespace arm_compute { +NEArgMinMaxLayer::~NEArgMinMaxLayer() = default; + NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager) : _reduction_function(support::cpp14::make_unique<NEReductionOperation>()) { diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp index 4453a015e8..0bf9a09333 100644 --- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp +++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp @@ -24,7 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h" +#include "src/core/NEON/kernels/NEArithmeticAdditionKernel.h" #include "support/MemorySupport.h" #include @@ -33,6 +33,8 @@ namespace arm_compute { namespace experimental { +NEArithmeticAddition::~NEArithmeticAddition() = default; + void NEArithmeticAddition::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp index 1c95bbfae8..ba3f426269 100644 --- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp +++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp @@ -24,7 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h" +#include "src/core/NEON/kernels/NEArithmeticSubtractionKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp index 5a593e9c74..d0fdfcf101 100644 --- a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -29,8 +29,13 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h" -using namespace arm_compute; +#include "support/MemorySupport.h" + +namespace arm_compute +{ +NEBatchNormalizationLayer::~NEBatchNormalizationLayer() = default; NEBatchNormalizationLayer::NEBatchNormalizationLayer() : _norm_kernel() @@ -41,7 +46,8 @@ void NEBatchNormalizationLayer::configure(ITensor *input, ITensor *output, const ActivationLayerInfo act_info) { // Configure kernel - _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon, act_info); + _norm_kernel = arm_compute::support::cpp14::make_unique<NEBatchNormalizationLayerKernel>(); + _norm_kernel->configure(input, output, mean, var, beta, gamma, epsilon, act_info); } Status NEBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta, const ITensorInfo *gamma, @@ -53,5 +59,6 @@ Status NEBatchNormalizationLayer::validate(const ITensorInfo *input, const ITens void NEBatchNormalizationLayer::run() { - NEScheduler::get().schedule(&_norm_kernel, Window::DimY); + NEScheduler::get().schedule(_norm_kernel.get(), Window::DimY); } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp index c06a8aa0e0..77a63c0f63 100644 --- a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp +++ b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,6 +28,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h" + +#include "support/MemorySupport.h" namespace arm_compute { diff --git a/src/runtime/NEON/functions/NEBitwiseAnd.cpp b/src/runtime/NEON/functions/NEBitwiseAnd.cpp index 1d89308565..f3b5220ccf 100644 --- a/src/runtime/NEON/functions/NEBitwiseAnd.cpp +++ b/src/runtime/NEON/functions/NEBitwiseAnd.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h" -#include "arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h" +#include "src/core/NEON/kernels/NEBitwiseAndKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/NEON/functions/NEBitwiseNot.cpp b/src/runtime/NEON/functions/NEBitwiseNot.cpp index 585b059005..036584ea1a 100644 --- a/src/runtime/NEON/functions/NEBitwiseNot.cpp +++ b/src/runtime/NEON/functions/NEBitwiseNot.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseNot.h" -#include "arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h" +#include "src/core/NEON/kernels/NEBitwiseNotKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/NEON/functions/NEBitwiseOr.cpp b/src/runtime/NEON/functions/NEBitwiseOr.cpp index bba866d97a..fc905a0919 100644 --- a/src/runtime/NEON/functions/NEBitwiseOr.cpp +++ b/src/runtime/NEON/functions/NEBitwiseOr.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h" -#include "arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h" +#include "src/core/NEON/kernels/NEBitwiseOrKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/NEON/functions/NEBitwiseXor.cpp b/src/runtime/NEON/functions/NEBitwiseXor.cpp
index 188fe3d9ef..301a0c4659 100644 --- a/src/runtime/NEON/functions/NEBitwiseXor.cpp +++ b/src/runtime/NEON/functions/NEBitwiseXor.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseXor.h" -#include "arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h" +#include "src/core/NEON/kernels/NEBitwiseXorKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp index b1ecfaf314..0b639430b1 100644 --- a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp +++ b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h" +#include "src/core/NEON/kernels/NEBoundingBoxTransformKernel.h" #include "support/MemorySupport.h" diff --git a/src/runtime/NEON/functions/NEBox3x3.cpp b/src/runtime/NEON/functions/NEBox3x3.cpp index a380377daa..01d2356a4c 100644 --- a/src/runtime/NEON/functions/NEBox3x3.cpp +++ b/src/runtime/NEON/functions/NEBox3x3.cpp @@ -23,14 +23,15 @@ */ #include "arm_compute/runtime/NEON/functions/NEBox3x3.h" -#include "arm_compute/core/NEON/kernels/NEBox3x3Kernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/NEON/kernels/NEBox3x3Kernel.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" #include "support/MemorySupport.h" #include -using namespace arm_compute; - +namespace arm_compute +{ void NEBox3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value, bool use_fp16) { if(use_fp16) @@ -45,5 +46,8 @@ void NEBox3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode k->configure(input, output, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); } - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>(); + b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler = std::move(b); } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp index d7ec52c5ac..bf4f7d7933 100644 --- a/src/runtime/NEON/functions/NECannyEdge.cpp +++ b/src/runtime/NEON/functions/NECannyEdge.cpp @@ -25,8 +25,6 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h" -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" @@ -34,13 +32,19 @@ #include "arm_compute/runtime/NEON/functions/NESobel5x5.h" #include "arm_compute/runtime/NEON/functions/NESobel7x7.h" #include "arm_compute/runtime/TensorAllocator.h" +#include "src/core/NEON/kernels/NECannyEdgeKernel.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/NEON/kernels/NESobel5x5Kernel.h" +#include "src/core/NEON/kernels/NESobel7x7Kernel.h" #include "support/MemorySupport.h" #include #include #include -using namespace arm_compute; +namespace arm_compute +{ +NECannyEdge::~NECannyEdge() = default; NECannyEdge::NECannyEdge(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT : _memory_group(std::move(memory_manager)), @@ -139,21 +143,25 @@ void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr, _memory_group.manage(&_nonmax); // Configure non-maxima suppression
- _non_max_suppr.configure(&_magnitude, &_phase, &_nonmax, upper_thr, lower_thr, border_mode == BorderMode::UNDEFINED); + _non_max_suppr = arm_compute::support::cpp14::make_unique<NEEdgeNonMaxSuppressionKernel>(); + _non_max_suppr->configure(&_magnitude, &_phase, &_nonmax, upper_thr, lower_thr, border_mode == BorderMode::UNDEFINED); // Fill border around magnitude image as non-maxima suppression will access // it. If border mode is undefined filling the border is a nop. - _border_mag_gradient.configure(&_magnitude, _non_max_suppr.border_size(), border_mode, constant_border_value); + _border_mag_gradient = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>(); + _border_mag_gradient->configure(&_magnitude, _non_max_suppr->border_size(), border_mode, constant_border_value); // Allocate intermediate tensors _phase.allocator()->allocate(); _magnitude.allocator()->allocate(); // Configure edge tracing - _edge_trace.configure(&_nonmax, output); + _edge_trace = arm_compute::support::cpp14::make_unique<NEEdgeTraceKernel>(); + _edge_trace->configure(&_nonmax, output); // Fill border with "No edge" to stop recursion in edge trace - _border_edge_trace.configure(&_nonmax, _edge_trace.border_size(), BorderMode::CONSTANT, static_cast<float>(0.f)); + _border_edge_trace = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>(); + _border_edge_trace->configure(&_nonmax, _edge_trace->border_size(), BorderMode::CONSTANT, static_cast<float>(0.f)); // Allocate intermediate tensors _nonmax.allocator()->allocate(); @@ -172,17 +180,18 @@ void NECannyEdge::run() NEScheduler::get().schedule(_gradient.get(), Window::DimY); // Fill border before non-maxima suppression. Nop for border mode undefined. - NEScheduler::get().schedule(&_border_mag_gradient, Window::DimZ); + NEScheduler::get().schedule(_border_mag_gradient.get(), Window::DimZ); // Run non-maxima suppression - NEScheduler::get().schedule(&_non_max_suppr, Window::DimY); + NEScheduler::get().schedule(_non_max_suppr.get(), Window::DimY); ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr); std::fill_n(_output->buffer(), _output->info()->total_size(), 0); // Fill border before edge trace - NEScheduler::get().schedule(&_border_edge_trace, Window::DimZ); + NEScheduler::get().schedule(_border_edge_trace.get(), Window::DimZ); // Run edge tracing - NEScheduler::get().schedule(&_edge_trace, Window::DimY); + NEScheduler::get().schedule(_edge_trace.get(), Window::DimY); } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NECast.cpp b/src/runtime/NEON/functions/NECast.cpp index 4b35110417..7fd2605fd2 100644 --- a/src/runtime/NEON/functions/NECast.cpp +++ b/src/runtime/NEON/functions/NECast.cpp @@ -24,8 +24,8 @@ #include "arm_compute/runtime/NEON/functions/NECast.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h" #include "arm_compute/core/TensorInfo.h" +#include "src/core/NEON/kernels/NEDepthConvertLayerKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/NEON/functions/NEChannelCombine.cpp b/src/runtime/NEON/functions/NEChannelCombine.cpp index e987951097..f8a9be0313 100644 --- a/src/runtime/NEON/functions/NEChannelCombine.cpp +++ b/src/runtime/NEON/functions/NEChannelCombine.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/NEON/functions/NEChannelCombine.h" -#include "arm_compute/core/NEON/kernels/NEChannelCombineKernel.h" +#include "src/core/NEON/kernels/NEChannelCombineKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/NEON/functions/NEChannelExtract.cpp b/src/runtime/NEON/functions/NEChannelExtract.cpp index
d78a8f8301..8f5e4d47d9 100644 --- a/src/runtime/NEON/functions/NEChannelExtract.cpp +++ b/src/runtime/NEON/functions/NEChannelExtract.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/NEON/functions/NEChannelExtract.h" -#include "arm_compute/core/NEON/kernels/NEChannelExtractKernel.h" +#include "src/core/NEON/kernels/NEChannelExtractKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp index 0392a92663..c72dec67ee 100644 --- a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp +++ b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp @@ -23,8 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h" -#include "arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/kernels/NEChannelShuffleLayerKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/NEON/functions/NECol2Im.cpp b/src/runtime/NEON/functions/NECol2Im.cpp index e4fe36fd25..0706125157 100644 --- a/src/runtime/NEON/functions/NECol2Im.cpp +++ b/src/runtime/NEON/functions/NECol2Im.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/NEON/functions/NECol2Im.h" -#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h" +#include "src/core/NEON/kernels/NECol2ImKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/NEON/functions/NEColorConvert.cpp b/src/runtime/NEON/functions/NEColorConvert.cpp index 7befac7aa3..ebdd1046ce 100644 --- a/src/runtime/NEON/functions/NEColorConvert.cpp +++ b/src/runtime/NEON/functions/NEColorConvert.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/NEON/functions/NEColorConvert.h" -#include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h" +#include "src/core/NEON/kernels/NEColorConvertKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/NEON/functions/NEComputeAllAnchors.cpp b/src/runtime/NEON/functions/NEComputeAllAnchors.cpp index cb89117ff9..3f5712dd3a 100644 --- a/src/runtime/NEON/functions/NEComputeAllAnchors.cpp +++ b/src/runtime/NEON/functions/NEComputeAllAnchors.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/runtime/NEON/functions/NEComputeAllAnchors.h" +#include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp index 8df4f4cb62..03a01aec6b 100644 --- a/src/runtime/NEON/functions/NEConcatenateLayer.cpp +++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp @@ -23,10 +23,10 @@ */ #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h" -#include "arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h" +#include "src/core/NEON/kernels/NEBatchConcatenateLayerKernel.h" +#include "src/core/NEON/kernels/NEDepthConcatenateLayerKernel.h" +#include "src/core/NEON/kernels/NEHeightConcatenateLayerKernel.h" +#include "src/core/NEON/kernels/NEWidthConcatenateLayerKernel.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" @@ -35,6 +35,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" 
#include "arm_compute/core/Types.h" +#include "src/core/helpers/AutoConfiguration.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp index f697efb367..291afe0273 100644 --- a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp +++ b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,9 +22,13 @@ * SOFTWARE. */ #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" +#include "src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { +NEConvertFullyConnectedWeights::~NEConvertFullyConnectedWeights() = default; + NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights() : _kernel() { @@ -33,7 +37,8 @@ NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights() void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, DataLayout data_layout) { - _kernel.configure(input, output, original_input_shape, data_layout); + _kernel = arm_compute::support::cpp14::make_unique(); + _kernel->configure(input, output, original_input_shape, data_layout); } Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, @@ -44,6 +49,6 @@ Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input, const void NEConvertFullyConnectedWeights::run() { - NEScheduler::get().schedule(&_kernel, Window::DimZ); + NEScheduler::get().schedule(_kernel.get(), Window::DimZ); } } // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp index 8200a08ca8..07ac8bd42b 100644 --- a/src/runtime/NEON/functions/NEConvolution.cpp +++ b/src/runtime/NEON/functions/NEConvolution.cpp @@ -25,28 +25,38 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/TensorAllocator.h" +#include "src/core/NEON/kernels/NEConvolutionKernel.h" +#include "src/core/NEON/kernels/NEConvolutionKernel.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" #include "support/MemorySupport.h" #include #include -using namespace arm_compute; +namespace arm_compute +{ +NEConvolution3x3::~NEConvolution3x3() = default; void NEConvolution3x3::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) { auto k = arm_compute::support::cpp14::make_unique(); k->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + + auto b = arm_compute::support::cpp14::make_unique(); + b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler = std::move(b); } +template +NEConvolutionSquare::~NEConvolutionSquare() = default; + 
template <unsigned int matrix_size> NEConvolutionSquare<matrix_size>::NEConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager) : _memory_group(std::move(memory_manager)), _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler() @@ -66,6 +76,7 @@ void NEConvolutionSquare<matrix_size>::configure(ITensor *input, ITensor *output _is_separable = separate_matrix(conv, conv_col.data(), conv_row.data(), matrix_size); + auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>(); if(_is_separable) { DataType intermediate_type = DataType::UNKNOWN; @@ -82,35 +93,40 @@ void NEConvolutionSquare<matrix_size>::configure(ITensor *input, ITensor *output scale = calculate_matrix_scale(conv, matrix_size); } - _kernel_hor.configure(input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED); - _kernel_vert.configure(&_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED); + _kernel_hor = arm_compute::support::cpp14::make_unique<NESeparableConvolutionHorKernel<matrix_size>>(); + _kernel_vert = arm_compute::support::cpp14::make_unique<NESeparableConvolutionVertKernel<matrix_size>>(); + + _kernel_hor->configure(input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED); + _kernel_vert->configure(&_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED); _tmp.allocator()->allocate(); - _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value)); + b->configure(input, _kernel_hor->border_size(), border_mode, PixelValue(constant_border_value)); } else { - _kernel.configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED); - _border_handler.configure(input, _kernel.border_size(), border_mode, PixelValue(constant_border_value)); + _kernel = arm_compute::support::cpp14::make_unique<NEConvolutionKernel<matrix_size>>(); + _kernel->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED); + b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); } + _border_handler = std::move(b); } template <unsigned int matrix_size> void NEConvolutionSquare<matrix_size>::run() { - NEScheduler::get().schedule(&_border_handler, Window::DimZ); + NEScheduler::get().schedule(_border_handler.get(), Window::DimZ); if(_is_separable) { MemoryGroupResourceScope scope_mg(_memory_group); - NEScheduler::get().schedule(&_kernel_hor, Window::DimY); - NEScheduler::get().schedule(&_kernel_vert, Window::DimY); + NEScheduler::get().schedule(_kernel_hor.get(), Window::DimY); + NEScheduler::get().schedule(_kernel_vert.get(), Window::DimY); } else { - NEScheduler::get().schedule(&_kernel, Window::DimY); + NEScheduler::get().schedule(_kernel.get(), Window::DimY); } } @@ -118,10 +134,16 @@ template class arm_compute::NEConvolutionSquare<5>; template class arm_compute::NEConvolutionSquare<7>; template class arm_compute::NEConvolutionSquare<9>; +NEConvolutionRectangle::~NEConvolutionRectangle() = default; + void NEConvolutionRectangle::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value) { auto k = arm_compute::support::cpp14::make_unique<NEConvolutionRectangleKernel>(); k->configure(input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + + auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>(); + b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler = std::move(b); } +} // namespace arm_compute
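[Editor's note] NEConvolutionSquare above runs a horizontal then a vertical 1-D kernel whenever separate_matrix succeeds, i.e. when the square matrix is an outer product of a column and a row vector. A small numeric check of why the two passes are equivalent (illustration only, not ACL code):

    #include <cstdio>
    int main()
    {
        const int row[3] = { 1, 2, 1 }, col[3] = { 1, 2, 1 };
        // Separable 3x3 matrix: conv[y][x] = col[y] * row[x]
        int direct = 0;
        for(int y = 0; y < 3; ++y)
            for(int x = 0; x < 3; ++x)
                direct += col[y] * row[x]; // 2-D accumulation over an image of ones
        // Two 1-D passes over the same image of ones:
        const int separable = (row[0] + row[1] + row[2]) * (col[0] + col[1] + col[2]);
        std::printf("direct=%d separable=%d\n", direct, separable); // both print 16
    }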
index 491425c487..cc5f160787 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -27,6 +27,12 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
+
 #include "support/MemorySupport.h"
 
 #include <cmath>
@@ -50,6 +56,7 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const
     ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
                                                             enable_fast_math, num_groups));
 
+    const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
     switch(NEConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math))
     {
         case ConvolutionMethod::WINOGRAD:
@@ -66,6 +73,13 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const
             _function = std::move(f);
             break;
         }
+        case ConvolutionMethod::GEMM_CONV2D:
+        {
+            auto f = arm_compute::support::cpp14::make_unique<NEGEMMConv2d>(_memory_manager);
+            f->configure(input, weights, biases, output, info);
+            _function = std::move(f);
+            break;
+        }
         case ConvolutionMethod::DIRECT:
         {
             auto f = arm_compute::support::cpp14::make_unique<NEDirectConvolutionLayer>(_memory_manager);
@@ -91,22 +105,22 @@ Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo
 {
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on NEON");
 
+    const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
     switch(NEConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math))
     {
         case ConvolutionMethod::WINOGRAD:
-            //Validate Winograd
             ARM_COMPUTE_RETURN_ON_ERROR(NEWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
             break;
         case ConvolutionMethod::GEMM:
-            //Validate Gemm-based Convolution
             ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info));
             break;
+        case ConvolutionMethod::GEMM_CONV2D:
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMConv2d::validate(input, weights, biases, output, info));
+            break;
         case ConvolutionMethod::DIRECT:
-            //Validate Direct Convolution
             ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
             break;
         case ConvolutionMethod::FFT:
-            // Validate FFT-based convolution layer
             ARM_COMPUTE_RETURN_ON_ERROR(NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info));
             break;
         default:
@@ -128,6 +142,8 @@ ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *
     const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
     const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
 
+    const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, 1);
+
     /* Input spatial dims, kernel size, IFM/OFM, conv info*/
     using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo>;
     using ConfigurationMethod      = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
@@ -214,7 +230,21 @@ ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *
         }
     }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    return bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
+    // For 1x1 convolutions run the default GEMM
+    if(weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1)
+    {
+        return ConvolutionMethod::GEMM;
+    }
+
+    if(bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)))
+    {
+        return ConvolutionMethod::WINOGRAD;
+    }
+    if(bool(NEGEMMConv2d::validate(input, weights, nullptr, output, info)))
+    {
+        return ConvolutionMethod::GEMM_CONV2D;
+    }
+    return ConvolutionMethod::GEMM;
 }
 }
diff --git a/src/runtime/NEON/functions/NECopy.cpp b/src/runtime/NEON/functions/NECopy.cpp
index a461c18894..9e7bf40559 100644
--- a/src/runtime/NEON/functions/NECopy.cpp
+++ b/src/runtime/NEON/functions/NECopy.cpp
@@ -23,13 +23,15 @@
 */
 #include "arm_compute/runtime/NEON/functions/NECopy.h"
 
-#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
+#include "src/core/NEON/kernels/NECopyKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
 namespace arm_compute
 {
+NECopy::~NECopy() = default;
+
 void NECopy::configure(ITensor *input, ITensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<NECopyKernel>();
diff --git a/src/runtime/NEON/functions/NECropResize.cpp b/src/runtime/NEON/functions/NECropResize.cpp
index f6ed2ec250..2e2d2251b6 100644
--- a/src/runtime/NEON/functions/NECropResize.cpp
+++ b/src/runtime/NEON/functions/NECropResize.cpp
@@ -24,11 +24,16 @@
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/NEON/functions/NECropResize.h"
+#include "src/core/NEON/kernels/NECropKernel.h"
+
+#include "support/MemorySupport.h"
 
 #include
 
 namespace arm_compute
 {
+NECropResize::~NECropResize() = default;
+
 NECropResize::NECropResize()
     : _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _crop(), _scale(), _crop_results(), _scaled_results()
 {
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index dff3070239..2b5b0082c4 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -28,6 +28,8 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 using namespace arm_compute::misc::shape_calculator;
 
diff --git a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
index 1ffcca0d7f..af0f5efb69 100644
--- a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
@@ -23,7 +23,7 @@
 */
 #include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h"
+#include "src/core/NEON/kernels/NEDepthConvertLayerKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
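For reference, the reworked `get_convolution_method()` above amounts to the following preference chain (a simplified sketch using the `validate()` calls from the patch; the real function also consults a table of known winning configurations and FP16-specific heuristics before reaching this point):

```cpp
// Simplified fallback chain; names match the patch, structure is condensed.
ConvolutionMethod choose(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output,
                         const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
                         bool enable_fast_math, const Conv2dInfo &info, size_t idx_w, size_t idx_h)
{
    // 1x1 kernels: im2col degenerates to a plain matrix multiply, so use the default GEMM.
    if(weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1)
    {
        return ConvolutionMethod::GEMM;
    }
    // Otherwise prefer Winograd when the shape/type supports it...
    if(bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)))
    {
        return ConvolutionMethod::WINOGRAD;
    }
    // ...then the new assembly-backed NEGEMMConv2d path...
    if(bool(NEGEMMConv2d::validate(input, weights, nullptr, output, info)))
    {
        return ConvolutionMethod::GEMM_CONV2D;
    }
    // ...and fall back to the im2col + GEMM convolution.
    return ConvolutionMethod::GEMM;
}
```

diff --git a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp index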
e363f89482..c4f15e3b68 100644 --- a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,6 +28,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h" + +#include "support/MemorySupport.h" namespace arm_compute { diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp index cfdf2038b9..fc97279211 100644 --- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp @@ -27,6 +27,8 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h" +#include "support/MemorySupport.h" using namespace arm_compute::misc; using namespace arm_compute::misc::shape_calculator; @@ -58,24 +60,7 @@ Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx)); } - const bool is_quantized = (!is_data_type_quantized_per_channel(weights->data_type())) && is_data_type_quantized_asymmetric(input->data_type()); - - if(!NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation)) - { - TensorInfo accumulator = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); - ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, is_quantized ? 
&accumulator : output, conv_info, depth_multiplier, dilation)); - - if(is_quantized) - { - DirectConvolutionLayerOutputStageKernelInfo direct_conv_info; - direct_conv_info.output_data_type = input->data_type(); - ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, biases, output, direct_conv_info)); - } - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation)); - } + ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation)); //Validate Activation Layer if(act_info.enabled()) @@ -86,118 +71,36 @@ Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo } } // namespace +NEDepthwiseConvolutionLayer::~NEDepthwiseConvolutionLayer() = default; + NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr memory_manager) - : _memory_group(memory_manager), _dwc_kernel(), _dwc_optimized_func(memory_manager), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), - _activationlayer_function(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_optimized(false), - _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false) + : _memory_group(memory_manager), _dwc_optimized_func(memory_manager), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _accumulator(), _permuted_input(), + _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false) { } -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure_generic(ITensor *input, - const ITensor *weights, - const ITensor *biases, - ITensor *output, - const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) +void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - ARM_COMPUTE_UNUSED(act_info); - - PixelValue zero_value(0.f); - - // Initialize the intermediate accumulator tensor in case of quantized input - if(_is_quantized) - { - TensorShape accum_shape = output->info()->tensor_shape(); - DataLayout accum_layout = output->info()->data_layout(); - if(!_is_nchw) - { - permute(accum_shape, PermutationVector(1U, 2U, 0U)); - accum_layout = DataLayout::NCHW; - } - - _memory_group.manage(&_accumulator); - _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, output->info()->quantization_info())); - _accumulator.info()->set_data_layout(accum_layout); - zero_value = PixelValue(static_cast(input->info()->quantization_info().uniform().offset)); - } - - if(!_is_nchw) - { - _memory_group.manage(&_permuted_input); - _memory_group.manage(&_permuted_output); - - // Configure the function to transform the input tensor from NHWC -> NCHW - _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 
0U)); - _permuted_input.info()->set_data_layout(DataLayout::NCHW); - - // Configure the function to transform the weights tensor from HWI -> IHW - _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U)); - _permuted_weights.info()->set_data_layout(DataLayout::NCHW); - _permuted_output.info()->set_quantization_info(output->info()->quantization_info()); - - // Configure depthwise - _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier, dilation); - - // Configure border handler - _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value); - - // Allocate tensors - _permuted_input.allocator()->allocate(); - } - else - { - // Configure depthwise convolution kernel - _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier, dilation); - - // Configure border handler - _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value); - } - - // Configure biases accumulation - if(_is_quantized) - { - const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform(); - const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = (output->info()->total_size() == 0) ? iq_info : output->info()->quantization_info().uniform(); - - float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale; - int32_t output_multiplier; - int32_t output_shift; - quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); - - DirectConvolutionLayerOutputStageKernelInfo direct_conv_info; - direct_conv_info.result_fixedpoint_multiplier = output_multiplier; - direct_conv_info.result_shift = output_shift; - direct_conv_info.result_offset_after_shift = oq_info.offset; - direct_conv_info.output_data_type = input->info()->data_type(); - _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, direct_conv_info); - _accumulator.allocator()->allocate(); - } - else if(_has_bias) - { - _output_stage_kernel.configure(_is_nchw ? output : &_permuted_output, biases); - } + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayerOptimizedInternal::validate(input->info(), weights->info(), (biases == nullptr) ? 
nullptr : biases->info(), + output->info(), conv_info, depth_multiplier, act_info, dilation)); - // Permute output - if(!_is_nchw) - { - // Configure the function to transform the convoluted output to NHWC - _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U)); - _permuted_output.allocator()->allocate(); - } -} + _original_weights = weights; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); + _has_bias = biases != nullptr; + _is_nchw = input->info()->data_layout() == DataLayout::NCHW; + _permute = _is_nchw; + _is_prepared = false; + _is_activationlayer_enabled = act_info.enabled(); -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure_optimized(const ITensor *input, - const ITensor *weights, - const ITensor *biases, - ITensor *output, - const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) -{ + // Configure pipeline ActivationLayerInfo act_info_to_use = ActivationLayerInfo(); const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info); const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info); @@ -238,43 +141,6 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: { _dwc_optimized_func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info_to_use, dilation); } -} - -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(ITensor *input, - const ITensor *weights, - const ITensor *biases, - ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayerOptimizedInternal::validate(input->info(), weights->info(), (biases == nullptr) ? 
nullptr : biases->info(), - output->info(), conv_info, depth_multiplier, act_info, dilation)); - - _original_weights = weights; - _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); - _has_bias = biases != nullptr; - _is_optimized = NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input->info(), - weights->info(), - conv_info, - depth_multiplier, - dilation); - _is_nchw = input->info()->data_layout() == DataLayout::NCHW; - _permute = _is_optimized == _is_nchw; - _is_prepared = false; - _is_activationlayer_enabled = act_info.enabled(); - - // Configure appropriate pipeline - if(_is_optimized) - { - configure_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); - } - else - { - configure_generic(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); - } // Configure activation if(_is_activationlayer_enabled) @@ -295,29 +161,18 @@ Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal return validate_arguments_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); } -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run_generic() +void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run() { - // Fill border - NEScheduler::get().schedule(&_border_handler, Window::DimX); - - // Execute depthwise convolution - NEScheduler::get().schedule(&_dwc_kernel, Window::DimX); + prepare(); - // Add biases - if(_has_bias || _is_quantized) - { - NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX); - } + MemoryGroupResourceScope scope_mg(_memory_group); - // Permute output - if(!_is_nchw) + // Permute input + if(_permute) { - _permute_output.run(); + _permute_input.run(); } -} -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run_optimized() -{ // Run assembly function _dwc_optimized_func.run(); @@ -326,21 +181,6 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: { _permute_output.run(); } -} - -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run() -{ - prepare(); - - MemoryGroupResourceScope scope_mg(_memory_group); - - // Permute input - if(_permute) - { - _permute_input.run(); - } - - _is_optimized ? 
run_optimized() : run_generic();
 
     // Run activation
     if(_is_activationlayer_enabled)
@@ -362,13 +202,10 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
     }
 
     // Prepare optimized function
-    if(_is_optimized)
+    _dwc_optimized_func.prepare();
+    if(!_permuted_weights.is_used())
     {
-        _dwc_optimized_func.prepare();
-        if(!_permuted_weights.is_used())
-        {
-            _permuted_weights.allocator()->free();
-        }
+        _permuted_weights.allocator()->free();
     }
 
     _is_prepared = true;
@@ -376,8 +213,8 @@
 }
 
 NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric()
-    : _depthwise_conv_kernel(), _fill_border(), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _permuted_input(), _permuted_weights(), _permuted_output(),
-      _is_prepared(false), _is_nchw(false), _is_activationlayer_enabled(false), _original_weights(nullptr)
+    : _depthwise_conv_kernel(), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _permuted_input(), _permuted_weights(), _permuted_output(), _is_prepared(false),
+      _is_nchw(false), _is_activationlayer_enabled(false), _original_weights(nullptr)
 {
 }
 
@@ -409,8 +246,8 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(
     }
 
     _original_weights = weights_to_use;
-    _depthwise_conv_kernel.configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, dilation);
-    _fill_border.configure(input_to_use, _depthwise_conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0), input->info()->data_type(), input->info()->quantization_info()));
+    _depthwise_conv_kernel = arm_compute::support::cpp14::make_unique<NEDepthwiseConvolutionLayerNativeKernel>();
+    _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier, dilation);
 
     if(_is_nchw)
     {
@@ -476,8 +313,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::run()
         _permute_input.run();
     }
 
-    NEScheduler::get().schedule(&_fill_border, Window::DimX);
-    NEScheduler::get().schedule(&_depthwise_conv_kernel, Window::DimY);
+    NEScheduler::get().schedule(_depthwise_conv_kernel.get(), Window::DimY);
 
     if(_is_nchw)
     {
diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
index a4a3a43b2e..0c0f86c82b 100644
--- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
@@ -24,7 +24,7 @@
 
 #include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h"
+#include "src/core/NEON/kernels/NEDequantizationLayerKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
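With the generic kernel path deleted above, the optimized-internal depthwise function is always backed by `NEDepthwiseConvolutionAssemblyDispatch`, and its `run()` reduces to a fixed pipeline. Roughly (a condensed sketch of the control flow that survives the diff, not literal library code):

```cpp
void NEDepthwiseConvolutionLayerOptimizedInternal::run() // sketch
{
    prepare();                           // one-off: reshape weights, then free the permuted copy
    MemoryGroupResourceScope scope_mg(_memory_group);

    if(_permute)                         // NCHW input: the assembly kernels expect NHWC
    {
        _permute_input.run();
    }
    _dwc_optimized_func.run();           // assembly depthwise convolution
    if(_is_nchw)
    {
        _permute_output.run();           // restore the caller's layout
    }
    if(_is_activationlayer_enabled)
    {
        _activationlayer_function.run(); // separate pass when the activation could not be fused
    }
}
```

diff --git a/src/runtime/NEON/functions/NEDerivative.cpp b/src/runtime/NEON/functions/NEDerivative.cpp
index 24991400b8..f007e9fda3 100644
--- a/src/runtime/NEON/functions/NEDerivative.cpp
+++ b/src/runtime/NEON/functions/NEDerivative.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.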
 *
 * SPDX-License-Identifier: MIT
 *
@@ -24,12 +24,16 @@
 #include "arm_compute/runtime/NEON/functions/NEDerivative.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEDerivativeKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEDerivative::~NEDerivative() = default;
 
 NEDerivative::NEDerivative()
     : _kernel(), _border_handler()
@@ -41,12 +45,16 @@ void NEDerivative::configure(ITensor *input, ITensor *output_x, ITensor *output_
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
 
-    _kernel.configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
-    _border_handler.configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
+    _kernel         = arm_compute::support::cpp14::make_unique<NEDerivativeKernel>();
+    _border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+
+    _kernel->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+    _border_handler->configure(input, BorderSize(1), border_mode, PixelValue(constant_border_value));
 }
 
 void NEDerivative::run()
 {
-    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
-    NEScheduler::get().schedule(&_kernel, Window::DimY);
+    NEScheduler::get().schedule(_border_handler.get(), Window::DimZ);
+    NEScheduler::get().schedule(_kernel.get(), Window::DimY);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEDilate.cpp b/src/runtime/NEON/functions/NEDilate.cpp
index 7f503865b4..70c0b61639 100644
--- a/src/runtime/NEON/functions/NEDilate.cpp
+++ b/src/runtime/NEON/functions/NEDilate.cpp
@@ -23,8 +23,9 @@
 */
 #include "arm_compute/runtime/NEON/functions/NEDilate.h"
 
-#include "arm_compute/core/NEON/kernels/NEDilateKernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "src/core/NEON/kernels/NEDilateKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
@@ -36,5 +37,8 @@ void NEDilate::configure(ITensor *input, ITensor *output, BorderMode border_mode
     auto k = arm_compute::support::cpp14::make_unique<NEDilateKernel>();
     k->configure(input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+
+    auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler = std::move(b);
 }
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index da7e771aaf..98d6386ffe 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -27,21 +27,27 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-
-#include <cmath>
-#include <tuple>
+#include "src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
+#include "src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NEDirectConvolutionLayer::~NEDirectConvolutionLayer() = default;
+
 NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false),
-      _is_activationlayer_enabled(false), _dim_split(Window::DimZ)
+      _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required()
 {
 }
 
 void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN);
+    _output_stage_kernel  = arm_compute::support::cpp14::make_unique<NEDirectConvolutionLayerOutputStageKernel>();
+    _conv_kernel          = arm_compute::support::cpp14::make_unique<NEDirectConvolutionLayerKernel>();
+    _input_border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
 
     // Free accumulator
     if(_accumulator.buffer() != nullptr)
@@ -54,14 +60,18 @@ void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights,
     // Check if bias should be added in the convolution result
     _has_bias = (bias != nullptr);
 
-    _conv_kernel.configure(input, weights, output, conv_info);
+    _conv_kernel->configure(input, weights, output, conv_info);
     if(_has_bias)
     {
-        _output_stage_kernel.configure(output, bias);
+        _output_stage_kernel->configure(output, bias);
     }
+    _is_padding_required = !_conv_kernel->border_size().empty();
 
-    // Add zero padding XY
-    _input_border_handler.configure(input, _conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+    if(_is_padding_required)
+    {
+        // Add zero padding XY
+        _input_border_handler->configure(input, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+    }
 
     //Configure Activation Layer
     _is_activationlayer_enabled = act_info.enabled();
@@ -104,14 +114,16 @@ Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITenso
 
 void NEDirectConvolutionLayer::run()
 {
-    NEScheduler::get().schedule(&_input_border_handler, Window::DimZ);
-
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    NEScheduler::get().schedule(&_conv_kernel, _dim_split);
+    if(_is_padding_required)
+    {
+        NEScheduler::get().schedule(_input_border_handler.get(), Window::DimZ);
+    }
+    NEScheduler::get().schedule(_conv_kernel.get(), _dim_split);
     if(_has_bias)
     {
-        NEScheduler::get().schedule(&_output_stage_kernel, Window::DimY);
+        NEScheduler::get().schedule(_output_stage_kernel.get(), Window::DimY);
     }
 
     if(_is_activationlayer_enabled)
diff --git a/src/runtime/NEON/functions/NEElementwiseOperators.cpp b/src/runtime/NEON/functions/NEElementwiseOperators.cpp
index d1f60c71e1..7f3fe8b30b 100644
--- a/src/runtime/NEON/functions/NEElementwiseOperators.cpp
+++ b/src/runtime/NEON/functions/NEElementwiseOperators.cpp
@@ -23,7 +23,7 @@
 */
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
-#include <arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h>
+#include <src/core/NEON/kernels/NEElementwiseOperationKernel.h>
 
 #include "arm_compute/core/ITensor.h"
 #include "support/MemorySupport.h"
 
diff --git a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
index cb4e3a0b7d..5e130205d2 100644
--- a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
+++ b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
@@ -23,7 +23,7 @@
 */
 #include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h"
+#include "src/core/NEON/kernels/NEElementwiseUnaryKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
diff --git a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
index b3d5ad484f..d3ff171323 100644
--- a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
+++ b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -28,8 +28,15 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NECumulativeDistributionKernel.h"
+#include "src/core/NEON/kernels/NEHistogramKernel.h"
+#include "src/core/NEON/kernels/NETableLookupKernel.h"
+#include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEEqualizeHistogram::~NEEqualizeHistogram() = default;
 
 NEEqualizeHistogram::NEEqualizeHistogram()
     : _histogram_kernel(), _cd_histogram_kernel(), _map_histogram_kernel(), _hist(nr_bins, 0, max_range), _cum_dist(nr_bins, 0, max_range), _cd_lut(nr_bins, DataType::U8)
@@ -43,20 +50,25 @@ void NEEqualizeHistogram::configure(const IImage *input, IImage *output)
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
 
+    _histogram_kernel     = arm_compute::support::cpp14::make_unique<NEHistogramKernel>();
+    _cd_histogram_kernel  = arm_compute::support::cpp14::make_unique<NECumulativeDistributionKernel>();
+    _map_histogram_kernel = arm_compute::support::cpp14::make_unique<NETableLookupKernel>();
+
     // Configure kernels
-    _histogram_kernel.configure(input, &_hist);
-    _cd_histogram_kernel.configure(input, &_hist, &_cum_dist, &_cd_lut);
-    _map_histogram_kernel.configure(input, &_cd_lut, output);
+    _histogram_kernel->configure(input, &_hist);
+    _cd_histogram_kernel->configure(input, &_hist, &_cum_dist, &_cd_lut);
+    _map_histogram_kernel->configure(input, &_cd_lut, output);
 }
 
 void NEEqualizeHistogram::run()
 {
     // Calculate histogram of input.
-    NEScheduler::get().schedule(&_histogram_kernel, Window::DimY);
+    NEScheduler::get().schedule(_histogram_kernel.get(), Window::DimY);
 
     // Calculate cumulative distribution of histogram and create LUT.
-    NEScheduler::get().schedule(&_cd_histogram_kernel, Window::DimY);
+    NEScheduler::get().schedule(_cd_histogram_kernel.get(), Window::DimY);
 
     // Map input to output using created LUT.
-    NEScheduler::get().schedule(&_map_histogram_kernel, Window::DimY);
+    NEScheduler::get().schedule(_map_histogram_kernel.get(), Window::DimY);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEErode.cpp b/src/runtime/NEON/functions/NEErode.cpp
index a89993c1fe..748694fe3f 100644
--- a/src/runtime/NEON/functions/NEErode.cpp
+++ b/src/runtime/NEON/functions/NEErode.cpp
@@ -23,18 +23,23 @@
 */
 #include "arm_compute/runtime/NEON/functions/NEErode.h"
 
-#include "arm_compute/core/NEON/kernels/NEErodeKernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "src/core/NEON/kernels/NEErodeKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void NEErode::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEErodeKernel>();
     k->configure(input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+
+    auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler = std::move(b);
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEFFT1D.cpp b/src/runtime/NEON/functions/NEFFT1D.cpp
index 744a91521f..b94c25832a 100644
--- a/src/runtime/NEON/functions/NEFFT1D.cpp
+++ b/src/runtime/NEON/functions/NEFFT1D.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -25,11 +25,17 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/helpers/fft.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
+#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
+#include "src/core/NEON/kernels/NEFFTScaleKernel.h"
+#include "src/core/utils/helpers/fft.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NEFFT1D::~NEFFT1D() = default;
+
 NEFFT1D::NEFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _axis(0), _run_scale(false)
 {
@@ -58,7 +64,8 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &
     TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32);
     _digit_reverse_indices.allocator()->init(digit_reverse_indices_info);
     _memory_group.manage(&_digit_reversed_input);
-    _digit_reverse_kernel.configure(input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
+    _digit_reverse_kernel = arm_compute::support::cpp14::make_unique<NEFFTDigitReverseKernel>();
+    _digit_reverse_kernel->configure(input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
 
     // Create and configure FFT kernels
     unsigned int Nx = 1;
@@ -75,7 +82,8 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &
         fft_kernel_info.radix          = radix_for_stage;
         fft_kernel_info.Nx             = Nx;
         fft_kernel_info.is_first_stage = (i == 0);
-        _fft_kernels[i].configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
+        _fft_kernels[i] = arm_compute::support::cpp14::make_unique<NEFFTRadixStageKernel>();
+        _fft_kernels[i]->configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
 
         Nx *= radix_for_stage;
     }
@@ -86,7 +94,8 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &
         FFTScaleKernelInfo scale_config;
         scale_config.scale     = static_cast<float>(N);
         scale_config.conjugate = config.direction == FFTDirection::Inverse;
-        is_c2r ? _scale_kernel.configure(&_digit_reversed_input, output, scale_config) : _scale_kernel.configure(output, nullptr, scale_config);
+        _scale_kernel = arm_compute::support::cpp14::make_unique<NEFFTScaleKernel>();
+        is_c2r ? _scale_kernel->configure(&_digit_reversed_input, output, scale_config) : _scale_kernel->configure(output, nullptr, scale_config);
     }
 
     // Allocate tensors
@@ -128,17 +137,17 @@ void NEFFT1D::run()
 {
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    NEScheduler::get().schedule(&_digit_reverse_kernel, (_axis == 0 ? Window::DimY : Window::DimZ));
+    NEScheduler::get().schedule(_digit_reverse_kernel.get(), (_axis == 0 ? Window::DimY : Window::DimZ));
 
     for(unsigned int i = 0; i < _num_ffts; ++i)
     {
-        NEScheduler::get().schedule(&_fft_kernels[i], (_axis == 0 ? Window::DimY : Window::DimX));
+        NEScheduler::get().schedule(_fft_kernels[i].get(), (_axis == 0 ? Window::DimY : Window::DimX));
    }
 
     // Run output scaling
     if(_run_scale)
     {
-        NEScheduler::get().schedule(&_scale_kernel, Window::DimY);
+        NEScheduler::get().schedule(_scale_kernel.get(), Window::DimY);
     }
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFFT2D.cpp b/src/runtime/NEON/functions/NEFFT2D.cpp
index b63afe59c0..3b787cd523 100644
--- a/src/runtime/NEON/functions/NEFFT2D.cpp
+++ b/src/runtime/NEON/functions/NEFFT2D.cpp
@@ -26,9 +26,14 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/Scheduler.h"
+#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
+#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
+#include "src/core/NEON/kernels/NEFFTScaleKernel.h"
 
 namespace arm_compute
 {
+NEFFT2D::~NEFFT2D() = default;
+
 NEFFT2D::NEFFT2D(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor()
 {
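As background for the kernels being wrapped here: `NEFFT1D` splits a length-N transform into a digit-reverse permutation, one radix stage per factor of N, and an optional scale pass for inverse transforms. The stage loop shown above derives its radices roughly like this (a sketch; it assumes the `decompose_stages()` helper from `src/core/utils/helpers/fft.h` and the radix set reported by the kernel):

```cpp
// Sketch of the stage derivation feeding the configure() loop above.
const unsigned int N       = input->info()->tensor_shape()[config.axis];
const auto supported_radix = NEFFTRadixStageKernel::supported_radix();   // e.g. {2, 3, 4, 5, 7, 8}
const auto decomposed      = arm_compute::helpers::fft::decompose_stages(N, supported_radix);

unsigned int Nx = 1;
for(unsigned int i = 0; i < decomposed.size(); ++i)
{
    FFTRadixStageKernelInfo fft_kernel_info;
    fft_kernel_info.radix          = decomposed.at(i);
    fft_kernel_info.Nx             = Nx;       // product of radices handled by earlier stages
    fft_kernel_info.is_first_stage = (i == 0); // first stage consumes the digit-reversed input
    // ...configure one NEFFTRadixStageKernel per entry, as above...
    Nx *= fft_kernel_info.radix;
}
```

diff --git a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
index cd68788145..23788b7c39 100644
--- a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.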
 *
 * SPDX-License-Identifier: MIT
 *
@@ -26,8 +26,17 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/helpers/fft.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/NEON/kernels/NECopyKernel.h"
+#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
+#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
+#include "src/core/NEON/kernels/NEFFTScaleKernel.h"
+#include "src/core/NEON/kernels/NEPadLayerKernel.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/utils/helpers/fft.h"
+
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
@@ -93,6 +102,7 @@ NEFFTConvolutionLayer::NEFFTConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     _is_prepared(false)
 {
 }
+NEFFTConvolutionLayer::~NEFFTConvolutionLayer() = default;
 
 void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
                                       const ActivationLayerInfo &act_info)
diff --git a/src/runtime/NEON/functions/NEFastCorners.cpp b/src/runtime/NEON/functions/NEFastCorners.cpp
index 303c593f84..1bde3cc508 100644
--- a/src/runtime/NEON/functions/NEFastCorners.cpp
+++ b/src/runtime/NEON/functions/NEFastCorners.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -25,15 +25,21 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/Array.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/NEON/kernels/NEFastCornersKernel.h"
+#include "src/core/NEON/kernels/NEFillArrayKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
+#include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEFastCorners::~NEFastCorners() = default;
 
 NEFastCorners::NEFastCorners(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
@@ -62,24 +68,28 @@ void NEFastCorners::configure(IImage *input, float threshold, bool nonmax_suppre
     _output.allocator()->init(tensor_info);
     _memory_group.manage(&_output);
 
+    _fast_corners_kernel = arm_compute::support::cpp14::make_unique<NEFastCornersKernel>();
+    _border_handler      = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    _fill_kernel         = arm_compute::support::cpp14::make_unique<NEFillArrayKernel>();
     // If border is UNDEFINED _fast_corners_kernel will operate in xwindow (3,
     // width - 3) and ywindow (3, height -3) so the output image will leave the
     // pixels on the borders unchanged. This is reflected in the valid region
     // of the output. The non maxima suppression is only run on the valid
     // pixels.
-    _fast_corners_kernel.configure(input, &_output, threshold, nonmax_suppression, BorderMode::UNDEFINED == border_mode);
-    _border_handler.configure(input, _fast_corners_kernel.border_size(), border_mode, constant_border_value);
+    _fast_corners_kernel->configure(input, &_output, threshold, nonmax_suppression, BorderMode::UNDEFINED == border_mode);
+    _border_handler->configure(input, _fast_corners_kernel->border_size(), border_mode, constant_border_value);
 
     if(!_non_max)
     {
-        _fill_kernel.configure(&_output, 1 /* we keep all texels >0 */, corners);
+        _fill_kernel->configure(&_output, 1 /* we keep all texels >0 */, corners);
     }
     else
     {
         _suppressed.allocator()->init(tensor_info);
         _memory_group.manage(&_suppressed);
-        _nonmax_kernel.configure(&_output, &_suppressed, BorderMode::UNDEFINED == border_mode);
-        _fill_kernel.configure(&_suppressed, 1 /* we keep all texels >0 */, corners);
+        _nonmax_kernel = arm_compute::support::cpp14::make_unique<NENonMaximaSuppression3x3Kernel>();
+        _nonmax_kernel->configure(&_output, &_suppressed, BorderMode::UNDEFINED == border_mode);
+        _fill_kernel->configure(&_suppressed, 1 /* we keep all texels >0 */, corners);
 
         // Allocate intermediate tensors
         _suppressed.allocator()->allocate();
@@ -91,16 +101,17 @@ void NEFastCorners::configure(IImage *input, float threshold, bool nonmax_suppre
 
 void NEFastCorners::run()
 {
-    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+    NEScheduler::get().schedule(_border_handler.get(), Window::DimZ);
 
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    NEScheduler::get().schedule(&_fast_corners_kernel, Window::DimY);
+    NEScheduler::get().schedule(_fast_corners_kernel.get(), Window::DimY);
 
     if(_non_max)
     {
-        NEScheduler::get().schedule(&_nonmax_kernel, Window::DimY);
+        NEScheduler::get().schedule(_nonmax_kernel.get(), Window::DimY);
     }
 
-    NEScheduler::get().schedule(&_fill_kernel, Window::DimY);
+    NEScheduler::get().schedule(_fill_kernel.get(), Window::DimY);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFill.cpp b/src/runtime/NEON/functions/NEFill.cpp
index 79fe175e69..68292c9ee0 100644
--- a/src/runtime/NEON/functions/NEFill.cpp
+++ b/src/runtime/NEON/functions/NEFill.cpp
@@ -25,6 +25,7 @@
 #include "arm_compute/core/Window.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEMemsetKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
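`NEFillBorder`, whose diff follows, is the smallest instance of the pattern: after this patch the function is just a `std::unique_ptr<NEFillBorderKernel>` plus a `run()` that schedules it over `Window::DimZ`. A hypothetical usage sketch (tensor setup elided, values illustrative):

```cpp
// Pad a tensor's border with a constant before running a kernel that reads
// out-of-bounds neighbours. Illustrative values; not from the patch.
Tensor       src;  // assume init() and allocate() have been done elsewhere
NEFillBorder fill_border;

fill_border.configure(&src, /*border_width=*/1, BorderMode::CONSTANT, PixelValue(0.f));
fill_border.run(); // internally: NEScheduler::get().schedule(_border_handler.get(), Window::DimZ)
```

diff --git a/src/runtime/NEON/functions/NEFillBorder.cpp b/src/runtime/NEON/functions/NEFillBorder.cpp
index de2ef26b80..e96069f97c 100644
--- a/src/runtime/NEON/functions/NEFillBorder.cpp
+++ b/src/runtime/NEON/functions/NEFillBorder.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.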
 *
 * SPDX-License-Identifier: MIT
 *
@@ -25,16 +25,19 @@
 #include "arm_compute/core/Window.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
 void NEFillBorder::configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
 {
-    _border_handler.configure(input, BorderSize(border_width), border_mode, constant_border_value);
+    _border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    _border_handler->configure(input, BorderSize(border_width), border_mode, constant_border_value);
 }
 
 void NEFillBorder::run()
 {
-    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+    NEScheduler::get().schedule(_border_handler.get(), Window::DimZ);
 }
 } // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp
index 936a70dacc..4dfe96325e 100644
--- a/src/runtime/NEON/functions/NEFlattenLayer.cpp
+++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp
@@ -23,8 +23,8 @@
 */
 #include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h"
 #include "arm_compute/core/Size2D.h"
+#include "src/core/NEON/kernels/NEFlattenLayerKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFloor.cpp b/src/runtime/NEON/functions/NEFloor.cpp
index 95b2497ded..5f6bd61017 100644
--- a/src/runtime/NEON/functions/NEFloor.cpp
+++ b/src/runtime/NEON/functions/NEFloor.cpp
@@ -23,7 +23,7 @@
 */
 #include "arm_compute/runtime/NEON/functions/NEFloor.h"
 
-#include "arm_compute/core/NEON/kernels/NEFloorKernel.h"
+#include "src/core/NEON/kernels/NEFloorKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 4dcf41e360..714fa58a66 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -29,6 +29,21 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h"
+#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
+#include "src/core/NEON/kernels/NEFlattenLayerKernel.h"
+#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "src/core/NEON/kernels/NETransposeKernel.h"
+
+#include "support/MemorySupport.h"
 
 #include <algorithm>
 #include <cmath>
@@ -143,6 +158,8 @@ Status NEFullyConnectedLayerReshapeWeights::validate(const ITensorInfo *input, c
     return NETransposeKernel::validate(input, output);
 }
 
+NEFullyConnectedLayer::~NEFullyConnectedLayer() = default;
+
 NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
     : _memory_group(std::move(memory_manager)), _weights_manager(weights_manager), _flatten_kernel(), _convert_weights(), _convert_weights_managed(), _reshape_weights_function(),
       _reshape_weights_managed_function(), _mm_gemm(nullptr, weights_manager), _mm_gemmlowp(nullptr, weights_manager), _flatten_output(), _converted_weights_output(), _reshape_weights_output(),
@@ -197,7 +214,9 @@ void NEFullyConnectedLayer::configure_conv_fc(const ITensor *input, const ITenso
 
     // Configure flatten kernel
     _memory_group.manage(&_flatten_output);
-    _flatten_kernel.configure(input, &_flatten_output);
+
+    _flatten_kernel = arm_compute::support::cpp14::make_unique<NEFlattenLayerKernel>();
+    _flatten_kernel->configure(input, &_flatten_output);
 
     // Configure matrix multiply kernel
     configure_mm(&_flatten_output, weights, biases, output, act);
@@ -396,7 +415,7 @@ void NEFullyConnectedLayer::run()
     // Linearize input if it comes from a convolutional layer
     if(_is_fc_after_conv)
     {
-        NEScheduler::get().schedule(&_flatten_kernel, Window::DimY);
+        NEScheduler::get().schedule(_flatten_kernel.get(), Window::DimY);
     }
 
     // Run matrix multiply
diff --git a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
index fd26bb49a7..c64fde050e 100644
--- a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
+++ b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -28,9 +28,13 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NEFuseBatchNormalization::~NEFuseBatchNormalization() = default;
+
 NEFuseBatchNormalization::NEFuseBatchNormalization()
     : _fuse_bn_kernel()
 {
@@ -41,7 +45,8 @@ void NEFuseBatchNormalization::configure(const ITensor *input_weights, const ITe
                                          const ITensor *input_bias, const ITensor *bn_beta, const ITensor *bn_gamma,
                                          float epsilon, FuseBatchNormalizationType fbn_type)
 {
-    _fuse_bn_kernel.configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+    _fuse_bn_kernel = arm_compute::support::cpp14::make_unique<NEFuseBatchNormalizationKernel>();
+    _fuse_bn_kernel->configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
 }
 
 Status NEFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
@@ -54,6 +59,6 @@ Status NEFuseBatchNormalization::validate(const ITensorInfo *input_weights, cons
 void NEFuseBatchNormalization::run()
 {
-    NEScheduler::get().schedule(&_fuse_bn_kernel, Window::DimY);
+    NEScheduler::get().schedule(_fuse_bn_kernel.get(), Window::DimY);
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 3b8ca44ed7..9f52e458d2 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -23,7 +23,6 @@
 */
 #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -34,6 +33,13 @@
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "support/MemorySupport.h"
 
 #include <cmath>
 
@@ -41,6 +47,20 @@ using namespace arm_compute::misc::shape_calculator;
 
 namespace arm_compute
 {
+namespace
+{
+AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
+{
+    AsmGemmInfo asm_info;
+    asm_info.method                  = AsmConvMethod::Im2Col;
+    asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
+    asm_info.depth_output_gemm3d     = info.depth_output_gemm3d();
+    asm_info.activation_info         = info.activation_info();
+
+    return asm_info;
+}
+} // namespace
+
 NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
     : _memory_group(memory_manager), _weights_manager(weights_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(memory_manager, weights_manager), _ma_kernel(),
       _alpha_scale_func(nullptr), _add_bias(), _activation_func(), _tmp_a(), _tmp_b(), _tmp_d(), _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_alpha_scale(false),
@@ -48,12 +68,15 @@
 {
 }
 
+NEGEMM::~NEGEMM() = default;
+
 void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_ERROR_THROW_ON(NEGEMM::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info));
 
-    const bool is_c_bias = gemm_info.reshape_b_only_on_first_run();
-    bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a->info(), b->info(), (is_c_bias && c != nullptr) ? c->info() : nullptr, d->info(), gemm_info));
+    const AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
+    const bool        is_c_bias = gemm_info.reshape_b_only_on_first_run();
+    bool              run_optimised = bool(NEGEMMAssemblyDispatch::validate(a->info(), b->info(), (is_c_bias && c != nullptr) ? c->info() : nullptr, d->info(), asm_info));
 
     // Check if we need to reshape the matrix B only on the first run
     _is_prepared = false;
@@ -68,7 +91,7 @@
     if(run_optimised)
     {
         const ITensor *c_to_use = is_c_bias ? c : nullptr;
-        _asm_glue.configure(a, b, c_to_use, d, gemm_info);
+        _asm_glue.configure(a, b, c_to_use, d, asm_info);
         ARM_COMPUTE_ERROR_ON(!_asm_glue.is_configured());
 
         // Scale product by alpha
@@ -87,11 +110,13 @@
             _memory_group.manage(&_tmp_d);
         }
 
+        _mm_kernel = arm_compute::support::cpp14::make_unique<NEGEMMMatrixMultiplyKernel>();
+
         // Select between GEMV and GEMM
         if(_run_vector_matrix_multiplication)
         {
             // Configure the matrix multiply kernel
-            _mm_kernel.configure(a, b, gemm_output_to_use, alpha, false);
+            _mm_kernel->configure(a, b, gemm_output_to_use, alpha, false);
         }
         else
         {
@@ -123,13 +148,15 @@
             int k = a->info()->dimension(0);
 
             // Configure interleave kernel
-            _interleave_kernel.configure(a, &_tmp_a);
+            _interleave_kernel = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+            _interleave_kernel->configure(a, &_tmp_a);
 
             // Configure transpose kernel
-            _transpose_kernel.configure(b, &_tmp_b);
+            _transpose_kernel = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+            _transpose_kernel->configure(b, &_tmp_b);
 
             // Configure matrix multiplication kernel
-            _mm_kernel.configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k));
+            _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k));
 
             // Allocate once the all configure methods have been called
             _tmp_a.allocator()->allocate();
@@ -149,7 +176,8 @@
     // Configure matrix addition kernel
     if(_run_addition)
     {
-        _ma_kernel.configure(c, d, beta);
+        _ma_kernel = arm_compute::support::cpp14::make_unique<NEGEMMMatrixAdditionKernel>();
+        _ma_kernel->configure(c, d, beta);
     }
 
     // Configure activation
@@ -208,7 +236,8 @@ Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
     }
 
     // Check if we need to run the optimized assembly kernel
-    const bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, output, gemm_info));
+    AsmGemmInfo asm_info      = init_assembly_metadata(gemm_info);
+    const bool  run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, output, asm_info));
 
     if(!run_optimised)
     {
@@ -297,16 +326,16 @@ void NEGEMM::run()
         if(!_run_vector_matrix_multiplication)
         {
             // Run interleave kernel
-            NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
+            NEScheduler::get().schedule(_interleave_kernel.get(), Window::DimY);
 
             if(!_reshape_b_only_on_first_run)
             {
                 // Run transpose kernel
-                NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+                NEScheduler::get().schedule(_transpose_kernel.get(), Window::DimY);
             }
         }
 
-        NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
+        NEScheduler::get().schedule(_mm_kernel.get(), _run_vector_matrix_multiplication ?
Window::DimX : Window::DimY); // Run bias addition kernel if(_run_bias_addition) @@ -318,7 +347,7 @@ void NEGEMM::run() // Run matrix addition kernel if(_run_addition) { - NEScheduler::get().schedule(&_ma_kernel, Window::DimY); + NEScheduler::get().schedule(_ma_kernel.get(), Window::DimY); } // Run activation function @@ -354,7 +383,7 @@ void NEGEMM::prepare() } _tmp_b.allocator()->allocate(); - NEScheduler::get().schedule(&_transpose_kernel, Window::DimY); + NEScheduler::get().schedule(_transpose_kernel.get(), Window::DimY); if(!original_b_managed_by_weights_manager) { _original_b->mark_as_unused(); diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp index 3b9dde2bf7..f6739ee925 100644 --- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp +++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp @@ -23,20 +23,72 @@ */ #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h" -#include "src/core/NEON/kernels/assembly/arm_gemm.hpp" - -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h" - +#include "src/core/CPP/Validate.h" #include "src/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h" +#include "src/core/NEON/kernels/assembly/arm_gemm.hpp" + +#include "support/MemorySupport.h" #include +#include namespace arm_compute { namespace { +struct free_delete +{ + void operator()(void *x) + { + free(x); + } +}; + +struct Params +{ + unsigned int M; + unsigned int N; + unsigned int K; + unsigned int batches; + unsigned int multis; + unsigned int sections; + bool indirect; +}; + +Params extract_parameters(const ITensor *a, const ITensor *b, const ITensor *d, const AsmGemmInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); + + Params p; + p.M = d->info()->tensor_shape().y(); + p.K = a->info()->tensor_shape().x(); + p.N = d->info()->tensor_shape().x(); + p.multis = 1; + p.indirect = false; + p.sections = 1; + + if(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect) + { + p.indirect = true; + p.sections = b->info()->tensor_shape()[2] * b->info()->tensor_shape()[3]; + } + else + { + p.multis = b->info()->tensor_shape().z(); + p.batches = d->info()->tensor_shape().total_size_upper(2) / p.multis; //COMPMID-1423: Agree on and document the layout of gemm inputs/outputs + } + + // Update M in case of GEMM3D for output + if(info.depth_output_gemm3d != 0) + { + p.M = d->info()->tensor_shape().y() * d->info()->tensor_shape().z(); + p.batches = d->info()->tensor_shape().total_size_upper(3) / p.multis; + } + + return p; +} + arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act) { arm_gemm::Activation gemm_act; @@ -69,6 +121,29 @@ arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act) return gemm_act; } +IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataType data_type) +{ + // Schedule assembly kernel + const int granule_threshold = 200; + IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX); + if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32) + { + scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold); + } + else if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 || data_type == DataType::S8)) + { + 
//GEMM_INTERLEAVED supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions + scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); + } + else if(method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED)) + { + //special case for QASYMM8 to support 2D parallelism, scheduler here may be tweaked differently compared to FP32 case + scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); + } + + return scheduling_hint; +} + template class FallbackTransform : public ITransformWeights { @@ -165,7 +240,7 @@ class Fallback : public NEGEMMAssemblyDispatch::IFallback * @param[in] os Output stage meta-data. */ void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, - arm_gemm::GemmArgs args, const GEMMInfo &gemm_info, + arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info, MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os = {}); /** Set requantization shifts to be used @@ -182,8 +257,8 @@ class Fallback : public NEGEMMAssemblyDispatch::IFallback * * @return A tuple with the pointers to the shift and multiplier data respectively */ - std::tuple set_requantize_data(const std::vector &shifts, - const std::vector &multipliers); + std::tuple set_requantize_data(const std::vector &shifts, + const std::vector &multipliers); // Inherited methods overridden: void run() override; @@ -198,6 +273,16 @@ class Fallback : public NEGEMMAssemblyDispatch::IFallback * @param[in] alignment Workspace memory alignment. */ void allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment); + /** Configure the indirect buffer + * + * @param[in] a Input tensor containing the Matrix A. + * @param[in] b Input tensor containing the Matrix B. + * @param[out] d Output tensor to store the result of matrix multiplication. 
+ * @param[in] info GEMM meta-data + */ + void configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info); + /** Prepare the indirect buffer */ + void prepare_indirect_buffer(); /** Assembly Gemm kernel */ std::shared_ptr> _gemm_kernel_asm{ nullptr }; @@ -226,7 +311,7 @@ class Fallback : public NEGEMMAssemblyDispatch::IFallback /** Prepared flag */ bool _is_prepared{ false }; /** GEMM meta-data */ - GEMMInfo _gemm_info{}; + AsmGemmInfo _gemm_info{}; /** Weights manager */ IWeightsManager *_weights_manager{ nullptr }; /** Weights transform object */ @@ -235,23 +320,153 @@ class Fallback : public NEGEMMAssemblyDispatch::IFallback arm_gemm::KernelDescription _kernel_info{}; /** Per channel quantization shifts */ std::vector _shifts{}; + std::vector right_shifts{}; + std::vector left_shifts{}; /** Per channel quantization multipliers */ std::vector _multipliers{}; + /** Indirect buffer */ + std::unique_ptr _indirect_arg{}; + std::unique_ptr _indirect_buf{}; + std::vector _indirect_pad{}; + arm_gemm::ConvolutionParameters _cp{}; }; template -std::tuple Fallback::set_requantize_data(const std::vector &shifts, - const std::vector &multipliers) +std::tuple +Fallback::set_requantize_data(const std::vector &shifts, const std::vector &multipliers) +{ + _multipliers = multipliers; + _shifts = shifts; + bool need_left = false; + for(const auto s : _shifts) + { + left_shifts.push_back(std::max(-s, int32_t(0))); + right_shifts.push_back(std::min(-s, int32_t(0))); + if(s < 0 && !need_left) + { + need_left = true; + } + } + return std::make_tuple(need_left, left_shifts.data(), right_shifts.data(), _multipliers.data()); +} + +template +void Fallback::prepare_indirect_buffer() +{ + const TypeInput *A_ptr = reinterpret_cast(_a->buffer()); + const int multis = 1; + const int batches = _a->info()->tensor_shape().total_size_upper(3); + const size_t stride_A = _a->info()->strides_in_bytes().y() / sizeof(TypeInput); + const size_t batch_stride_A = _a->info()->strides_in_bytes()[3] / sizeof(TypeInput); + const size_t multi_stride_A = _a->info()->strides_in_bytes()[4] / sizeof(TypeInput); + + const size_t output_hw = _cp.output_height * _cp.output_width; + const int batch_size = _cp.kernel_height * _cp.kernel_width * output_hw * sizeof(TypeInput); + const size_t batch_stride = batch_size / sizeof(TypeInput); + const int multi_size = batch_size * batches; + const size_t multi_stride = multi_size / sizeof(TypeInput); + + for(int64_t m = 0; m < multis; m++) + { + for(int64_t b = 0; b < batches; b++) + { + for(int64_t output_y = 0; output_y < _cp.output_height; output_y++) + { + for(int64_t output_x = 0; output_x < _cp.output_width; output_x++) + { + int64_t output_xy = (output_y * _cp.output_width) + output_x; + + for(int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++) + { + for(int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++) + { + int64_t input_x = (output_x * _cp.output_stride_w) + kernel_x - _cp.padding_left; + int64_t input_y = (output_y * _cp.output_stride_h) + kernel_y - _cp.padding_top; + int64_t kernel_xy = (kernel_y * _cp.kernel_width) + kernel_x; + int64_t input_xy = (input_y * _cp.input_width) + input_x; + + if(input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height) + { + _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = _indirect_pad.data(); + } + else + { + _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = + 
A_ptr + (m * multi_stride_A + b * batch_stride_A + input_xy * stride_A);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+template <typename TypeInput, typename TypeOutput, class OutputStage>
+void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info)
 {
-    _multipliers = multipliers;
-    _shifts      = shifts;
-    std::transform(_shifts.begin(), _shifts.end(), _shifts.begin(), std::negate<int32_t>());
-    return std::make_tuple(_shifts.data(), _multipliers.data());
+    ARM_COMPUTE_ERROR_ON(!(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect));
+
+    float zeropad = 0.f;
+    if(is_data_type_quantized(a->data_type()))
+    {
+        zeropad = a->quantization_info().uniform().offset;
+    }
+
+    const int64_t input_width    = static_cast<int64_t>(a->tensor_shape()[1]);
+    const int64_t input_height   = static_cast<int64_t>(a->tensor_shape()[2]);
+    const int64_t input_channels = static_cast<int64_t>(a->tensor_shape()[0]);
+    const int64_t kernel_width   = static_cast<int64_t>(b->tensor_shape()[2]);
+    const int64_t kernel_height  = static_cast<int64_t>(b->tensor_shape()[3]);
+    const int64_t output_width   = static_cast<int64_t>(d->tensor_shape()[1]);
+    const int64_t output_height  = static_cast<int64_t>(d->tensor_shape()[2]);
+
+    _cp = { input_width, input_height, input_channels, kernel_width, kernel_height, output_width, output_height,
+            info.ps_info.stride().first, info.ps_info.stride().second, info.padding_top, info.padding_left, zeropad
+          };
+
+    if(info.method == AsmConvMethod::Conv)
+    {
+        _gemm_kernel_asm->set_convolution_parameters(_cp);
+    }
+
+    if(info.method == AsmConvMethod::Indirect)
+    {
+        const unsigned int multis    = 1;
+        const unsigned int batches   = a->tensor_shape().total_size_upper(3);
+        const unsigned int kernel_hw = _cp.kernel_width * _cp.kernel_height;
+        const unsigned int output_hw = _cp.output_width * _cp.output_height;
+
+        using TypeInputPtr        = TypeInput *;
+        const int    batch_size   = kernel_hw * output_hw * sizeof(TypeInputPtr);
+        const size_t batch_stride = batch_size / sizeof(TypeInputPtr);
+        const int    multi_size   = batch_size * batches;
+        const size_t multi_stride = multi_size / sizeof(TypeInputPtr);
+
+        _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(reinterpret_cast<const TypeInput **>(malloc(multi_size * multis)));
+        _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches)));
+        _indirect_pad = std::vector<TypeInput>(_cp.input_channels, zeropad);
+
+        // Set indirect argument
+        int64_t pos = 0;
+        for(int64_t m = 0; m < multis; m++)
+        {
+            for(int64_t b = 0; b < batches; b++)
+            {
+                for(int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++)
+                {
+                    (_indirect_arg.get())[pos++] = _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw;
+                }
+            }
+        }
+
+        _gemm_kernel_asm->set_indirect_parameters(a->tensor_shape()[0], _indirect_arg.get());
+    }
 }

 template <typename TypeInput, typename TypeOutput, class OutputStage>
 void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d,
-                                                             arm_gemm::GemmArgs args, const GEMMInfo &gemm_info,
+                                                             arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
                                                              MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os)
 {
     arm_gemm::GemmConfig gemm_cfg;
@@ -314,6 +529,12 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensor *a, c
             static_cast<Tensor *>(_pretranspose)->allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
         }
     }
+
+    // Handle indirect GEMM convolution
+    if(gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect)
+    {
+        configure_indirect(a->info(), b->info(), d->info(), gemm_info);
+    }
 }

 template <typename TypeInput, typename TypeOutput, class OutputStage>
@@ -354,6
+575,11 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare()
         }
     }

+        if(_gemm_info.method == AsmConvMethod::Indirect)
+        {
+            prepare_indirect_buffer();
+        }
+
         _is_prepared = true;
     }
 }
@@ -376,23 +602,23 @@ bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured() const
 template <typename TypeInput, typename TypeOutput, class OutputStage>
 void Fallback<TypeInput, TypeOutput, OutputStage>::run()
 {
-    const int lda = _a->info()->strides_in_bytes().y() / sizeof(TypeInput);
+    int       lda = _a->info()->strides_in_bytes().y() / sizeof(TypeInput);
     int ldb = 0;
     const int ldd = _d->info()->strides_in_bytes().y() / sizeof(TypeOutput);

-    const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d() != 0 ? 3 : 2;
+    const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d != 0 ? 3 : 2;
     const size_t a_multi_idx = a_batch_idx + 1;
-    const size_t d_batch_idx = _gemm_info.depth_output_gemm3d() != 0 ? 3 : 2;
+    const size_t d_batch_idx = _gemm_info.depth_output_gemm3d != 0 ? 3 : 2;
     const size_t d_multi_idx = d_batch_idx + 1;

-    const int batch_stride_a = _a->info()->strides_in_bytes()[a_batch_idx] / sizeof(TypeInput);
+    int       batch_stride_a = _a->info()->strides_in_bytes()[a_batch_idx] / sizeof(TypeInput);
     const int batch_stride_d = _d->info()->strides_in_bytes()[d_batch_idx] / sizeof(TypeOutput);

-    const int multi_stride_a = _a->info()->strides_in_bytes()[a_multi_idx] / sizeof(TypeInput);
+    int       multi_stride_a = _a->info()->strides_in_bytes()[a_multi_idx] / sizeof(TypeInput);
     int multi_stride_b = 0;
     const int multi_stride_d = _d->info()->strides_in_bytes()[d_multi_idx] / sizeof(TypeOutput);

-    const auto in0_ptr = reinterpret_cast<const TypeInput *>(_a->buffer() + _a->info()->offset_first_element_in_bytes());
+    auto             in0_ptr = reinterpret_cast<const TypeInput *>(_a->buffer() + _a->info()->offset_first_element_in_bytes());
     const TypeInput *in1_ptr = nullptr;
     auto             out_ptr = reinterpret_cast<TypeOutput *>(_d->buffer() + _d->info()->offset_first_element_in_bytes());

@@ -404,93 +630,92 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run()
         in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
     }

+    const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.method, _d->info()->data_type());
+
     // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
     if(_workspace.buffer() != nullptr)
     {
         _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(_workspace.buffer()));
+        const unsigned int split_dim   = scheduling_hint.split_dimension();
         const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
         unsigned int       num_threads = NEScheduler::get().num_threads();
         if(window_size < num_threads)
         {
             num_threads = window_size;
-            _gemm_kernel_asm->set_nthreads(num_threads);
         }
+        if(split_dim != IScheduler::split_dimensions_all)
+        {
+            // Make sure the kernel does not expect more threads than we can actually spawn
+            const unsigned int num_iterations = _optimised_kernel.get()->window().num_iterations(split_dim);
+            num_threads                       = std::min(num_iterations, num_threads);
+        }
+        _gemm_kernel_asm->set_nthreads(num_threads);
     }

     // Prepare assembly kernel
     prepare();

-    TypeOutput *bias = nullptr;
     // Setup up matrix bias in the assembly kernel, it's just a pointer to matrix C.
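// ---[ editorial sketch: not part of the patch ]------------------------------
// How the reworked set_requantize_data() earlier in this file splits the
// per-channel result shifts: arm_gemm's Requantize32 takes separate
// left-shift (upscale) and right-shift arrays, and the left-shift pointer is
// only handed over when at least one shift is negative. A minimal standalone
// rendering of that logic, with illustrative names:
#include <algorithm>
#include <cstdint>
#include <tuple>
#include <vector>

std::tuple<bool, std::vector<int32_t>, std::vector<int32_t>>
split_result_shifts(const std::vector<int32_t> &shifts)
{
    std::vector<int32_t> left_shifts, right_shifts;
    bool                 need_left = false;
    for(const int32_t s : shifts)
    {
        left_shifts.push_back(std::max(-s, int32_t(0)));  // s < 0 -> positive left shift
        right_shifts.push_back(std::min(-s, int32_t(0))); // s > 0 -> negative right shift
        need_left = need_left || (s < 0);
    }
    return std::make_tuple(need_left, left_shifts, right_shifts);
}
// e.g. shifts {2, -1} -> need_left = true, left = {0, 1}, right = {-2, 0}
// -----------------------------------------------------------------------------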
+ TypeOutput *bias = nullptr; if(_c && _c->info()->data_type() != DataType::S32) { bias = reinterpret_cast(_c->buffer() + _c->info()->offset_first_element_in_bytes()); } + + if(_gemm_info.method == AsmConvMethod::Indirect) + { + in0_ptr = nullptr; + lda = 0; + batch_stride_a = 0; + multi_stride_a = 0; + } + // Set gemm parameters _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr, ldd, batch_stride_d, multi_stride_d, bias, 0); - // Schedule assembly kernel - IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX); - if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && _d->info()->data_type() == DataType::F32) - { - const int granule_threshold = 200; - scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold); - } - else if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && (_d->info()->data_type() == DataType::F32 || _d->info()->data_type() == DataType::F16 - || _d->info()->data_type() == DataType::U8 || _d->info()->data_type() == DataType::S8)) - { - //GEMM_INTERLEAVED supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions - const int granule_threshold = 200; - scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); - } - else if(_kernel_info.method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && (_d->info()->data_type() == DataType::QASYMM8 || _d->info()->data_type() == DataType::QASYMM8_SIGNED)) - { - //special case for QASYMM8 to support 2D parallelism, scheduler here may be tweaked differently compared to FP32 case - const int granule_threshold = 200; - scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); - } - + // Schedule NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint); } template void create_arm_gemm(std::unique_ptr &arm_gemm, MemoryGroup &memory_group, - const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const GEMMInfo &gemm_info, + const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const AsmGemmInfo &info, IWeightsManager *weights_manager) { - INEGEMMWrapperKernel::Params p = INEGEMMWrapperKernel::extract_parameters(a, b, d, gemm_info); - const CPUInfo &ci = NEScheduler::get().cpu_info(); - unsigned int num_threads = NEScheduler::get().num_threads(); + Params p = extract_parameters(a, b, d, info); + const CPUInfo &ci = NEScheduler::get().cpu_info(); + unsigned int num_threads = NEScheduler::get().num_threads(); - arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, activation, num_threads); + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads); // Create arm_gemm fallback auto fallback = support::cpp14::make_unique>(); - fallback->configure(a, b, c, d, args, gemm_info, memory_group, weights_manager); + fallback->configure(a, b, c, d, args, info, memory_group, weights_manager); arm_gemm = std::move(fallback); } template void create_arm_gemm_quant(std::unique_ptr &arm_gemm, MemoryGroup &memory_group, - const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const GEMMInfo &gemm_info, + const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const AsmGemmInfo &info, 
IWeightsManager *weights_manager) { ARM_COMPUTE_UNUSED(activation); - INEGEMMWrapperKernel::Params p = INEGEMMWrapperKernel::extract_parameters(a, b, d, gemm_info); - const CPUInfo &ci = NEScheduler::get().cpu_info(); - unsigned int num_threads = NEScheduler::get().num_threads(); + Params p = extract_parameters(a, b, d, info); + const CPUInfo &ci = NEScheduler::get().cpu_info(); + unsigned int num_threads = NEScheduler::get().num_threads(); - arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, activation, num_threads); + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads); // Create arm_gemm fallback auto fallback = support::cpp14::make_unique>(); // Configure requantization info - const int32_t a_offset = -a->info()->quantization_info().uniform().offset; - const int32_t b_offset = -b->info()->quantization_info().uniform().offset; - const GEMMLowpOutputStageInfo os_info = gemm_info.gemmlowp_output_stage(); + const int32_t negation = info.negated_offsets ? 1 : -1; + const int32_t a_offset = -a->info()->quantization_info().uniform().offset * negation; + const int32_t b_offset = -b->info()->quantization_info().uniform().offset * negation; + const GEMMLowpOutputStageInfo os_info = info.output_stage; arm_gemm::Requantize32 gemm_requant_info{}; if(os_info.gemmlowp_shifts.size() > 1) @@ -498,7 +723,9 @@ void create_arm_gemm_quant(std::unique_ptr &a const auto requantize_data = fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers); gemm_requant_info = arm_gemm::Requantize32(nullptr, 0, a_offset, b_offset, os_info.gemmlowp_offset, - std::get<0>(requantize_data), std::get<1>(requantize_data), + (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr, + std::get<2>(requantize_data), + std::get<3>(requantize_data), os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound); } else @@ -510,7 +737,7 @@ void create_arm_gemm_quant(std::unique_ptr &a } // Configure fallback - fallback->configure(a, b, c, d, args, gemm_info, memory_group, weights_manager, gemm_requant_info); + fallback->configure(a, b, c, d, args, info, memory_group, weights_manager, gemm_requant_info); arm_gemm = std::move(fallback); } @@ -521,14 +748,13 @@ NEGEMMAssemblyDispatch::NEGEMMAssemblyDispatch(std::shared_ptr m { } -Status NEGEMMAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const GEMMInfo &gemm_info) +Status NEGEMMAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info) { - ARM_COMPUTE_UNUSED(c); + ARM_COMPUTE_UNUSED(c, info); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a); ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a); - ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.pretranpose_B()); #ifndef __aarch64__ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->element_size() == 1, "8bit integer types only supported for aarch64"); #endif /* __aarch64__ */ @@ -559,13 +785,13 @@ bool NEGEMMAssemblyDispatch::is_activation_supported(const ActivationLayerInfo & return act.type != arm_gemm::Activation::Type::None; } -void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const GEMMInfo &gemm_info) +void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const AsmGemmInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); - arm_gemm::Activation act = 
map_to_arm_gemm_activation(gemm_info.activation_info()); + arm_gemm::Activation act = map_to_arm_gemm_activation(info.activation_info); //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured() - if(!NEGEMMAssemblyDispatch::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, d->info(), gemm_info)) + if(!NEGEMMAssemblyDispatch::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, d->info(), info)) { return; } @@ -573,40 +799,40 @@ void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const switch(a->info()->data_type()) { case DataType::F32: - create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); + create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); break; #ifdef __aarch64__ case DataType::U8: case DataType::QASYMM8: if(d->info()->data_type() == DataType::S32) { - create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); + create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); } else { - create_arm_gemm_quant(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); + create_arm_gemm_quant(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); } break; case DataType::S8: case DataType::QASYMM8_SIGNED: if(d->info()->data_type() == DataType::S32) { - create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); + create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); } else { - create_arm_gemm_quant(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); + create_arm_gemm_quant(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); } break; #endif /* __aarch64__ */ #if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) case DataType::BFLOAT16: - create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); + create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); break; #endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */ #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager); + create_arm_gemm(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager); break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ default: diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp new file mode 100644 index 0000000000..642b084fb4 --- /dev/null +++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include +namespace arm_compute +{ +namespace +{ +GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const ActivationLayerInfo &act) +{ + // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo iqinfo = input->quantization_info(); + const QuantizationInfo wqinfo = weights->quantization_info(); + const QuantizationInfo oqinfo = (output->total_size() == 0) ? iqinfo : output->quantization_info(); + const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); + const DataType data_type = input->data_type(); + // Merge activation with output stage + const std::set supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, + ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU + }; + PixelValue type_min{}; + PixelValue type_max{}; + std::tie(type_min, type_max) = get_min_max(data_type); + int32_t min_activation = type_min.get(); + int32_t max_activation = type_max.get(); + if(supported_acts.count(act.activation()) != 0) + { + std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act, data_type, uoqinfo); + } + GEMMLowpOutputStageInfo os_info; + os_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + os_info.gemmlowp_offset = uoqinfo.offset; + os_info.gemmlowp_min_bound = min_activation; + os_info.gemmlowp_max_bound = max_activation; + os_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL); + quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, os_info); + return os_info; +} +AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect) +{ + AsmGemmInfo asm_info; + asm_info.method = is_indirect ? 
AsmConvMethod::Indirect : AsmConvMethod::Conv; + asm_info.ps_info = info.conv_info; + asm_info.activation_info = info.act_info; + asm_info.depth_output_gemm3d = true; + asm_info.reinterpret_input_as_3d = true; + asm_info.padding_top = info.conv_info.pad_top(); + asm_info.padding_left = info.conv_info.pad_left(); + asm_info.padding_value = 0.f; + asm_info.negated_offsets = false; + return asm_info; +} +} // namespace + +NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr &memory_manager) + : _gemm_asm_func(memory_manager), _activation_func(), _weights_permute_func(), _original_weights(nullptr), _permuted_weights(), _is_prepared(false), _run_activation(false) +{ +} +void NEGEMMConv2d::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_ERROR_THROW_ON(NEGEMMConv2d::validate(input->info(), + weights->info(), + biases != nullptr ? biases->info() : nullptr, + output->info(), + info)); + _original_weights = weights; + _weights_permute_func.configure(weights, &_permuted_weights, PermutationVector{ 3, 0, 1, 2 }); + + // Configure assembly dispatch + AsmGemmInfo asm_info = init_assembly_metadata(info, false); + if(is_data_type_quantized(input->info()->data_type())) + { + asm_info.output_stage = calculate_output_stage_metadata(input->info(), weights->info(), output->info(), info.act_info); + } + _gemm_asm_func.configure(input, &_permuted_weights, biases, output, asm_info); + + // Configure activation + if(info.act_info.enabled() && !_gemm_asm_func.is_activation_supported(info.act_info)) + { + _activation_func.configure(output, nullptr, info.act_info); + _run_activation = true; + } +} +Status NEGEMMConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.num_groups > 1, "Grouping (num_groups != 1) is not supported on NEON"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != DataLayout::NHWC, "Data layout supported is NHWC"); + const DataType data_type = input->data_type(); + const TensorShape i_shape = input->tensor_shape(); + const TensorShape w_shape = weights->tensor_shape(); + ARM_COMPUTE_RETURN_ERROR_ON(w_shape[0] != i_shape[0]); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); + // Validate biases + if(biases != nullptr) + { + if(is_data_type_quantized_asymmetric(data_type)) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); + } + else if(data_type == DataType::BFLOAT16) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); + } + ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3)); + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); + } + + AsmGemmInfo asm_info = init_assembly_metadata(info, false); + 
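// ---[ editorial sketch: not part of the patch ]------------------------------
// calculate_output_stage_metadata() above ends by calling
// quantization::calculate_quantized_multipliers(), which expresses each real
// rescale factor (typically input_scale * weight_scale / output_scale) as the
// fixed-point multiplier/shift pair that QUANTIZE_DOWN_FIXEDPOINT consumes.
// A standalone sketch of that standard decomposition, not the library's exact
// implementation:
#include <cmath>
#include <cstdint>

void quantize_multiplier(double real_multiplier, int32_t &quant_multiplier, int32_t &shift)
{
    if(real_multiplier == 0.)
    {
        quant_multiplier = 0;
        shift            = 0;
        return;
    }
    int          exponent = 0;
    const double q        = std::frexp(real_multiplier, &exponent); // real = q * 2^exponent, q in [0.5, 1)
    int64_t      q_fixed  = static_cast<int64_t>(std::round(q * (1ll << 31)));
    if(q_fixed == (1ll << 31)) // rounding can push q up to exactly 1.0
    {
        q_fixed /= 2;
        ++exponent;
    }
    quant_multiplier = static_cast<int32_t>(q_fixed); // Q0.31 fixed-point mantissa
    shift            = exponent;                      // positive = left shift
}
// -----------------------------------------------------------------------------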
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMAssemblyDispatch::validate(input, weights, biases, output, asm_info)); + return Status{}; +} +void NEGEMMConv2d::run() +{ + prepare(); + + _gemm_asm_func.run(); + if(_run_activation) + { + _activation_func.run(); + } +} +void NEGEMMConv2d::prepare() +{ + if(!_is_prepared) + { + _permuted_weights.allocator()->allocate(); + _weights_permute_func.run(); + _original_weights->mark_as_unused(); + _is_prepared = true; + } +} +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp index 834a66a867..3f50f81af2 100644 --- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp @@ -30,6 +30,21 @@ #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/core/NEON/kernels/NECol2ImKernel.h" +#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" +#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h" +#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" +#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" +#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" +#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" +#include "src/core/NEON/kernels/NEIm2ColKernel.h" +#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h" +#include "support/MemorySupport.h" + #include #include @@ -37,6 +52,7 @@ namespace arm_compute { using namespace arm_compute::misc::shape_calculator; +NEConvolutionLayerReshapeWeights::~NEConvolutionLayerReshapeWeights() = default; NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights() : _weights_reshape_kernel() { @@ -52,7 +68,8 @@ void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const I const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type()); const ITensor *biases_to_use = (append_biases) ? 
biases : nullptr; - _weights_reshape_kernel.configure(weights, biases_to_use, output); + _weights_reshape_kernel = arm_compute::support::cpp14::make_unique(); + _weights_reshape_kernel->configure(weights, biases_to_use, output); output->info()->set_quantization_info(weights->info()->quantization_info()); } @@ -86,9 +103,11 @@ Status NEConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, co void NEConvolutionLayerReshapeWeights::run() { - NEScheduler::get().schedule(&_weights_reshape_kernel, 3); + NEScheduler::get().schedule(_weights_reshape_kernel.get(), 3); } +NEGEMMConvolutionLayer::~NEGEMMConvolutionLayer() = default; + NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr &memory_manager, IWeightsManager *weights_manager) : _memory_group(memory_manager), _weights_manager(weights_manager), _reshape_weights(), _reshape_weights_managed(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(), _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _skip_im2col(false), @@ -323,7 +342,8 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig _memory_group.manage(&_im2col_output); // Configure - _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation); + _im2col_kernel = arm_compute::support::cpp14::make_unique(); + _im2col_kernel->configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation); // Update GEMM input gemm_input_to_use = &_im2col_output; @@ -365,7 +385,8 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig if(_data_layout == DataLayout::NCHW) { // Configure col2im - _col2im_kernel.configure(gemm_output_to_use, output, Size2D(conv_w, conv_h)); + _col2im_kernel = arm_compute::support::cpp14::make_unique(); + _col2im_kernel->configure(gemm_output_to_use, output, Size2D(conv_w, conv_h)); } else { @@ -538,7 +559,7 @@ void NEGEMMConvolutionLayer::run() { // Run input reshaping unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - NEScheduler::get().schedule(&_im2col_kernel, y_dim); + NEScheduler::get().schedule(_im2col_kernel.get(), y_dim); } // Runs NEGEMM or NEGEMMLowpMatrixMultiplyCore functions @@ -558,7 +579,7 @@ void NEGEMMConvolutionLayer::run() { if(_data_layout == DataLayout::NCHW) { - NEScheduler::get().schedule(&_col2im_kernel, Window::DimY); + NEScheduler::get().schedule(_col2im_kernel.get(), Window::DimY); } else { diff --git a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp index ad306c3662..70fdcf492d 100644 --- a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp +++ b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h" -#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" +#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp deleted file mode 100644 index 6d52f2b15c..0000000000 --- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "support/MemorySupport.h" - -using namespace arm_compute; - -NEGEMMLowpAssemblyMatrixMultiplyCore::NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr memory_manager) - : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b() -{ -} - -void NEGEMMLowpAssemblyMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::S8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); - ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); - ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A"); - ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B"); - - bool run_optimised = false; - switch(a->info()->data_type()) - { - case DataType::S8: - case DataType::QASYMM8: - case DataType::U8: - { - _asm_glue.configure(a, b, c, output, GEMMInfo(false, false, true)); - run_optimised = _asm_glue.is_configured(); - break; - } - default: - { - ARM_COMPUTE_ERROR("Datatype not supported"); - break; - } - } - if(!run_optimised) - { - // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] - TensorShape shape_tmp_a = a->info()->tensor_shape(); - shape_tmp_a.set(0, 
a->info()->dimension(0) * 4); - shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f)); - - // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] - TensorShape shape_tmp_b = b->info()->tensor_shape(); - shape_tmp_b.set(0, b->info()->dimension(1) * 16); - shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f)); - - TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type()); - TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type()); - _tmp_a.allocator()->init(info_a); - _tmp_b.allocator()->init(info_b); - _memory_group.manage(&_tmp_a); - _memory_group.manage(&_tmp_b); - - // Configure interleave kernel - { - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(a, &_tmp_a); - _mtx_a_reshape_kernel = std::move(k); - } - - // Configure transpose kernel - { - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(b, &_tmp_b); - _mtx_b_reshape_kernel = std::move(k); - } - - // Configure matrix multiply kernel - { - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(&_tmp_a, &_tmp_b, output); - _mm_kernel = std::move(k); - } - - // Allocate tensors - _tmp_a.allocator()->allocate(); - _tmp_b.allocator()->allocate(); - } -} - -void NEGEMMLowpAssemblyMatrixMultiplyCore::run() -{ - MemoryGroupResourceScope scope_mg(_memory_group); - if(_mtx_a_reshape_kernel) - { - NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY); - } - - if(_mtx_b_reshape_kernel) - { - NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); - } - - if(_asm_glue.is_configured()) - { - _asm_glue.run(); - } - else - { - NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY); - } -} diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index dada6d16da..df8eaacf47 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -33,12 +33,39 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/TensorAllocator.h" +#include "src/core/helpers/AutoConfiguration.h" + +#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" +#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h" +#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" + #include "support/MemorySupport.h" namespace arm_compute { +namespace +{ +AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) +{ + AsmGemmInfo asm_info; + asm_info.method = AsmConvMethod::Im2Col; + asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d(); + asm_info.depth_output_gemm3d = info.depth_output_gemm3d(); + asm_info.activation_info = info.activation_info(); + asm_info.output_stage = info.gemmlowp_output_stage(); + + return asm_info; +} +} // namespace + using namespace arm_compute::misc::shape_calculator; +NEGEMMLowpMatrixMultiplyCore::~NEGEMMLowpMatrixMultiplyCore() = default; + NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager, IWeightsManager *weights_manager) : _memory_group(memory_manager), _weights_manager(weights_manager), _asm_glue(memory_manager, 
weights_manager), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), _convert_to_signed_asymm(), @@ -79,7 +106,8 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, _signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction))); _memory_group.manage(&_signed_a); - _convert_to_signed_asymm.configure(a_to_use, &_signed_a); + _convert_to_signed_asymm = arm_compute::support::cpp14::make_unique(); + _convert_to_signed_asymm->configure(a_to_use, &_signed_a); a_to_use = &_signed_a; _a_offset = _signed_a.info()->quantization_info().uniform().offset; @@ -107,6 +135,8 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, _mm_result_s32.allocator()->init(info_mm_result_s32); } + // Initialize assembly kernel meta-data + const AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); #ifdef __aarch64__ switch(a->info()->data_type()) { @@ -117,22 +147,12 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, { if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { - // Result shifts < 0 are not supported by asm kernels - const std::vector &shifts = info.gemmlowp_output_stage().gemmlowp_shifts; - const bool is_asm_supported = info.gemmlowp_output_stage().gemmlowp_shift >= 0 - && std::all_of(shifts.cbegin(), shifts.cend(), [](int32_t val) - { - return val >= 0; - }); - if(is_asm_supported) - { - _asm_glue.configure(a_to_use, b, c, output, gemm_info); - _fused_assembly_path = _asm_glue.is_configured(); - } + _asm_glue.configure(a_to_use, b, c, output, asm_info); + _fused_assembly_path = _asm_glue.is_configured(); } else { - _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, gemm_info); + _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? 
&_mm_result_s32 : output, asm_info); } _assembly_path = _asm_glue.is_configured(); break; @@ -162,10 +182,12 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, } // Configure interleave kernel - _mtx_a_reshape_kernel.configure(a_to_use, &_tmp_a); + _mtx_a_reshape_kernel = arm_compute::support::cpp14::make_unique(); + _mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a); // Configure transpose kernel - _mtx_b_reshape_kernel.configure(b, &_tmp_b); + _mtx_b_reshape_kernel = arm_compute::support::cpp14::make_unique(); + _mtx_b_reshape_kernel->configure(b, &_tmp_b); } if(!_fused_assembly_path) @@ -185,7 +207,8 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, } // Configure Matrix B reduction kernel - _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, reduction_info); + _mtx_b_reduction_kernel = arm_compute::support::cpp14::make_unique(); + _mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info); } // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 @@ -197,7 +220,8 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, _memory_group.manage(&_vector_sum_row); // Configure matrix A reduction kernel - _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, reduction_info); + _mtx_a_reduction_kernel = arm_compute::support::cpp14::make_unique(); + _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info); } if(_fuse_output_stage) @@ -205,19 +229,22 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, // Configure matrix multiply kernel if(!_assembly_path) { - _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32); + _mm_kernel = arm_compute::support::cpp14::make_unique(); + _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32); } - _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, - _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c, - _flip_signedness ? &_signed_output : output, - a->info()->dimension(0), - _a_offset, _b_offset, info.gemmlowp_output_stage()); + _offset_contribution_output_stage_kernel = arm_compute::support::cpp14::make_unique(); + _offset_contribution_output_stage_kernel->configure(&_mm_result_s32, + _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c, + _flip_signedness ? &_signed_output : output, + a->info()->dimension(0), + _a_offset, _b_offset, info.gemmlowp_output_stage()); if(_flip_signedness) { - _convert_from_signed_asymm.configure(&_signed_output, output); + _convert_from_signed_asymm = arm_compute::support::cpp14::make_unique(); + _convert_from_signed_asymm->configure(&_signed_output, output); } } else @@ -225,10 +252,12 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, // Configure matrix multiply kernel if(!_assembly_path) { - _mm_kernel.configure(matrix_a, matrix_b, output); + _mm_kernel = arm_compute::support::cpp14::make_unique(); + _mm_kernel->configure(matrix_a, matrix_b, output); } // Configure offset contribution kernel - _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset); + _offset_contribution_kernel = arm_compute::support::cpp14::make_unique(); + _offset_contribution_kernel->configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? 
nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset); } // Configure activation @@ -334,28 +363,20 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso matrix_a_info = &signed_a; } + // Initialize assembly kernel meta-data + const AsmGemmInfo asm_info = init_assembly_metadata(info); + // Check if we need to run the optimized assembly kernel bool run_optimised = false; bool run_optimised_requantized = false; if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { - // Result shifts < 0 are not supported by asm kernels - const std::vector &shifts = info.gemmlowp_output_stage().gemmlowp_shifts; - const bool is_asm_supported = info.gemmlowp_output_stage().gemmlowp_shift >= 0 - && std::all_of(shifts.cbegin(), shifts.cend(), [](int32_t val) - { - return val >= 0; - }); - - if(is_asm_supported) - { - run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info)); - run_optimised_requantized = run_optimised; - } + run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, asm_info)); + run_optimised_requantized = run_optimised; } else { - run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, gemm_info)); + run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info)); } if(run_optimised) @@ -488,7 +509,7 @@ void NEGEMMLowpMatrixMultiplyCore::run() // Convert QASYMM8->QASYMM8_SIGNED if(_flip_signedness) { - NEScheduler::get().schedule(&_convert_to_signed_asymm, Window::DimY); + NEScheduler::get().schedule(_convert_to_signed_asymm.get(), Window::DimY); } // Run GEMM @@ -501,15 +522,15 @@ void NEGEMMLowpMatrixMultiplyCore::run() if(!_run_vector_matrix_multiplication) { // Run interleave kernel - NEScheduler::get().schedule(&_mtx_a_reshape_kernel, Window::DimY); + NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY); if(!_reshape_b_only_on_first_run) { // Run transpose kernel - NEScheduler::get().schedule(&_mtx_b_reshape_kernel, Window::DimY); + NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); } } - NEScheduler::get().schedule(&_mm_kernel, Window::DimY); + NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY); } if(!_fused_assembly_path) @@ -517,31 +538,31 @@ void NEGEMMLowpMatrixMultiplyCore::run() // Run matrix A reduction kernel only if _b_offset is not equal to 0 if(_b_offset != 0) { - NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX); + NEScheduler::get().schedule(_mtx_a_reduction_kernel.get(), Window::DimX); } // Run matrix B reduction kernel only if _a_offset is not equal to 0 if(_a_offset != 0 && !_reshape_b_only_on_first_run) { - NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX); + NEScheduler::get().schedule(_mtx_b_reduction_kernel.get(), Window::DimX); } if(_fuse_output_stage) { // Run offset contribution kernel - NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY); + NEScheduler::get().schedule(_offset_contribution_output_stage_kernel.get(), Window::DimY); } else { // Run offset contribution kernel - NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY); + NEScheduler::get().schedule(_offset_contribution_kernel.get(), Window::DimY); } } // Convert QASYMM8_SIGNED->QASYMM8 - if(_flip_signedness) + 
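// ---[ editorial sketch: not part of the patch ]------------------------------
// Background for the tightened _flip_signedness guard below: assuming the
// usual offset correction of 128 between the two quantized types, the flip is
// a pure element remapping between QASYMM8 (zero point z) and QASYMM8_SIGNED
// (zero point z - 128), so it only needs undoing when the un-fused path with
// a fused output stage actually produced a signed intermediate. Illustrative
// standalone helper:
#include <cstdint>

inline int8_t flip_to_signed(uint8_t x)
{
    // x - 128 with wraparound, i.e. flipping the most significant bit;
    // scale * (x - z) == scale * ((x - 128) - (z - 128))
    return static_cast<int8_t>(x ^ 0x80);
}
// -----------------------------------------------------------------------------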
if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness) { - NEScheduler::get().schedule(&_convert_from_signed_asymm, Window::DimY); + NEScheduler::get().schedule(_convert_from_signed_asymm.get(), Window::DimY); } // Run fused activation unless already run in the fused assembly @@ -580,7 +601,7 @@ void NEGEMMLowpMatrixMultiplyCore::prepare() // Run reshape kernel and mark original weights tensor as unused _tmp_b.allocator()->allocate(); - NEScheduler::get().schedule(&_mtx_b_reshape_kernel, Window::DimY); + NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); if(!original_b_managed_by_weights_manager) { _original_b->mark_as_unused(); @@ -591,7 +612,7 @@ void NEGEMMLowpMatrixMultiplyCore::prepare() if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run) { _vector_sum_col.allocator()->allocate(); - NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX); + NEScheduler::get().schedule(_mtx_b_reduction_kernel.get(), Window::DimX); } _is_prepared = true; diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp index 239a8e668a..9fb8851d7a 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp @@ -24,15 +24,17 @@ #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" #include "arm_compute/core/Validate.h" +#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" #include "support/MemorySupport.h" namespace arm_compute { +NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::~NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint() = default; + void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min, int max) { @@ -46,6 +48,8 @@ Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITens return NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, min, max); } +NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::~NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint() = default; + void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min, int max) { @@ -59,6 +63,8 @@ Status NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::validate(const ITenso return NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(input, bias, output, min, max); } +NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::~NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint() = default; + void 
NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int min, int max) { auto k = arm_compute::support::cpp14::make_unique(); @@ -71,6 +77,8 @@ Status NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(const ITens return NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(input, bias, output, min, max); } +NEGEMMLowpOutputStage::~NEGEMMLowpOutputStage() = default; + void NEGEMMLowpOutputStage::configure(const ITensor *input, const ITensor *bias, ITensor *output, const GEMMLowpOutputStageInfo &info) { // Perform validate step diff --git a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp index e807e86299..90cf0bab07 100644 --- a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp +++ b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp @@ -25,9 +25,9 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGather.cpp b/src/runtime/NEON/functions/NEGather.cpp index 5238936015..5c0dae1507 100644 --- a/src/runtime/NEON/functions/NEGather.cpp +++ b/src/runtime/NEON/functions/NEGather.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/NEON/functions/NEGather.h" -#include "arm_compute/core/NEON/kernels/NEGatherKernel.h" +#include "src/core/NEON/kernels/NEGatherKernel.h" #include "support/MemorySupport.h" #include diff --git a/src/runtime/NEON/functions/NEGaussian3x3.cpp b/src/runtime/NEON/functions/NEGaussian3x3.cpp index fba49ede2a..5290de1348 100644 --- a/src/runtime/NEON/functions/NEGaussian3x3.cpp +++ b/src/runtime/NEON/functions/NEGaussian3x3.cpp @@ -23,18 +23,23 @@ */ #include "arm_compute/runtime/NEON/functions/NEGaussian3x3.h" -#include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/NEON/kernels/NEGaussian3x3Kernel.h" #include "support/MemorySupport.h" #include -using namespace arm_compute; - +namespace arm_compute +{ void NEGaussian3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value) { auto k = arm_compute::support::cpp14::make_unique(); k->configure(input, output, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + + auto b = arm_compute::support::cpp14::make_unique(); + b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler = std::move(b); } +} // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/NEON/functions/NEGaussian5x5.cpp b/src/runtime/NEON/functions/NEGaussian5x5.cpp index 99591f4107..7857710462 100644 --- a/src/runtime/NEON/functions/NEGaussian5x5.cpp +++ b/src/runtime/NEON/functions/NEGaussian5x5.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. 
diff --git a/src/runtime/NEON/functions/NEGaussian5x5.cpp b/src/runtime/NEON/functions/NEGaussian5x5.cpp
index 99591f4107..7857710462 100644
--- a/src/runtime/NEON/functions/NEGaussian5x5.cpp
+++ b/src/runtime/NEON/functions/NEGaussian5x5.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,13 +24,17 @@
 #include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
 
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEGaussian5x5Kernel.h"
+#include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEGaussian5x5::~NEGaussian5x5() = default;
 
 NEGaussian5x5::NEGaussian5x5(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _kernel_hor(), _kernel_vert(), _tmp(), _border_handler()
@@ -46,21 +50,26 @@ void NEGaussian5x5::configure(ITensor *input, ITensor *output, BorderMode border
     // Manage intermediate buffers
     _memory_group.manage(&_tmp);
 
+    _kernel_hor     = arm_compute::support::cpp14::make_unique<NEGaussian5x5HorKernel>();
+    _kernel_vert    = arm_compute::support::cpp14::make_unique<NEGaussian5x5VertKernel>();
+    _border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+
     // Create and configure kernels for the two passes
-    _kernel_hor.configure(input, &_tmp, border_mode == BorderMode::UNDEFINED);
-    _kernel_vert.configure(&_tmp, output, border_mode == BorderMode::UNDEFINED);
+    _kernel_hor->configure(input, &_tmp, border_mode == BorderMode::UNDEFINED);
+    _kernel_vert->configure(&_tmp, output, border_mode == BorderMode::UNDEFINED);
 
     _tmp.allocator()->allocate();
 
-    _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler->configure(input, _kernel_hor->border_size(), border_mode, PixelValue(constant_border_value));
 }
 
 void NEGaussian5x5::run()
 {
-    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+    NEScheduler::get().schedule(_border_handler.get(), Window::DimZ);
 
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
-    NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
+    NEScheduler::get().schedule(_kernel_hor.get(), Window::DimY);
+    NEScheduler::get().schedule(_kernel_vert.get(), Window::DimY);
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
index ae883bcb20..30fe70f0ab 100644
--- a/src/runtime/NEON/functions/NEGaussianPyramid.cpp
+++ b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
@@ -25,16 +25,18 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h"
-#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
 #include "arm_compute/runtime/Pyramid.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEGaussian5x5Kernel.h"
+#include "src/core/NEON/kernels/NEGaussianPyramidKernel.h"
+#include "src/core/NEON/kernels/NEScaleKernel.h"
+#include "support/MemorySupport.h"
 
 #include <cstddef>
 
@@ -45,6 +47,8 @@ NEGaussianPyramid::NEGaussianPyramid()
 {
 }
 
+NEGaussianPyramidHalf::~NEGaussianPyramidHalf() = default;
+
 NEGaussianPyramidHalf::NEGaussianPyramidHalf() // NOLINT
     : _horizontal_border_handler(),
       _vertical_border_handler(),
@@ -94,16 +98,20 @@ void NEGaussianPyramidHalf::configure(const ITensor *input, IPyramid *pyramid, B
     for(size_t i = 0; i < num_stages; ++i)
     {
         /* Configure horizontal kernel */
-        _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
+        _horizontal_reduction[i] = arm_compute::support::cpp14::make_unique<NEGaussianPyramidHorKernel>();
+        _horizontal_reduction[i]->configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
 
         /* Configure vertical kernel */
-        _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
+        _vertical_reduction[i] = arm_compute::support::cpp14::make_unique<NEGaussianPyramidVertKernel>();
+        _vertical_reduction[i]->configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
 
         /* Configure border */
-        _horizontal_border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+        _horizontal_border_handler[i] = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+        _horizontal_border_handler[i]->configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i]->border_size(), border_mode, PixelValue(constant_border_value));
 
         /* Configure border */
-        _vertical_border_handler[i].configure(_tmp.get_pyramid_level(i), _vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16));
+        _vertical_border_handler[i] = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+        _vertical_border_handler[i]->configure(_tmp.get_pyramid_level(i), _vertical_reduction[i]->border_size(), border_mode, PixelValue(pixel_value_u16));
     }
 
     _tmp.allocate();
@@ -122,13 +130,15 @@ void NEGaussianPyramidHalf::run()
 
     for(unsigned int i = 0; i < num_levels - 1; ++i)
     {
-        NEScheduler::get().schedule(&_horizontal_border_handler[i], Window::DimZ);
-        NEScheduler::get().schedule(&_horizontal_reduction[i], Window::DimY);
-        NEScheduler::get().schedule(&_vertical_border_handler[i], Window::DimZ);
-        NEScheduler::get().schedule(&_vertical_reduction[i], Window::DimY);
+        NEScheduler::get().schedule(_horizontal_border_handler[i].get(), Window::DimZ);
+        NEScheduler::get().schedule(_horizontal_reduction[i].get(), Window::DimY);
+        NEScheduler::get().schedule(_vertical_border_handler[i].get(), Window::DimZ);
+        NEScheduler::get().schedule(_vertical_reduction[i].get(), Window::DimY);
     }
 }
 
+NEGaussianPyramidOrb::~NEGaussianPyramidOrb() = default;
+
 NEGaussianPyramidOrb::NEGaussianPyramidOrb() // NOLINT
     : _gaus5x5(),
       _scale_nearest()
@@ -168,7 +178,7 @@ void NEGaussianPyramidOrb::configure(const ITensor *input, IPyramid *pyramid, Bo
         _gaus5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
 
         /* Configure scale */
-        _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED });
+        _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED, PixelValue(), SamplingPolicy::CENTER, false });
     }
 
     _tmp.allocate();
"arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/core/NEON/kernels/NECopyKernel.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/NEON/kernels/NEPadLayerKernel.h" +#include "src/core/helpers/AutoConfiguration.h" namespace arm_compute { NEGenerateProposalsLayer::NEGenerateProposalsLayer(std::shared_ptr memory_manager) : _memory_group(memory_manager), - _permute_deltas_kernel(), + _permute_deltas(), _flatten_deltas(), - _permute_scores_kernel(), + _permute_scores(), _flatten_scores(), - _compute_anchors_kernel(), - _bounding_box_kernel(), - _pad_kernel(), + _compute_anchors(), + _bounding_box(), + _pad(), _dequantize_anchors(), _dequantize_deltas(), _quantize_all_proposals(), @@ -61,6 +65,8 @@ NEGenerateProposalsLayer::NEGenerateProposalsLayer(std::shared_ptrinit(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); @@ -94,7 +100,7 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d if(!_is_nhwc) { _memory_group.manage(&_deltas_permuted); - _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 }); + _permute_deltas.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 }); _flatten_deltas.configure(&_deltas_permuted, &_deltas_flattened); _deltas_permuted.allocator()->allocate(); } @@ -111,7 +117,7 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d if(!_is_nhwc) { _memory_group.manage(&_scores_permuted); - _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 }); + _permute_scores.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 }); _flatten_scores.configure(&_scores_permuted, &_scores_flattened); _scores_permuted.allocator()->allocate(); } @@ -140,7 +146,7 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d // Bounding box transform _memory_group.manage(&_all_proposals); BoundingBoxTransformInfo bbox_info(info.im_width(), info.im_height(), 1.f); - _bounding_box_kernel.configure(anchors_to_use, &_all_proposals, deltas_to_use, bbox_info); + _bounding_box.configure(anchors_to_use, &_all_proposals, deltas_to_use, bbox_info); deltas_to_use->allocator()->allocate(); anchors_to_use->allocator()->allocate(); @@ -196,7 +202,7 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d _scores_flattened.allocator()->allocate(); // Add the first column that represents the batch id. 
This will be all zeros, as we don't support multiple images - _pad_kernel.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } }); + _pad.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } }); _proposals_4_roi_values.allocator()->allocate(); } @@ -228,7 +234,7 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens } TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); + ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchors::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true); TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); @@ -239,8 +245,8 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEPermuteKernel::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 })); - ARM_COMPUTE_RETURN_ON_ERROR(NEPermuteKernel::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 })); + ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 })); + ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 })); } TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); @@ -257,25 +263,25 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens if(is_qasymm8) { TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayerKernel::validate(&all_anchors_info, &all_anchors_f32_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info)); TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayerKernel::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); TensorInfo proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransformKernel::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); - ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayerKernel::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); + 
ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized; } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); } - ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } })); + ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayer::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } })); if(num_valid_proposals->total_size() > 0) { @@ -318,13 +324,13 @@ void NEGenerateProposalsLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Compute all the anchors - NEScheduler::get().schedule(&_compute_anchors_kernel, Window::DimY); + _compute_anchors.run(); // Transpose and reshape the inputs if(!_is_nhwc) { - NEScheduler::get().schedule(&_permute_deltas_kernel, Window::DimY); - NEScheduler::get().schedule(&_permute_scores_kernel, Window::DimY); + _permute_deltas.run(); + _permute_scores.run(); } _flatten_deltas.run(); @@ -332,22 +338,22 @@ void NEGenerateProposalsLayer::run() if(_is_qasymm8) { - NEScheduler::get().schedule(&_dequantize_anchors, Window::DimY); - NEScheduler::get().schedule(&_dequantize_deltas, Window::DimY); + _dequantize_anchors.run(); + _dequantize_deltas.run(); } // Build the boxes - NEScheduler::get().schedule(&_bounding_box_kernel, Window::DimY); + _bounding_box.run(); if(_is_qasymm8) { - NEScheduler::get().schedule(&_quantize_all_proposals, Window::DimY); + _quantize_all_proposals.run(); } // Non maxima suppression _cpp_nms.run(); // Add dummy batch indexes - NEScheduler::get().schedule(&_pad_kernel, Window::DimY); + _pad.run(); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEHOGDescriptor.cpp b/src/runtime/NEON/functions/NEHOGDescriptor.cpp index 10765f9b86..689e64fae7 100644 --- a/src/runtime/NEON/functions/NEHOGDescriptor.cpp +++ b/src/runtime/NEON/functions/NEHOGDescriptor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. 
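The validate/run rewrites above swap direct kernel calls (NEPermuteKernel, NEDequantizationLayerKernel, NEBoundingBoxTransformKernel, NEPadLayerKernel) for their function-level counterparts, so the layer composes runtime functions instead of scheduling kernels itself. Below is a hedged usage sketch of one such function, NEPermute, assuming the standard ACL runtime headers; buffer contents are left unfilled because only the configure/validate/run flow is being illustrated.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEPermute.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void permute_example()
{
    // With PermutationVector{ 2, 0, 1 }, output dimension i takes input
    // dimension perm[i], so shape (4, 8, 2) becomes (2, 4, 8).
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(4U, 8U, 2U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(2U, 4U, 8U), 1, DataType::F32));

    NEPermute permute;
    ARM_COMPUTE_ERROR_THROW_ON(NEPermute::validate(src.info(), dst.info(), PermutationVector{ 2, 0, 1 }));
    permute.configure(&src, &dst, PermutationVector{ 2, 0, 1 });

    src.allocator()->allocate();
    dst.allocator()->allocate();
    permute.run(); // the function schedules its kernel internally
}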
diff --git a/src/runtime/NEON/functions/NEHOGDescriptor.cpp b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
index 10765f9b86..689e64fae7 100644
--- a/src/runtime/NEON/functions/NEHOGDescriptor.cpp
+++ b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,8 +28,14 @@
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEDerivativeKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEHOGDescriptorKernel.h"
+#include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEHOGDescriptor::~NEHOGDescriptor() = default;
 
 NEHOGDescriptor::NEHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
@@ -82,10 +88,12 @@ void NEHOGDescriptor::configure(ITensor *input, ITensor *output, const IHOG *hog
     _memory_group.manage(&_hog_space);
 
     // Initialise orientation binning kernel
-    _orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info());
+    _orient_bin = arm_compute::support::cpp14::make_unique<NEHOGOrientationBinningKernel>();
+    _orient_bin->configure(&_mag, &_phase, &_hog_space, hog->info());
 
     // Initialize HOG norm kernel
-    _block_norm.configure(&_hog_space, output, hog->info());
+    _block_norm = arm_compute::support::cpp14::make_unique<NEHOGBlockNormalizationKernel>();
+    _block_norm->configure(&_hog_space, output, hog->info());
 
     // Allocate intermediate tensors
     _mag.allocator()->allocate();
@@ -101,8 +109,9 @@ void NEHOGDescriptor::run()
     _gradient.run();
 
     // Run orientation binning kernel
-    NEScheduler::get().schedule(&_orient_bin, Window::DimY);
+    NEScheduler::get().schedule(_orient_bin.get(), Window::DimY);
 
     // Run block normalization kernel
-    NEScheduler::get().schedule(&_block_norm, Window::DimY);
+    NEScheduler::get().schedule(_block_norm.get(), Window::DimY);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEHOGDetector.cpp b/src/runtime/NEON/functions/NEHOGDetector.cpp
index 21db5f83b7..8468b75f4e 100644
--- a/src/runtime/NEON/functions/NEHOGDetector.cpp
+++ b/src/runtime/NEON/functions/NEHOGDetector.cpp
@@ -23,10 +23,12 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEHOGDetector.h"
 
-#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h"
+#include "src/core/NEON/kernels/NEHOGDetectorKernel.h"
 #include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEHOGDetector::~NEHOGDetector() = default;
 
 void NEHOGDetector::configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class)
 {
@@ -34,3 +36,4 @@ void NEHOGDetector::configure(const ITensor *input, const IHOG *hog, IDetectionW
     k->configure(input, hog, detection_windows, detection_window_stride, threshold, idx_class);
     _kernel = std::move(k);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEHOGGradient.cpp b/src/runtime/NEON/functions/NEHOGGradient.cpp
index 8f3559a7ed..7d794bc1a0 100644
--- a/src/runtime/NEON/functions/NEHOGGradient.cpp
+++ b/src/runtime/NEON/functions/NEHOGGradient.cpp
@@ -23,12 +23,16 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEHOGGradient.h"
 
-#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEDerivativeKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEMagnitudePhaseKernel.h"
 #include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEHOGGradient::~NEHOGGradient() = default;
 
 NEHOGGradient::NEHOGGradient(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
     : _memory_group(std::move(memory_manager)),
@@ -88,3 +92,4 @@ void NEHOGGradient::run()
     // Run magnitude/phase kernel
     NEScheduler::get().schedule(_mag_phase.get(), Window::DimY);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
index e08b699e1c..3e41faad43 100644
--- a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
+++ b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
@@ -28,8 +28,13 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/Tensor.h"
+#include "src/core/NEON/kernels/NEDerivativeKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEHOGDescriptorKernel.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEHOGMultiDetection::~NEHOGMultiDetection() = default;
 
 NEHOGMultiDetection::NEHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
     : _memory_group(std::move(memory_manager)),
@@ -262,3 +267,4 @@ void NEHOGMultiDetection::run()
         NEScheduler::get().schedule(&_non_maxima_kernel, Window::DimY);
     }
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp
index 3c51eb2249..23fcf8c805 100644
--- a/src/runtime/NEON/functions/NEHarrisCorners.cpp
+++ b/src/runtime/NEON/functions/NEHarrisCorners.cpp
@@ -24,8 +24,6 @@
 #include "arm_compute/runtime/NEON/functions/NEHarrisCorners.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/Array.h"
@@ -34,12 +32,18 @@
 #include "arm_compute/runtime/NEON/functions/NESobel5x5.h"
 #include "arm_compute/runtime/NEON/functions/NESobel7x7.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEHarrisCornersKernel.h"
+#include "src/core/NEON/kernels/NESobel5x5Kernel.h"
+#include "src/core/NEON/kernels/NESobel7x7Kernel.h"
 #include "support/MemorySupport.h"
 
 #include <cmath>
 #include <utility>
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEHarrisCorners::~NEHarrisCorners() = default;
 
 NEHarrisCorners::NEHarrisCorners(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
     : _memory_group(std::move(memory_manager)),
@@ -154,8 +159,10 @@ void NEHarrisCorners::configure(IImage *input, float threshold, float min_dist,
     }
 
     // Configure border filling before harris score
-    _border_gx.configure(&_gx, _harris_score->border_size(), border_mode, constant_border_value);
-    _border_gy.configure(&_gy, _harris_score->border_size(), border_mode, constant_border_value);
+    _border_gx = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    _border_gy = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    _border_gx->configure(&_gx, _harris_score->border_size(), border_mode, constant_border_value);
+    _border_gy->configure(&_gy, _harris_score->border_size(), border_mode, constant_border_value);
 
     // Allocate once all the configure methods have been called
     _gx.allocator()->allocate();
@@ -193,8 +200,8 @@ void NEHarrisCorners::run()
     _sobel->run();
 
     // Fill border before harris score kernel
-    NEScheduler::get().schedule(&_border_gx, Window::DimZ);
-    NEScheduler::get().schedule(&_border_gy, Window::DimZ);
+    NEScheduler::get().schedule(_border_gx.get(), Window::DimZ);
+    NEScheduler::get().schedule(_border_gy.get(), Window::DimZ);
 
     // Run harris score kernel
     NEScheduler::get().schedule(_harris_score.get(), Window::DimY);
@@ -208,3 +215,4 @@
     // Run sort & euclidean distance
     NEScheduler::get().schedule(&_sort_euclidean, Window::DimY);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEHistogram.cpp b/src/runtime/NEON/functions/NEHistogram.cpp
index 39fad977af..40ea3a16c6 100644
--- a/src/runtime/NEON/functions/NEHistogram.cpp
+++ b/src/runtime/NEON/functions/NEHistogram.cpp
@@ -29,8 +29,12 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEHistogramKernel.h"
+#include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEHistogram::~NEHistogram() = default;
 
 NEHistogram::NEHistogram()
     : _histogram_kernel(), _local_hist(), _window_lut(window_lut_default_size), _local_hist_size(0)
@@ -47,11 +51,13 @@ void NEHistogram::configure(const IImage *input, IDistribution1D *output)
     _local_hist.resize(_local_hist_size);
 
     // Configure kernel
-    _histogram_kernel.configure(input, output, _local_hist.data(), _window_lut.data());
+    _histogram_kernel = arm_compute::support::cpp14::make_unique<NEHistogramKernel>();
+    _histogram_kernel->configure(input, output, _local_hist.data(), _window_lut.data());
 }
 
 void NEHistogram::run()
 {
     // Calculate histogram of input.
-    NEScheduler::get().schedule(&_histogram_kernel, Window::DimY);
+    NEScheduler::get().schedule(_histogram_kernel.get(), Window::DimY);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEIm2Col.cpp b/src/runtime/NEON/functions/NEIm2Col.cpp
index 99e5d3f1df..bc0c60112e 100644
--- a/src/runtime/NEON/functions/NEIm2Col.cpp
+++ b/src/runtime/NEON/functions/NEIm2Col.cpp
@@ -25,9 +25,13 @@
 
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEIm2ColKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NEIm2Col::~NEIm2Col() = default;
+
 NEIm2Col::NEIm2Col()
     : _kernel(), _y_dim(1)
 {
@@ -37,7 +41,8 @@ void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &ke
 {
     _y_dim = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
 
-    _kernel.configure(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups);
+    _kernel = arm_compute::support::cpp14::make_unique<NEIm2ColKernel>();
+    _kernel->configure(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups);
 }
 
 Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
@@ -48,6 +53,6 @@ Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, c
 
 void NEIm2Col::run()
 {
-    NEScheduler::get().schedule(&_kernel, _y_dim);
+    NEScheduler::get().schedule(_kernel.get(), _y_dim);
 }
 } // namespace arm_compute
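Note that NEIm2Col above schedules over _y_dim, the HEIGHT axis of the current data layout, rather than a fixed window dimension. A small sketch of the lookup it relies on; get_data_layout_dimension_index is the real ACL helper, while the wrapper function is illustrative only.

#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

// Returns the tensor dimension holding HEIGHT: 1 for NCHW, 2 for NHWC.
size_t height_axis(DataLayout layout)
{
    return get_data_layout_dimension_index(layout, DataLayoutDimension::HEIGHT);
}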
"arm_compute/runtime/NEON/NEScheduler.h" +#include "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { +NEInstanceNormalizationLayer::~NEInstanceNormalizationLayer() = default; + NEInstanceNormalizationLayer::NEInstanceNormalizationLayer(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), _permute_input(), _permute_output(), _permuted_input(), _permuted_output() { @@ -42,6 +46,8 @@ void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, fl // Configure Kernels _is_nchw = data_layout == DataLayout::NCHW; + _normalization_kernel = arm_compute::support::cpp14::make_unique(); + if(!_is_nchw) { _memory_group.manage(&_permuted_input); @@ -51,7 +57,7 @@ void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, fl _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U)); _permuted_input.info()->set_data_layout(DataLayout::NCHW); - _normalization_kernel.configure(&_permuted_input, &_permuted_output, kernel_descriptor); + _normalization_kernel->configure(&_permuted_input, &_permuted_output, kernel_descriptor); _permuted_output.info()->set_data_layout(DataLayout::NCHW); _permute_output.configure(&_permuted_output, output != nullptr ? output : input, PermutationVector(2U, 0U, 1U)); @@ -60,7 +66,7 @@ void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, fl } else { - _normalization_kernel.configure(input, output, kernel_descriptor); + _normalization_kernel->configure(input, output, kernel_descriptor); } } @@ -81,7 +87,7 @@ void NEInstanceNormalizationLayer::run() _permute_input.run(); } - NEScheduler::get().schedule(&_normalization_kernel, Window::DimZ); + NEScheduler::get().schedule(_normalization_kernel.get(), Window::DimZ); // Permute output if(!_is_nchw) diff --git a/src/runtime/NEON/functions/NEIntegralImage.cpp b/src/runtime/NEON/functions/NEIntegralImage.cpp index 8ab6bbd76d..63bcd53373 100644 --- a/src/runtime/NEON/functions/NEIntegralImage.cpp +++ b/src/runtime/NEON/functions/NEIntegralImage.cpp @@ -23,18 +23,25 @@ */ #include "arm_compute/runtime/NEON/functions/NEIntegralImage.h" -#include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/NEON/kernels/NEIntegralImageKernel.h" #include "support/MemorySupport.h" #include -using namespace arm_compute; +namespace arm_compute +{ +NEIntegralImage::~NEIntegralImage() = default; void NEIntegralImage::configure(const ITensor *input, ITensor *output) { auto k = arm_compute::support::cpp14::make_unique(); k->configure(input, output); _kernel = std::move(k); - _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, PixelValue()); + + auto b = arm_compute::support::cpp14::make_unique(); + b->configure(output, _kernel->border_size(), BorderMode::CONSTANT, PixelValue()); + _border_handler = std::move(b); } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp index 04cf3a233a..4a99968cc3 100644 --- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp +++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -25,6 +25,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/core/NEON/kernels/NEL2NormalizeLayerKernel.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" +#include "support/MemorySupport.h" namespace arm_compute { @@ -32,6 +35,7 @@ namespace { constexpr int max_input_tensor_dim = 3; } // namespace +NEL2NormalizeLayer::~NEL2NormalizeLayer() = default; NEL2NormalizeLayer::NEL2NormalizeLayer(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq() @@ -46,7 +50,8 @@ void NEL2NormalizeLayer::configure(ITensor *input, ITensor *output, int axis, fl // Configure Kernels const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim); _reduce_func.configure(input, &_sumsq, actual_axis, ReductionOperation::SUM_SQUARE); - _normalize_kernel.configure(input, &_sumsq, output, axis, epsilon); + _normalize_kernel = arm_compute::support::cpp14::make_unique(); + _normalize_kernel->configure(input, &_sumsq, output, axis, epsilon); // Allocate intermediate tensors _sumsq.allocator()->allocate(); @@ -78,6 +83,6 @@ void NEL2NormalizeLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); _reduce_func.run(); - NEScheduler::get().schedule(&_normalize_kernel, Window::DimY); + NEScheduler::get().schedule(_normalize_kernel.get(), Window::DimY); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp index dca274acd2..48d69bd6fc 100644 --- a/src/runtime/NEON/functions/NELSTMLayer.cpp +++ b/src/runtime/NEON/functions/NELSTMLayer.cpp @@ -29,12 +29,24 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/common/LSTMParams.h" +#include "src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h" +#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" +#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h" +#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" +#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" +#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" namespace arm_compute { using namespace arm_compute::misc::shape_calculator; using namespace arm_compute::utils::info_helpers; +NELSTMLayer::~NELSTMLayer() = default; + NELSTMLayer::NELSTMLayer(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(), _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _transpose_cell_state(), @@ -575,8 +587,8 @@ Status NELSTMLayer::validate(const ITensorInfo *input, } // Validate copy kernel - ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(&cell_state_tmp, cell_state_out)); - ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(output_state_out, output)); + ARM_COMPUTE_RETURN_ON_ERROR(NECopy::validate(&cell_state_tmp, cell_state_out)); + 
ARM_COMPUTE_RETURN_ON_ERROR(NECopy::validate(output_state_out, output)); // Validate scratch concatenation std::vector inputs_vector_info_raw; @@ -646,7 +658,7 @@ void NELSTMLayer::run() } _fully_connected_cell_state.run(); - NEScheduler::get().schedule(&_transpose_cell_state, Window::DimY); + _transpose_cell_state.run(); _gemm_cell_state1.run(); _accum_cell_state1.run(); if(_is_layer_norm_lstm) @@ -691,8 +703,8 @@ void NELSTMLayer::run() } } - NEScheduler::get().schedule(&_copy_cell_state, Window::DimY); - NEScheduler::get().schedule(&_copy_output, Window::DimY); + _copy_cell_state.run(); + _copy_output.run(); _concat_scratch_buffer.run(); } diff --git a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp index 11989d3225..e43929390e 100644 --- a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp +++ b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,6 +26,17 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h" +#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h" +#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h" +#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" +#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" +#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" +#include "src/core/helpers/AutoConfiguration.h" #include #include @@ -41,6 +52,7 @@ const QuantizationInfo qsymm_3(8.f / 32768.f, 0); // qsymm16 with 3 integer bit const QuantizationInfo qsymm_4(16.f / 32768.f, 0); // qsymm16 with 4 integer bit const QuantizationInfo qsymm_0(1.f / 32768.f, 0); // qsymm16 with 0 integer bit } // namespace +NELSTMLayerQuantized::~NELSTMLayerQuantized() = default; NELSTMLayerQuantized::NELSTMLayerQuantized(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(), diff --git a/src/runtime/NEON/functions/NELaplacianPyramid.cpp b/src/runtime/NEON/functions/NELaplacianPyramid.cpp index 4f0639b64b..a2651dbf36 100644 --- a/src/runtime/NEON/functions/NELaplacianPyramid.cpp +++ b/src/runtime/NEON/functions/NELaplacianPyramid.cpp @@ -29,11 +29,15 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h" -#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h" #include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h" #include "arm_compute/runtime/Tensor.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/NEON/kernels/NEGaussian5x5Kernel.h" +#include "src/core/NEON/kernels/NEGaussianPyramidKernel.h" -using namespace arm_compute; +namespace arm_compute +{ +NELaplacianPyramid::~NELaplacianPyramid() = default; NELaplacianPyramid::NELaplacianPyramid() // NOLINT : _num_levels(0), @@ -105,3 +109,4 @@ void 
NELaplacianPyramid::configure(const ITensor *input, IPyramid *pyramid, ITen _gauss_pyr.allocate(); _conv_pyr.allocate(); } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp index 24755fc99b..a50e7ccbef 100644 --- a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp +++ b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h" +#include "arm_compute/core/CPP/ICPPKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/IPyramid.h" #include "arm_compute/core/ITensor.h" @@ -31,7 +32,9 @@ #include -using namespace arm_compute; +namespace arm_compute +{ +NELaplacianReconstruct::~NELaplacianReconstruct() = default; NELaplacianReconstruct::NELaplacianReconstruct() // NOLINT : _tmp_pyr(), @@ -73,7 +76,7 @@ void NELaplacianReconstruct::configure(const IPyramid *pyramid, ITensor *input, // Scale levels n-1 to 1, and add levels n-2 to 0 for(size_t l = 0; l < last_level; ++l) { - _scalef[l].configure(_tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), ScaleKernelInfo{ arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value }); + _scalef[l].configure(_tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), ScaleKernelInfo{ arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value, SamplingPolicy::CENTER, false }); _addf[l].configure(_tmp_pyr.get_pyramid_level(l), pyramid->get_pyramid_level(l), _tmp_pyr.get_pyramid_level(l), ConvertPolicy::SATURATE); } @@ -100,3 +103,4 @@ void NELaplacianReconstruct::run() _depthf.run(); } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp index af502be1e9..131ac82ba8 100644 --- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp +++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
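In the NELSTMLayer hunks above, copies and transposes that were previously dispatched as bare kernels are now ordinary function members that are simply run(). A hedged usage sketch of NECopy on its own, assuming two tensors with matching, allocated shapes:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NECopy.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void copy_example()
{
    Tensor src, dst;
    const TensorInfo info(TensorShape(32U, 8U), 1, DataType::F32);
    src.allocator()->init(info);
    dst.allocator()->init(info);
    src.allocator()->allocate();
    dst.allocator()->allocate();

    NECopy copy;
    copy.configure(&src, &dst);
    copy.run(); // schedules the copy kernel internally, replacing the old
                // NEScheduler::get().schedule(&_copy_kernel, ...) pattern
}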
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index af502be1e9..131ac82ba8 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,12 +27,16 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEIm2ColKernel.h"
+#include "src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
+#include "support/MemorySupport.h"
 
 #include <cmath>
 #include <tuple>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
@@ -70,9 +74,10 @@ void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, cons
     shape_gemm.set(1, mat_input_rows);
 }
 } // namespace
+NELocallyConnectedLayer::~NELocallyConnectedLayer() = default;
 
 NELocallyConnectedLayer::NELocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
+    : _memory_group(std::move(memory_manager)), _input_im2col(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
       _is_prepared(false), _original_weights(nullptr)
 {
 }
@@ -113,10 +118,10 @@ Status NELocallyConnectedLayer::validate(const ITensorInfo *input, const ITensor
     TensorInfo input_im2col_reshaped_info(shape_im2col, 1, input->data_type());
     TensorInfo gemm_output_info(shape_gemm, 1, input->data_type());
 
-    ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEIm2Col::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias));
     ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, &weights_reshaped_info));
     ARM_COMPUTE_RETURN_ON_ERROR(NELocallyConnectedMatrixMultiplyKernel::validate(&input_im2col_reshaped_info, &weights_reshaped_info, &gemm_output_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h)));
+    ARM_COMPUTE_RETURN_ON_ERROR(NECol2Im::validate(&gemm_output_info, output, Size2D(conv_w, conv_h)));
 
     return Status{};
 }
@@ -154,10 +159,12 @@ void NELocallyConnectedLayer::configure(const ITensor *input, const ITensor *wei
     _memory_group.manage(&_gemm_output);
 
     // Configure kernels
-    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
-    _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
-    _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
-    _output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h));
+    _input_im2col.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
+    _weights_reshape_kernel = arm_compute::support::cpp14::make_unique<NEWeightsReshapeKernel>();
+    _weights_reshape_kernel->configure(weights, biases, &_weights_reshaped);
+    _mm_kernel = arm_compute::support::cpp14::make_unique<NELocallyConnectedMatrixMultiplyKernel>();
+    _mm_kernel->configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
+    _output_col2im.configure(&_gemm_output, output, Size2D(conv_w, conv_h));
 
     // Allocate intermediate tensors
     _input_im2col_reshaped.allocator()->allocate();
@@ -171,13 +178,13 @@ void NELocallyConnectedLayer::run()
     MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Run input reshaping
-    NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
+    _input_im2col.run();
 
     // Runs GEMM on reshaped matrices
-    NEScheduler::get().schedule(&_mm_kernel, Window::DimX);
+    NEScheduler::get().schedule(_mm_kernel.get(), Window::DimX);
 
     // Reshape output matrix
-    NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+    _output_col2im.run();
 }
 
 void NELocallyConnectedLayer::prepare()
@@ -188,9 +195,10 @@ void NELocallyConnectedLayer::prepare()
 
         // Run weights reshaping and mark original weights tensor as unused
         _weights_reshaped.allocator()->allocate();
-        NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+        NEScheduler::get().schedule(_weights_reshape_kernel.get(), 3);
         _original_weights->mark_as_unused();
 
         _is_prepared = true;
     }
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NELogical.cpp b/src/runtime/NEON/functions/NELogical.cpp
new file mode 100644
index 0000000000..8e43d60bef
--- /dev/null
+++ b/src/runtime/NEON/functions/NELogical.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NELogical.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "src/core/NEON/kernels/NELogicalKernel.h"
+#include "support/MemorySupport.h"
+
+namespace arm_compute
+{
+struct LogicalArgs
+{
+    std::unique_ptr<kernels::NELogicalKernel> kernel{ nullptr };
+    ITensorPack pack{};
+};
+
+struct NELogicalAnd::Impl : public LogicalArgs
+{
+};
+NELogicalAnd::NELogicalAnd()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NELogicalAnd &NELogicalAnd::operator=(NELogicalAnd &&) = default;
+NELogicalAnd::~NELogicalAnd() = default;
+
+void NELogicalAnd::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+    _impl->kernel = arm_compute::support::cpp14::make_unique<kernels::NELogicalKernel>();
+    _impl->kernel->configure(input1->info(), input2->info(), output->info(), kernels::LogicalOperation::And);
+
+    _impl->pack = ITensorPack();
+    _impl->pack.add_tensor(TensorType::ACL_SRC_0, input1);
+    _impl->pack.add_tensor(TensorType::ACL_SRC_1, input2);
+    _impl->pack.add_tensor(TensorType::ACL_DST, output);
+}
+
+Status NELogicalAnd::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return kernels::NELogicalKernel::validate(input1, input2, output, kernels::LogicalOperation::And);
+}
+
+void NELogicalAnd::run()
+{
+    NEScheduler::get().schedule_op(_impl->kernel.get(), Window::DimY, _impl->pack);
+}
+
+struct NELogicalOr::Impl : public LogicalArgs
+{
+};
+NELogicalOr::NELogicalOr()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NELogicalOr &NELogicalOr::operator=(NELogicalOr &&) = default;
+NELogicalOr::~NELogicalOr() = default;
+
+void NELogicalOr::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+    _impl->kernel = arm_compute::support::cpp14::make_unique<kernels::NELogicalKernel>();
+    _impl->kernel->configure(input1->info(), input2->info(), output->info(), kernels::LogicalOperation::Or);
+
+    _impl->pack = ITensorPack();
+    _impl->pack.add_tensor(TensorType::ACL_SRC_0, input1);
+    _impl->pack.add_tensor(TensorType::ACL_SRC_1, input2);
+    _impl->pack.add_tensor(TensorType::ACL_DST, output);
+}
+
+Status NELogicalOr::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return kernels::NELogicalKernel::validate(input1, input2, output, kernels::LogicalOperation::Or);
+}
+
+void NELogicalOr::run()
+{
+    NEScheduler::get().schedule_op(_impl->kernel.get(), Window::DimY, _impl->pack);
+}
+
+struct NELogicalNot::Impl : public LogicalArgs
+{
+};
+NELogicalNot::NELogicalNot()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NELogicalNot &NELogicalNot::operator=(NELogicalNot &&) = default;
+NELogicalNot::~NELogicalNot() = default;
+
+void NELogicalNot::configure(const ITensor *input, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    _impl->kernel = arm_compute::support::cpp14::make_unique<kernels::NELogicalKernel>();
+    _impl->kernel->configure(input->info(), nullptr, output->info(), kernels::LogicalOperation::Not);
+
+    _impl->pack = ITensorPack();
+    _impl->pack.add_tensor(TensorType::ACL_SRC_0, input);
+    _impl->pack.add_tensor(TensorType::ACL_DST, output);
+}
+
+Status NELogicalNot::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return kernels::NELogicalKernel::validate(input, nullptr, output, kernels::LogicalOperation::Not);
+}
+
+void NELogicalNot::run()
+{
+    NEScheduler::get().schedule_op(_impl->kernel.get(), Window::DimY, _impl->pack);
+}
+} // namespace arm_compute
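The new NELogical.cpp above hides its kernel and ITensorPack behind a PImpl and dispatches through schedule_op() at run() time instead of binding tensors during configure(). A hedged usage sketch of NELogicalAnd with U8 tensors; buffer contents are left uninitialised because only the call sequence is being shown.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NELogical.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void logical_and_example()
{
    Tensor a, b, out;
    const TensorInfo info(TensorShape(16U, 4U), 1, DataType::U8);
    a.allocator()->init(info);
    b.allocator()->init(info);
    out.allocator()->init(info);

    NELogicalAnd and_op;
    ARM_COMPUTE_ERROR_THROW_ON(NELogicalAnd::validate(a.info(), b.info(), out.info()));
    and_op.configure(&a, &b, &out);

    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();
    and_op.run(); // internally: NEScheduler::get().schedule_op(kernel, Window::DimY, pack)
}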
* * SPDX-License-Identifier: MIT * @@ -24,8 +24,13 @@ #include "arm_compute/runtime/NEON/functions/NEMeanStdDev.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/NEON/kernels/NEMeanStdDevKernel.h" +#include "support/MemorySupport.h" -using namespace arm_compute; +namespace arm_compute +{ +NEMeanStdDev::~NEMeanStdDev() = default; NEMeanStdDev::NEMeanStdDev() : _mean_stddev_kernel(), _fill_border_kernel(), _global_sum(0), _global_sum_squared(0) @@ -34,8 +39,11 @@ NEMeanStdDev::NEMeanStdDev() void NEMeanStdDev::configure(IImage *input, float *mean, float *stddev) { - _mean_stddev_kernel.configure(input, mean, &_global_sum, stddev, &_global_sum_squared); - _fill_border_kernel.configure(input, _mean_stddev_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast(0))); + _mean_stddev_kernel = arm_compute::support::cpp14::make_unique(); + _fill_border_kernel = arm_compute::support::cpp14::make_unique(); + + _mean_stddev_kernel->configure(input, mean, &_global_sum, stddev, &_global_sum_squared); + _fill_border_kernel->configure(input, _mean_stddev_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast(0))); } void NEMeanStdDev::run() @@ -43,6 +51,7 @@ void NEMeanStdDev::run() _global_sum = 0; _global_sum_squared = 0; - NEScheduler::get().schedule(&_fill_border_kernel, Window::DimZ); - NEScheduler::get().schedule(&_mean_stddev_kernel, Window::DimY); + NEScheduler::get().schedule(_fill_border_kernel.get(), Window::DimZ); + NEScheduler::get().schedule(_mean_stddev_kernel.get(), Window::DimY); } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp index a88732b67d..d128c4456a 100644 --- a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp @@ -23,11 +23,13 @@ */ #include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h" -#include "arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h" +#include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h" #include "support/MemorySupport.h" namespace arm_compute { +NEMeanStdDevNormalizationLayer::~NEMeanStdDevNormalizationLayer() = default; + void NEMeanStdDevNormalizationLayer::configure(ITensor *input, ITensor *output, float epsilon) { auto k = arm_compute::support::cpp14::make_unique(); diff --git a/src/runtime/NEON/functions/NEMedian3x3.cpp b/src/runtime/NEON/functions/NEMedian3x3.cpp index 2bbe8d39ae..b7b7c2cb47 100644 --- a/src/runtime/NEON/functions/NEMedian3x3.cpp +++ b/src/runtime/NEON/functions/NEMedian3x3.cpp @@ -23,18 +23,23 @@ */ #include "arm_compute/runtime/NEON/functions/NEMedian3x3.h" -#include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/NEON/kernels/NEMedian3x3Kernel.h" #include "support/MemorySupport.h" #include -using namespace arm_compute; - +namespace arm_compute +{ void NEMedian3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value) { auto k = arm_compute::support::cpp14::make_unique(); k->configure(input, output, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + + auto b = arm_compute::support::cpp14::make_unique(); + 
b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler = std::move(b); } +} // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/NEON/functions/NEMinMaxLocation.cpp b/src/runtime/NEON/functions/NEMinMaxLocation.cpp index ca63937770..3c2219ca07 100644 --- a/src/runtime/NEON/functions/NEMinMaxLocation.cpp +++ b/src/runtime/NEON/functions/NEMinMaxLocation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,8 +24,12 @@ #include "arm_compute/runtime/NEON/functions/NEMinMaxLocation.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/core/NEON/kernels/NEMinMaxLocationKernel.h" +#include "support/MemorySupport.h" -using namespace arm_compute; +namespace arm_compute +{ +NEMinMaxLocation::~NEMinMaxLocation() = default; NEMinMaxLocation::NEMinMaxLocation() : _min_max(), _min_max_loc() @@ -34,17 +38,21 @@ NEMinMaxLocation::NEMinMaxLocation() void NEMinMaxLocation::configure(const IImage *input, void *min, void *max, ICoordinates2DArray *min_loc, ICoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count) { - _min_max.configure(input, min, max); - _min_max_loc.configure(input, min, max, min_loc, max_loc, min_count, max_count); + _min_max = arm_compute::support::cpp14::make_unique(); + _min_max->configure(input, min, max); + + _min_max_loc = arm_compute::support::cpp14::make_unique(); + _min_max_loc->configure(input, min, max, min_loc, max_loc, min_count, max_count); } void NEMinMaxLocation::run() { - _min_max.reset(); + _min_max->reset(); /* Run min max kernel */ - NEScheduler::get().schedule(&_min_max, Window::DimY); + NEScheduler::get().schedule(_min_max.get(), Window::DimY); /* Run min max location */ - NEScheduler::get().schedule(&_min_max_loc, Window::DimY); + NEScheduler::get().schedule(_min_max_loc.get(), Window::DimY); } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NENonLinearFilter.cpp b/src/runtime/NEON/functions/NENonLinearFilter.cpp index b7c72acb9a..4d8fd00cbd 100644 --- a/src/runtime/NEON/functions/NENonLinearFilter.cpp +++ b/src/runtime/NEON/functions/NENonLinearFilter.cpp @@ -23,14 +23,15 @@ */ #include "arm_compute/runtime/NEON/functions/NENonLinearFilter.h" -#include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h" #include "arm_compute/core/PixelValue.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/NEON/kernels/NENonLinearFilterKernel.h" #include "support/MemorySupport.h" #include -using namespace arm_compute; - +namespace arm_compute +{ void NENonLinearFilter::configure(ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, BorderMode border_mode, uint8_t constant_border_value) @@ -38,5 +39,9 @@ void NENonLinearFilter::configure(ITensor *input, ITensor *output, NonLinearFilt auto k = arm_compute::support::cpp14::make_unique(); k->configure(input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED); _kernel = std::move(k); - _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + + auto b = arm_compute::support::cpp14::make_unique(); + b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler = std::move(b); } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp 
diff --git a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
index 4d9edf7fc7..b8f5c251b7 100644
--- a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
+++ b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
@@ -23,25 +23,29 @@
  */
 #include "arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h"
 
-#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void NENonMaximaSuppression3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode)
 {
     auto k = arm_compute::support::cpp14::make_unique<NENonMaximaSuppression3x3Kernel>();
     k->configure(input, output, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
 
+    auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
     if(border_mode != BorderMode::UNDEFINED)
     {
-        _border_handler.configure(input, BorderSize(1), BorderMode::CONSTANT, static_cast<float>(0.f));
+        b->configure(input, BorderSize(1), BorderMode::CONSTANT, static_cast<float>(0.f));
     }
     else
     {
-        _border_handler.configure(input, BorderSize(1), BorderMode::UNDEFINED, static_cast<float>(0.f));
+        b->configure(input, BorderSize(1), BorderMode::UNDEFINED, static_cast<float>(0.f));
     }
+    _border_handler = std::move(b);
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index 10ee938335..dfc73b2a57 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -29,9 +29,13 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NENormalizationLayerKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NENormalizationLayer::~NENormalizationLayer() = default;
+
 NENormalizationLayer::NENormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _norm_kernel(), _multiply_f(), _input_squared()
 {
@@ -48,7 +52,8 @@ void NENormalizationLayer::configure(const ITensor *input, ITensor *output, cons
     _memory_group.manage(&_input_squared);
 
     // Configure kernels
-    _norm_kernel.configure(input, &_input_squared, output, norm_info);
+    _norm_kernel = arm_compute::support::cpp14::make_unique<NENormalizationLayerKernel>();
+    _norm_kernel->configure(input, &_input_squared, output, norm_info);
     _multiply_f.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
 
     // Allocate the tensor once the configure methods have been called
@@ -70,6 +75,6 @@ void NENormalizationLayer::run()
 {
     MemoryGroupResourceScope scope_mg(_memory_group);
     _multiply_f.run();
-    NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
+    NEScheduler::get().schedule(_norm_kernel.get(), Window::DimY);
 }
 }
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEOpticalFlow.cpp b/src/runtime/NEON/functions/NEOpticalFlow.cpp
index c9e07483e6..565346bfce 100644
--- a/src/runtime/NEON/functions/NEOpticalFlow.cpp
+++ b/src/runtime/NEON/functions/NEOpticalFlow.cpp
@@ -25,7 +25,6 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
@@ -33,8 +32,13 @@
 #include "arm_compute/runtime/Pyramid.h"
 #include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NELKTrackerKernel.h"
+#include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEOpticalFlow::~NEOpticalFlow() = default;
 
 NEOpticalFlow::NEOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
     : _memory_group(std::move(memory_manager)),
@@ -110,11 +114,12 @@ void NEOpticalFlow::configure(const Pyramid *old_pyramid, const Pyramid *new_pyr
         _func_scharr[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value);
 
         // Init Lucas-Kanade kernel
-        _kernel_tracker[i].configure(old_ith_input, new_ith_input, &_scharr_gx[i], &_scharr_gy[i],
-                                     old_points, new_points_estimates, new_points,
-                                     &_old_points_internal, &_new_points_internal,
-                                     termination, use_initial_estimate, epsilon, num_iterations, window_dimension,
-                                     i, _num_levels, pyr_scale);
+        _kernel_tracker[i] = arm_compute::support::cpp14::make_unique<NELKTrackerKernel>();
+        _kernel_tracker[i]->configure(old_ith_input, new_ith_input, &_scharr_gx[i], &_scharr_gy[i],
+                                      old_points, new_points_estimates, new_points,
+                                      &_old_points_internal, &_new_points_internal,
+                                      termination, use_initial_estimate, epsilon, num_iterations, window_dimension,
+                                      i, _num_levels, pyr_scale);
 
         _scharr_gx[i].allocator()->allocate();
         _scharr_gy[i].allocator()->allocate();
@@ -133,6 +138,7 @@ void NEOpticalFlow::run()
         _func_scharr[level - 1].run();
 
         // Run Lucas-Kanade kernel
-        NEScheduler::get().schedule(&_kernel_tracker[level - 1], Window::DimX);
+        NEScheduler::get().schedule(_kernel_tracker[level - 1].get(), Window::DimX);
     }
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEPReluLayer.cpp b/src/runtime/NEON/functions/NEPReluLayer.cpp
index f9393a4d92..00a1a4257a 100644
--- a/src/runtime/NEON/functions/NEPReluLayer.cpp
+++ b/src/runtime/NEON/functions/NEPReluLayer.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NEPReluLayer.h"
 
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h"
+#include "src/core/NEON/kernels/NEElementwiseOperationKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp
index 21c349ba95..92659f39a2 100644
--- a/src/runtime/NEON/functions/NEPadLayer.cpp
+++ b/src/runtime/NEON/functions/NEPadLayer.cpp
@@ -27,6 +27,10 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/NEON/kernels/NECopyKernel.h"
+#include "src/core/NEON/kernels/NEPadLayerKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
@@ -46,6 +50,8 @@ uint32_t last_padding_dimension(const PaddingList &padding)
 }
 } // namespace
 
+NEPadLayer::~NEPadLayer() = default;
+
 NEPadLayer::NEPadLayer()
     : _copy_kernel(), _pad_kernel(), _mode(), _padding(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results()
 {
@@ -53,7 +59,8 @@ NEPadLayer::NEPadLayer()
 
 void NEPadLayer::configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value)
 {
-    _pad_kernel.configure(input, output, padding, constant_value, PaddingMode::CONSTANT);
+    _pad_kernel = arm_compute::support::cpp14::make_unique<NEPadLayerKernel>();
+    _pad_kernel->configure(input, output, padding, constant_value, PaddingMode::CONSTANT);
 }
 
 void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *output)
@@ -194,7 +201,8 @@ void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &p
     else
     {
         // Copy the input to the whole output if no padding is applied
-        _copy_kernel.configure(input, output);
+        _copy_kernel = arm_compute::support::cpp14::make_unique<NECopyKernel>();
+        _copy_kernel->configure(input, output);
     }
 }
 
@@ -250,7 +258,7 @@ void NEPadLayer::run()
         {
             case PaddingMode::CONSTANT:
             {
-                NEScheduler::get().schedule(&_pad_kernel, Window::DimZ);
+                NEScheduler::get().schedule(_pad_kernel.get(), Window::DimZ);
                 break;
             }
             case PaddingMode::REFLECT:
@@ -279,7 +287,7 @@ void NEPadLayer::run()
     }
     else
     {
-        NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+        NEScheduler::get().schedule(_copy_kernel.get(), Window::DimY);
     }
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEPermute.cpp b/src/runtime/NEON/functions/NEPermute.cpp
index 698add86b9..d2a115fdc8 100644
--- a/src/runtime/NEON/functions/NEPermute.cpp
+++ b/src/runtime/NEON/functions/NEPermute.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEPermute.h"
 
-#include "arm_compute/core/NEON/kernels/NEPermuteKernel.h"
+#include "src/core/NEON/kernels/NEPermuteKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEPhase.cpp b/src/runtime/NEON/functions/NEPhase.cpp
index 85779611cd..3b6182a269 100644
--- a/src/runtime/NEON/functions/NEPhase.cpp
+++ b/src/runtime/NEON/functions/NEPhase.cpp
@@ -23,13 +23,13 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEPhase.h"
 
-#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
+#include "src/core/NEON/kernels/NEMagnitudePhaseKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void NEPhase::configure(const ITensor *input1, const ITensor *input2, ITensor *output, PhaseType phase_type)
 {
     if(phase_type == PhaseType::UNSIGNED)
@@ -45,3 +45,4 @@ void NEPhase::configure(const ITensor *input1, const ITensor *input2, ITensor *o
         _kernel = std::move(k);
     }
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index 4208878b75..f7f4437554 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -24,7 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
 
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
+#include "src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index 81bd00d44d..12ac8d6d7d 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -25,8 +25,13 @@
 
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEPoolingLayerKernel.h"
+#include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+NEPoolingLayer::~NEPoolingLayer() = default;
 
 NEPoolingLayer::NEPoolingLayer()
     : _pooling_layer_kernel(), _border_handler(), _is_global_pooling_layer(false), _data_layout(DataLayout::NCHW)
@@ -42,7 +47,8 @@ void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLay
     _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : pool_info.data_layout;
 
     // Configure pooling kernel
-    _pooling_layer_kernel.configure(input, output, pool_info, indices);
+    _pooling_layer_kernel = arm_compute::support::cpp14::make_unique<NEPoolingLayerKernel>();
+    _pooling_layer_kernel->configure(input, output, pool_info, indices);
 
     switch(_data_layout)
     {
@@ -55,7 +61,8 @@ void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLay
             {
                 zero_value = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
             }
-            _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, zero_value);
+            _border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+            _border_handler->configure(input, _pooling_layer_kernel->border_size(), border_mode, zero_value);
             break;
         }
         case DataLayout::NHWC:
@@ -76,16 +83,18 @@ void NEPoolingLayer::run()
     {
         case DataLayout::NCHW:
            // Fill border
-            NEScheduler::get().schedule(&_border_handler, Window::DimY);
+            NEScheduler::get().schedule(_border_handler.get(), Window::DimY);
 
             // Run pooling layer
-            NEScheduler::get().schedule(&_pooling_layer_kernel, _is_global_pooling_layer ? Window::DimZ : Window::DimY);
+            NEScheduler::get().schedule(_pooling_layer_kernel.get(), _is_global_pooling_layer ? Window::DimZ : Window::DimY);
             break;
         case DataLayout::NHWC:
             // Run pooling layer
-            NEScheduler::get().schedule(&_pooling_layer_kernel, Window::DimX);
+            NEScheduler::get().schedule(_pooling_layer_kernel.get(), Window::DimX);
             break;
         default:
             ARM_COMPUTE_ERROR("Data layout not supported");
     }
 }
+
+} // namespace arm_compute
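Note: NEPoolingLayer::run() above also shows how these functions pick the window dimension that NEScheduler splits across worker threads: DimY (or DimZ for global pooling) in NCHW, DimX in NHWC. A standalone sketch of that dispatch decision (illustrative, not library code):

    #include <cstddef>

    enum class DataLayout { NCHW, NHWC };
    enum Dim : std::size_t { DimX = 0, DimY = 1, DimZ = 2 };

    // Mirrors the switch in NEPoolingLayer::run(): choose the dimension the
    // scheduler should parallelise over.
    inline std::size_t pooling_split_dimension(DataLayout layout, bool is_global_pooling)
    {
        if(layout == DataLayout::NHWC)
        {
            return DimX;
        }
        return is_global_pooling ? DimZ : DimY;
    }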
diff --git a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
index fda130bf69..bfa06da04e 100644
--- a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
+++ b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,9 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h"
+
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
index cb45b647c7..1013730235 100644
--- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
@@ -30,6 +30,16 @@
 #include "arm_compute/core/utils/misc/InfoHelpers.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
+#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
@@ -46,6 +56,31 @@ Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm
 }
 } // namespace
 
+Status NEQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias)
+{
+    // Output quantization scale will be different, but ignored here
+    // since it will be configured at configure() stage.
+    const TensorInfo out
+    {
+        in
+    };
+    return NEQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias);
+}
+
+void NEQLSTMLayer::configure_layer_norm(NEQLSTMLayer::LayerNormGate g, const ITensor *in)
+{
+    ARM_COMPUTE_ERROR_ON(!_has_layer_norm);
+
+    Tensor &out = get_layer_norm_output(g);
+    _memory_group.manage(&out);
+    out.allocator()->init(*(in->info()));
+
+    get_layer_norm(g) = arm_compute::support::cpp14::make_unique<NEQLSTMLayerNormalizationKernel>();
+    get_layer_norm(g)->configure(in, &out, get_layer_norm_weight(g), get_layer_norm_bias(g));
+}
+
+NEQLSTMLayer::TensorCopyKernel::~TensorCopyKernel() = default;
+
 Status NEQLSTMLayer::TensorCopyKernel::validate(const ITensorInfo &src, const ITensorInfo &dst)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(src.tensor_shape().num_dimensions() > max_dimension_supported);
@@ -76,7 +111,21 @@ void NEQLSTMLayer::TensorCopyKernel::run()
                               input_iter, output_iter);
 }
 
+NEQLSTMLayer::~NEQLSTMLayer() = default;
+
 NEQLSTMLayer::NEQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(), _transpose_input_to_forget_weights(), _transpose_input_to_cell_weights(), _transpose_input_to_output_weights(), _transpose_input_to_input_weights(),
+      _transpose_recurrent_to_forget_weights(), _transpose_recurrent_to_cell_weights(), _transpose_recurrent_to_output_weights(), _transpose_recurrent_to_input_weights(), _transpose_projection_weights(),
+      _input_to_input_reduction(), _recurrent_to_input_reduction(), _input_to_forget_reduction(), _recurrent_to_forget_reduction(), _input_to_cell_reduction(), _recurrent_to_cell_reduction(),
+      _input_to_output_reduction(), _recurrent_to_output_reduction(), _projection_reduction(), _projection_bias_add(), _mm_input_to_forget(), _mm_recurrent_to_forget(), _pixelwise_mul_cell_to_forget(),
+      _input_to_forget_outstage(), _recurrent_to_forget_outstage(), _cell_to_forget_outstage(), _accumulate_input_recurrent_forget(), _accumulate_cell_forget(), _forget_gate_sigmoid(), _mm_input_to_cell(),
+      _input_to_cell_outstage(), _mm_recurrent_to_cell(), _recurrent_to_cell_outstage(), _accumulate_input_recurrent_modulation(), _cell_gate_tanh(), _input_gate_sub(), _mm_input_to_input(),
+      _input_to_input_outstage(), _mm_recurrent_to_input(), _recurrent_to_input_outstage(), _accumulate_input_recurrent_input(), _pixelwise_mul_cell_to_input(), _cell_to_input_outstage(),
+      _accumulate_cell_input(), _input_gate_sigmoid(), _pixelwise_mul_forget_cell(), _pixelwise_mul_input_cell(), _add_forget_cell(), _cell_clip(), _mm_input_to_output(), _input_to_output_outstage(),
+      _mm_recurrent_to_output(), _recurrent_to_output_outstage(), _accumulate_input_recurrent_output(), _pixelwise_mul_cell_to_output(), _cell_to_output_outstage(), _accumulate_cell_to_output(),
+      _output_gate_sigmoid(), _hidden_tanh(), _pixelwise_mul_hidden(), _hidden_outstage(), _mm_projection(), _projection_outstage(), _accumulate_projection(), _projection_clip(), _projection_bias_copy(),
+      _projection_output_to_accumulate_copy(), _projection_accumulate_to_output_copy(), _hidden_to_output_copy(), _layer_norms(), _copy_output(), _layer_norm_weights(), _layer_norm_bias(),
+      _layer_norm_output()
 {
     _memory_group = MemoryGroup(std::move(memory_manager));
 }
@@ -105,7 +154,7 @@ void NEQLSTMLayer::configure(const ITensor *input,
                              const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
                              const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
                              const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
-                             const ITensor *cell_state_in, const ITensor *output_state_in,
+                             const ITensor *cell_state_in, ITensor *output_state_in,
                              ITensor *cell_state_out, ITensor *output_state_out, ITensor *output,
                              const LSTMParams<ITensor> &lstm_params)
 {
@@ -177,18 +226,29 @@ void NEQLSTMLayer::configure(const ITensor *input,
         _input_to_input_weights     = lstm_params.input_to_input_weights();
         _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights();
 
-        _input_to_input_reduction.configure(_input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-        _recurrent_to_input_reduction.configure(_recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+        _input_to_input_reduction     = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixAReductionKernel>();
+        _recurrent_to_input_reduction = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixAReductionKernel>();
+        _input_to_input_reduction->configure(_input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+        _recurrent_to_input_reduction->configure(_recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
     }
-    _input_to_forget_reduction.configure(input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-    _recurrent_to_forget_reduction.configure(recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
-    _input_to_cell_reduction.configure(input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-    _recurrent_to_cell_reduction.configure(recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
-    _input_to_output_reduction.configure(input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-    _recurrent_to_output_reduction.configure(recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+
+    _input_to_forget_reduction     = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixAReductionKernel>();
+    _recurrent_to_forget_reduction = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixAReductionKernel>();
+    _input_to_cell_reduction       = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixAReductionKernel>();
+    _recurrent_to_cell_reduction   = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixAReductionKernel>();
+    _input_to_output_reduction     = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixAReductionKernel>();
+    _recurrent_to_output_reduction = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixAReductionKernel>();
+
+    _input_to_forget_reduction->configure(input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+    _recurrent_to_forget_reduction->configure(recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+    _input_to_cell_reduction->configure(input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+    _recurrent_to_cell_reduction->configure(recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+    _input_to_output_reduction->configure(input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+    _recurrent_to_output_reduction->configure(recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
     if(_has_projection)
     {
-        _projection_reduction.configure(_projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
+        _projection_reduction = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixAReductionKernel>();
+        _projection_reduction->configure(_projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
         if(_projection_bias != nullptr)
         {
             _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE);
@@ -477,9 +537,9 @@ void NEQLSTMLayer::configure(const ITensor *input,
         if(_projection_tensor_copy_required)
         {
             _hidden_gate.allocator()->allocate();
-            _projection_accumulate_res.allocator()->init(*output_state_out->info());
+            _projection_accumulate_res.allocator()->init(*output_state_in->info());
             _projection_accumulate_res.info()->set_tensor_shape(_projection_outstage_res.info()->tensor_shape());
-            _projection_output_to_accumulate_copy.configure(*output_state_out, _projection_accumulate_res);
+            _projection_output_to_accumulate_copy.configure(*output_state_in, _projection_accumulate_res);
             accumulate_destination = &_projection_accumulate_res;
         }
 
@@ -804,7 +864,8 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0);
         const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
         ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
-        gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();
+        gemmlowp_info.gemmlowp_offset  = lstm_params.hidden_state_zero();
+        gemmlowp_info.output_data_type = hidden_out_info.data_type();
         ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info));
 
         const bool projection_tensor_copy_required = num_units != output_size;
@@ -834,7 +895,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
 
         if(projection_tensor_copy_required)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(*output_state_out, projection_outstage_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info));
         }
 
         ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE));
@@ -876,7 +937,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out);
     }
 
-    ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(output_state_out, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(NECopy::validate(output_state_out, output));
 
     return Status{};
 }
@@ -904,7 +965,7 @@ void NEQLSTMLayer::run()
 
     if(_has_layer_norm)
     {
-        NEScheduler::get().schedule(&get_layer_norm(LayerNormGate::Forget), Window::DimY);
+        NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Forget).get(), Window::DimY);
     }
 
     _forget_gate_sigmoid.run();
@@ -919,7 +980,7 @@ void NEQLSTMLayer::run()
 
     if(_has_layer_norm)
     {
-        NEScheduler::get().schedule(&get_layer_norm(LayerNormGate::Cell), Window::DimY);
+        NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Cell).get(), Window::DimY);
     }
 
     _cell_gate_tanh.run();
@@ -946,7 +1007,7 @@ void NEQLSTMLayer::run()
 
         if(_has_layer_norm)
        {
-            NEScheduler::get().schedule(&get_layer_norm(LayerNormGate::Input), Window::DimY);
+            NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Input).get(), Window::DimY);
        }
 
         _input_gate_sigmoid.run();
@@ -977,7 +1038,7 @@ void NEQLSTMLayer::run()
 
     if(_has_layer_norm)
     {
-        NEScheduler::get().schedule(&get_layer_norm(LayerNormGate::Output), Window::DimY);
+        NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Output).get(), Window::DimY);
     }
 
     _output_gate_sigmoid.run();
@@ -1019,7 +1080,7 @@ void NEQLSTMLayer::run()
     }
 
     // Copy output_state_out to output
-    NEScheduler::get().schedule(&_copy_output, Window::DimY);
+    _copy_output.run();
 }
 
 void NEQLSTMLayer::prepare()
@@ -1049,8 +1110,8 @@ void NEQLSTMLayer::prepare()
         {
             _input_to_input_eff_bias.allocator()->allocate();
             _recurrent_to_input_eff_bias.allocator()->allocate();
-            NEScheduler::get().schedule(&_input_to_input_reduction, Window::DimY);
-            NEScheduler::get().schedule(&_recurrent_to_input_reduction, Window::DimY);
+            NEScheduler::get().schedule(_input_to_input_reduction.get(), Window::DimY);
+            NEScheduler::get().schedule(_recurrent_to_input_reduction.get(), Window::DimY);
 
             _input_to_input_weights_transposed.allocator()->allocate();
             _recurrent_to_input_weights_transposed.allocator()->allocate();
@@ -1065,17 +1126,17 @@ void NEQLSTMLayer::prepare()
         _recurrent_to_cell_eff_bias.allocator()->allocate();
         _input_to_output_eff_bias.allocator()->allocate();
         _recurrent_to_output_eff_bias.allocator()->allocate();
-        NEScheduler::get().schedule(&_input_to_forget_reduction, Window::DimY);
-        NEScheduler::get().schedule(&_recurrent_to_forget_reduction, Window::DimY);
-        NEScheduler::get().schedule(&_input_to_cell_reduction, Window::DimY);
-        NEScheduler::get().schedule(&_recurrent_to_cell_reduction, Window::DimY);
-        NEScheduler::get().schedule(&_input_to_output_reduction, Window::DimY);
-        NEScheduler::get().schedule(&_recurrent_to_output_reduction, Window::DimY);
+        NEScheduler::get().schedule(_input_to_forget_reduction.get(), Window::DimY);
+        NEScheduler::get().schedule(_recurrent_to_forget_reduction.get(), Window::DimY);
+        NEScheduler::get().schedule(_input_to_cell_reduction.get(), Window::DimY);
+        NEScheduler::get().schedule(_recurrent_to_cell_reduction.get(), Window::DimY);
+        NEScheduler::get().schedule(_input_to_output_reduction.get(), Window::DimY);
+        NEScheduler::get().schedule(_recurrent_to_output_reduction.get(), Window::DimY);
 
         if(_has_projection)
         {
             _projection_eff_bias.allocator()->allocate();
-            NEScheduler::get().schedule(&_projection_reduction, Window::DimY);
+            NEScheduler::get().schedule(_projection_reduction.get(), Window::DimY);
             if(_projection_bias != nullptr)
             {
                 _projection_bias_add.run();
@@ -1104,5 +1165,4 @@ void NEQLSTMLayer::prepare()
         _is_prepared = true;
     }
 }
-
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
index c042705a72..a20ffb8858 100644
--- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/kernels/NEQuantizationLayerKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
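Note: NEQLSTMLayer::TensorCopyKernel above is a small helper rather than a scheduled kernel; it walks the destination window and copies one row of elements per iteration, which lets the projection output accumulate into a tensor with a different shape/stride. Conceptually it reduces to a strided row copy like the standalone sketch below (not the library API):

    #include <cstdint>
    #include <cstring>

    // Copy num_rows rows of row_bytes bytes each, where source and destination
    // rows may have different strides (e.g. different paddings).
    void copy_rows(const std::uint8_t *src, std::size_t src_stride,
                   std::uint8_t *dst, std::size_t dst_stride,
                   std::size_t row_bytes, std::size_t num_rows)
    {
        for(std::size_t r = 0; r < num_rows; ++r)
        {
            std::memcpy(dst + r * dst_stride, src + r * src_stride, row_bytes);
        }
    }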
diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp
index b7415bd44c..a8e10482a7 100644
--- a/src/runtime/NEON/functions/NERNNLayer.cpp
+++ b/src/runtime/NEON/functions/NERNNLayer.cpp
@@ -30,9 +30,24 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h"
+#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
+#include "src/core/NEON/kernels/NECopyKernel.h"
+#include "src/core/NEON/kernels/NEFlattenLayerKernel.h"
+#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
+#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NERNNLayer::~NERNNLayer() = default;
+
 NERNNLayer::NERNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_f(), _activation(), _fully_connected(memory_manager), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(),
       _is_prepared(false)
@@ -99,7 +114,8 @@ void NERNNLayer::configure(const ITensor *input, const ITensor *weights, const I
     _activation.configure(&_add_output, hidden_state, info);
     _add_output.allocator()->allocate();
 
-    _copy_kernel.configure(hidden_state, output);
+    _copy_kernel = arm_compute::support::cpp14::make_unique<NECopyKernel>();
+    _copy_kernel->configure(hidden_state, output);
 }
 
 void NERNNLayer::run()
@@ -116,7 +132,7 @@ void NERNNLayer::run()
     _activation.run();
 
     // copy hidden out to output
-    NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+    NEScheduler::get().schedule(_copy_kernel.get(), Window::DimY);
 }
 
 void NERNNLayer::prepare()
diff --git a/src/runtime/NEON/functions/NEROIAlignLayer.cpp b/src/runtime/NEON/functions/NEROIAlignLayer.cpp
index a3b116a55e..a046140551 100644
--- a/src/runtime/NEON/functions/NEROIAlignLayer.cpp
+++ b/src/runtime/NEON/functions/NEROIAlignLayer.cpp
@@ -23,7 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEROIAlignLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEROIAlignLayerKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
index 4aecadbc09..8bcf152881 100644
--- a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,11 +24,14 @@
 #include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h"
 
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NEROIPoolingLayer::~NEROIPoolingLayer() = default;
+
 NEROIPoolingLayer::NEROIPoolingLayer()
     : _roi_kernel()
 {
@@ -36,11 +39,12 @@ NEROIPoolingLayer::NEROIPoolingLayer()
 
 void NEROIPoolingLayer::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
 {
-    _roi_kernel.configure(input, rois, output, pool_info);
+    _roi_kernel = arm_compute::support::cpp14::make_unique<NEROIPoolingLayerKernel>();
+    _roi_kernel->configure(input, rois, output, pool_info);
 }
 
 void NEROIPoolingLayer::run()
 {
-    NEScheduler::get().schedule(&_roi_kernel, Window::DimX);
+    NEScheduler::get().schedule(_roi_kernel.get(), Window::DimX);
 }
 } // namespace arm_compute
\ No newline at end of file
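Note: the NERange diff below is the smallest complete example of the validate/configure/run contract these functions share. Hypothetical usage, with tensor setup elided (a sketch, not a full program):

    // Tensor output; // a 1-D tensor initialised elsewhere
    //
    // ARM_COMPUTE_ERROR_THROW_ON(NERange::validate(output.info(), 0.f, 100.f, 1.f));
    // NERange range;
    // range.configure(&output, 0.f, 100.f, 1.f); // kernel object is created here
    // range.run();                               // dispatched through NEScheduler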
diff --git a/src/runtime/NEON/functions/NERange.cpp b/src/runtime/NEON/functions/NERange.cpp
index 138b458fab..ba166b2d58 100644
--- a/src/runtime/NEON/functions/NERange.cpp
+++ b/src/runtime/NEON/functions/NERange.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,9 +24,13 @@
 #include "arm_compute/runtime/NEON/functions/NERange.h"
 
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NERangeKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NERange::~NERange() = default;
+
 NERange::NERange()
     : _kernel()
 {
@@ -34,7 +38,8 @@ NERange::NERange()
 
 void NERange::configure(ITensor *output, const float start, const float end, const float step)
 {
-    _kernel.configure(output, start, end, step);
+    _kernel = arm_compute::support::cpp14::make_unique<NERangeKernel>();
+    _kernel->configure(output, start, end, step);
 }
 
 Status NERange::validate(const ITensorInfo *output, const float start, const float end, const float step)
@@ -44,6 +49,6 @@ Status NERange::validate(const ITensorInfo *output, const float start, const flo
 
 void NERange::run()
 {
-    NEScheduler::get().schedule(&_kernel, Window::DimX);
+    NEScheduler::get().schedule(_kernel.get(), Window::DimX);
 }
 } // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index 079c7c64bd..b50a925f44 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -23,23 +23,18 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
 
-#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 namespace arm_compute
 {
 namespace
 {
-} // namespace
-
-NEReduceMean::NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims()
-{
-}
-
 Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
 {
     ARM_COMPUTE_UNUSED(keep_dims);
@@ -89,10 +84,26 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
         }
         const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+        const bool requant = is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info();
+        if(requant)
+        {
+            TensorInfo input_no_quant(input->clone()->set_data_type(DataType::F32));
+            NEDequantizationLayer::validate(input, &input_no_quant);
+            TensorInfo output_no_quant(output->clone()->set_data_type(DataType::F32));
+            NEQuantizationLayer::validate(&output_no_quant, output);
+        }
     }
     return Status{};
 }
+} // namespace
+
+NEReduceMean::~NEReduceMean() = default;
+
+NEReduceMean::NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _dequant(), _requant(), _reduction_ops(), _keep_dims(), _do_requant(), _input_no_quant(),
+      _output_no_quant()
+{
+}
 
 Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
 {
@@ -104,33 +115,49 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
     // Perform validate step
     ARM_COMPUTE_ERROR_THROW_ON(NEReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info()));
     // Output auto inizialitation if not yet initialized
-    const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input, reduction_axis, keep_dims);
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims);
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
+    _do_requant    = is_data_type_quantized(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info();
     _reduction_ops = reduction_axis.num_dimensions();
     _reduction_kernels.resize(_reduction_ops);
     _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
     _keep_dims = keep_dims;
 
+    ITensor *tmp_input  = input;
+    ITensor *tmp_output = output;
+    if(_do_requant)
+    {
+        _memory_group.manage(&_input_no_quant);
+        _memory_group.manage(&_output_no_quant);
+        TensorInfo output_no_quant_info = input->info()->clone()->set_tensor_shape(output_shape);
+        output_no_quant_info.set_data_type(DataType::F32);
+        auto_init_if_empty(*_output_no_quant.info(), output_no_quant_info);
+        auto_init_if_empty(*_input_no_quant.info(), input->info()->clone()->set_data_type(DataType::F32));
+        _dequant.configure(input, &_input_no_quant);
+        tmp_input  = &_input_no_quant;
+        tmp_output = &_output_no_quant;
+    }
+
     Coordinates axis_local = reduction_axis;
-    const int   input_dims = input->info()->num_dimensions();
+    const int   input_dims = tmp_input->info()->num_dimensions();
 
     convert_negative_axis(axis_local, input_dims);
 
     // Perform reduction for every axis
     for(int i = 0; i < _reduction_ops; ++i)
     {
-        TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
+        TensorShape out_shape = i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
         out_shape.set(axis_local[i], 1);
-        auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);
+        auto in = (i == 0) ? tmp_input : (&_reduced_outs[i - 1]);
 
         if(i == _reduction_ops - 1 && keep_dims)
         {
-            _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM);
+            _reduction_kernels[i].configure(in, tmp_output, axis_local[i], ReductionOperation::MEAN_SUM);
        }
         else
         {
-            _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()));
+            _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(), tmp_input->info()->data_type(), tmp_input->info()->quantization_info()));
             _memory_group.manage(&_reduced_outs[i]);
             _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM);
         }
@@ -145,7 +172,7 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
     // Configure reshape layer if we want to drop the dimensions
     if(!keep_dims)
     {
-        TensorShape out_shape = input->info()->tensor_shape();
+        TensorShape out_shape = tmp_input->info()->tensor_shape();
         // We have to sort the reduction axis vectors in order for remove_dimension
         // to work properly
         std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
@@ -153,22 +180,35 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
        {
             out_shape.remove_dimension(axis_local[i] - i);
        }
-        auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
-        _reshape.configure(&_reduced_outs[_reduction_ops - 1], output);
+        auto_init_if_empty(*tmp_output->info(), tmp_input->info()->clone()->set_tensor_shape(out_shape));
+        _reshape.configure(&_reduced_outs[_reduction_ops - 1], tmp_output);
+    }
+    if(_do_requant)
+    {
+        _requant.configure(&_output_no_quant, output);
+        _input_no_quant.allocator()->allocate();
+        _output_no_quant.allocator()->allocate();
     }
 }
 
 void NEReduceMean::run()
 {
     MemoryGroupResourceScope scope_mg(_memory_group);
+    if(_do_requant)
+    {
+        _dequant.run();
+    }
     for(auto &kernel : _reduction_kernels)
     {
         kernel.run();
     }
-
     if(!_keep_dims)
     {
         _reshape.run();
     }
+    if(_do_requant)
+    {
+        _requant.run();
+    }
 }
 } // namespace arm_compute
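Note: the NEReduceMean change above adds a requantization path. When the input is quantized and its quantization info differs from the output's, the mean can no longer be computed directly in the quantized domain, so the function routes through F32. The configure-time decision in schematic pseudocode (a sketch of the flow, not library code):

    // do_requant = is_quantized(input) && qinfo(input) != qinfo(output)
    //
    // if do_requant:
    //     dequantize input  -> input_f32          (NEDequantizationLayer)
    //     chain of MEAN_SUM reductions on f32     (one per reduction axis)
    //     optional reshape if keep_dims == false
    //     quantize result   -> output             (NEQuantizationLayer)
    // else:
    //     run the same reduction chain directly on the input type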
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index 853d0ed0ad..463b65ec28 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -26,6 +26,9 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
@@ -53,8 +56,10 @@ size_t reduction_window_split_dimension(unsigned int axis)
 }
 } // namespace
 
+NEReductionOperation::~NEReductionOperation() = default;
+
 NEReductionOperation::NEReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _reduction_kernel(), _fill_border_kernel(), _reshape(), _output_internal(), _window_split(0), _reduction_axis(), _is_reshape_required(false)
+    : _memory_group(memory_manager), _reduction_kernel(), _reshape(), _output_internal(), _window_split(0), _reduction_axis(), _is_reshape_required(false)
 {
 }
 
@@ -124,51 +129,11 @@ void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned i
     ARM_COMPUTE_ERROR_THROW_ON(NEReductionOperation::validate(input->info(), output->info(), axis, op, keep_dims));
 
     // Configure reduction kernel
-    _reduction_kernel.configure(input, output_internal, axis, op);
+    _reduction_kernel = arm_compute::support::cpp14::make_unique<NEReductionOperationKernel>();
+    _reduction_kernel->configure(input, output_internal, axis, op);
     _window_split   = reduction_window_split_dimension(axis);
     _reduction_axis = axis;
 
-    if(axis == 0)
-    {
-        // Configure fill border kernel
-        const BorderSize fill_border_size = _reduction_kernel.border_size();
-        PixelValue pixelValue;
-        switch(op)
-        {
-            case ReductionOperation::PROD:
-            {
-                pixelValue = PixelValue(1, input->info()->data_type(), input->info()->quantization_info());
-                break;
-            }
-            case ReductionOperation::MIN:
-            {
-                pixelValue = std::get<1>(get_min_max(input->info()->data_type()));
-                break;
-            }
-            case ReductionOperation::MAX:
-            {
-                pixelValue = std::get<0>(get_min_max(input->info()->data_type()));
-                break;
-            }
-            case ReductionOperation::ARG_IDX_MAX:
-            case ReductionOperation::ARG_IDX_MIN:
-            {
-                pixelValue = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
-                break;
-            }
-            case ReductionOperation::MEAN_SUM:
-            case ReductionOperation::SUM_SQUARE:
-            case ReductionOperation::SUM:
-            {
-                pixelValue = PixelValue(static_cast<float>(0));
-                break;
-            }
-            default:
-                ARM_COMPUTE_ERROR("Reduction Operation unsupported");
-        }
-        _fill_border_kernel.configure(input, fill_border_size, (is_arg_min_max ? BorderMode::REPLICATE : BorderMode::CONSTANT), pixelValue);
-    }
-
     if(_is_reshape_required)
     {
         _reshape.configure(output_internal, output);
@@ -178,11 +143,8 @@ void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned i
 
 void NEReductionOperation::run()
 {
-    if(_reduction_axis == 0)
-    {
-        NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
-    }
-    NEScheduler::get().schedule(&_reduction_kernel, _window_split);
+    MemoryGroupResourceScope scope_mg(_memory_group);
+    NEScheduler::get().schedule(_reduction_kernel.get(), _window_split);
     if(_is_reshape_required)
     {
         _reshape.run();
diff --git a/src/runtime/NEON/functions/NERemap.cpp b/src/runtime/NEON/functions/NERemap.cpp
index d4e7f838c6..9276d49cf5 100644
--- a/src/runtime/NEON/functions/NERemap.cpp
+++ b/src/runtime/NEON/functions/NERemap.cpp
@@ -25,17 +25,18 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NERemapKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NERemapKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void NERemap::configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
@@ -45,9 +46,11 @@ void NERemap::configure(ITensor *input, const ITensor *map_x, const ITensor *map
     ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported");
 
     auto k = arm_compute::support::cpp14::make_unique<NERemapKernel>();
     k->configure(input, map_x, map_y, output, policy);
     _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+
+    auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler = std::move(b);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEReorgLayer.cpp b/src/runtime/NEON/functions/NEReorgLayer.cpp
index dfe002a503..77ec7fbfb1 100644
--- a/src/runtime/NEON/functions/NEReorgLayer.cpp
+++ b/src/runtime/NEON/functions/NEReorgLayer.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEReorgLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEReorgLayerKernel.h"
+#include "src/core/NEON/kernels/NEReorgLayerKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEReshapeLayer.cpp b/src/runtime/NEON/functions/NEReshapeLayer.cpp
index c1c88c1c7a..915d5d408f 100644
--- a/src/runtime/NEON/functions/NEReshapeLayer.cpp
+++ b/src/runtime/NEON/functions/NEReshapeLayer.cpp
@@ -23,10 +23,10 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/Types.h"
+#include "src/core/NEON/kernels/NEReshapeLayerKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
@@ -35,6 +35,8 @@ namespace arm_compute
 {
 namespace experimental
 {
+NEReshape::~NEReshape() = default;
+
 void NEReshape::configure(const ITensorInfo *input, ITensorInfo *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEReshapeLayerKernel>();
diff --git a/src/runtime/NEON/functions/NEReverse.cpp b/src/runtime/NEON/functions/NEReverse.cpp
index c60c84e897..3ed0688386 100644
--- a/src/runtime/NEON/functions/NEReverse.cpp
+++ b/src/runtime/NEON/functions/NEReverse.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEReverse.h"
 
-#include "arm_compute/core/NEON/kernels/NEReverseKernel.h"
+#include "src/core/NEON/kernels/NEReverseKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index 424049f24a..0290fe5a01 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -30,12 +30,15 @@
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Rounding.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/NEON/kernels/NEScaleKernel.h"
 #include "src/core/utils/ScaleUtils.h"
+#include "support/MemorySupport.h"
+#include "support/Rounding.h"
+
 #include <cmath>
 #include <cstddef>
 #include <utility>
@@ -44,7 +47,7 @@
 namespace arm_compute
 {
 namespace
 {
-void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_element_size, SamplingPolicy sampling_policy, bool align_corners)
+void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners)
 {
     ARM_COMPUTE_ERROR_ON(nullptr == offsets);
     ARM_COMPUTE_UNUSED(sampling_policy);
@@ -72,7 +75,7 @@ void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float
             const int in_xi = std::floor(in_x);
             const int in_yi = std::floor(in_y);
 
-            *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * static_cast<int32_t>(input_element_size);
+            *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
             *reinterpret_cast<float *>(dx_it.ptr())        = in_x - in_xi;
             *reinterpret_cast<float *>(dy_it.ptr())        = in_y - in_yi;
         },
@@ -85,23 +88,17 @@ void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float
         execute_window_loop(win, [&](const Coordinates & id)
         {
-            const float float_in_xi = (id.x() + sampling_offset) * wr;
-            const auto  in_xi       = static_cast<uint32_t>(align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) : std::floor(float_in_xi));
-
-            *reinterpret_cast<uint32_t *>(offsets_it.ptr()) = in_xi * input_element_size;
+            const float float_in_xi = (id.x() + sampling_offset) * wr;
+            const auto  in_xi       = static_cast<uint32_t>(align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) : std::floor(float_in_xi));
+            *reinterpret_cast<uint32_t *>(offsets_it.ptr()) = in_xi;
         },
         offsets_it);
     }
 }
 } // namespace
 
-NEScale::NEScale() // NOLINT
-    : _offsets(),
-      _dx(),
-      _dy(),
-      _scale_kernel(),
-      _border_handler(),
-      _use_padding(true)
+NEScale::NEScale()
+    : _offsets(), _dx(), _dy()
 {
 }
 
@@ -110,7 +107,6 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo &
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(NEScale::validate(input->info(), output->info(), info));
 
-    _use_padding                     = info.use_padding;
     const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy);
 
     // Get data layout and width/height indices
@@ -119,18 +115,17 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo &
     const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
     // Get the tensor shape
-    const TensorShape shape(output->info()->dimension(idx_width), output->info()->dimension(idx_height));
+    TensorShape shape(output->info()->dimension(idx_width));
+    shape.set(1, output->info()->dimension(idx_height), false);
 
     // Compute the ratio between source width/height and destination width/height
     const auto wr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), is_align_corners_used);
     const auto hr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_height), output->info()->dimension(idx_height), is_align_corners_used);
 
-    // Get the element size of the input image
-    const size_t input_element_size = input->info()->element_size();
-
     // Area interpolation behaves as Nearest Neighbour in case of up-sampling
     const auto policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy;
 
+    auto scale_kernel = arm_compute::support::cpp14::make_unique<NEScaleKernel>();
     switch(policy_to_use)
     {
         case InterpolationPolicy::NEAREST_NEIGHBOR:
@@ -138,13 +133,13 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo &
             TensorInfo tensor_info_offsets(shape, Format::S32);
             _offsets.allocator()->init(tensor_info_offsets);
 
-            _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, info);
+            scale_kernel->configure(input, nullptr, nullptr, &_offsets, output, info);
 
             // Allocate once the configure methods have been called
             _offsets.allocator()->allocate();
 
             // Pre-compute offsets for nearest interpolation
-            precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, input_element_size, info.sampling_policy, is_align_corners_used);
+            precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, info.sampling_policy, is_align_corners_used);
             break;
         }
         case InterpolationPolicy::BILINEAR:
@@ -156,7 +151,7 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo &
             _dx.allocator()->init(tensor_info_dxdy);
             _dy.allocator()->init(tensor_info_dxdy);
 
-            _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, info);
+            scale_kernel->configure(input, &_dx, &_dy, &_offsets, output, info);
 
             // Allocate once the configure methods have been called
             _offsets.allocator()->allocate();
@@ -164,27 +159,18 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo &
             _dy.allocator()->allocate();
 
             // Pre-compute dx, dy and offsets for bilinear interpolation
-            precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, input_element_size, info.sampling_policy, is_align_corners_used);
+            precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, info.sampling_policy, is_align_corners_used);
             break;
         }
         case InterpolationPolicy::AREA:
         {
-            _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, info);
+            scale_kernel->configure(input, nullptr, nullptr, nullptr, output, info);
             break;
         }
         default:
             ARM_COMPUTE_ERROR("Unsupported interpolation mode");
     }
-    if(info.use_padding)
-    {
-        _border_handler.configure(input, _scale_kernel.border_size(), info.border_mode, info.constant_border_value);
-    }
-}
-
-void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding,
-                        bool align_corners)
-{
-    configure(input, output, ScaleKernelInfo{ policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners });
+    _kernel = std::move(scale_kernel);
 }
 
 Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info)
@@ -225,20 +211,4 @@ Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, co
     ARM_COMPUTE_RETURN_ON_ERROR(NEScaleKernel::validate(input->clone().get(), dx, dy, offsets, output->clone().get(), info));
     return Status{};
 }
-
-Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy,
-                         BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding, bool align_corners)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(NEScale::validate(input, output, ScaleKernelInfo{ policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners }));
-    return Status{};
-}
-
-void NEScale::run()
-{
-    if(_use_padding)
-    {
-        NEScheduler::get().schedule(&_border_handler, Window::DimZ);
-    }
-    NEScheduler::get().schedule(&_scale_kernel, Window::DimY);
-}
 } // namespace arm_compute
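Note: a subtle part of the NEScale change above is that precompute_dx_dy_offsets() no longer multiplies the precomputed nearest-neighbour index by the input element size, so the offsets tensor now stores element indices rather than byte offsets (the kernel applies the element size itself). The before/after in isolation (a sketch, not library code):

    #include <cmath>
    #include <cstddef>
    #include <cstdint>

    std::int32_t offset_before(float scaled_x, std::size_t element_size)
    {
        return static_cast<std::int32_t>(std::floor(scaled_x)) * static_cast<std::int32_t>(element_size);
    }

    std::int32_t offset_after(float scaled_x)
    {
        return static_cast<std::int32_t>(std::floor(scaled_x)); // element index only
    }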
diff --git a/src/runtime/NEON/functions/NEScharr3x3.cpp b/src/runtime/NEON/functions/NEScharr3x3.cpp
index bf787e1440..cea0eefdb0 100644
--- a/src/runtime/NEON/functions/NEScharr3x3.cpp
+++ b/src/runtime/NEON/functions/NEScharr3x3.cpp
@@ -23,8 +23,9 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEScharr3x3.h"
 
-#include "arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEScharr3x3Kernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
@@ -36,5 +37,8 @@ void NEScharr3x3::configure(ITensor *input, ITensor *output_x, ITensor *output_y
     auto k = arm_compute::support::cpp14::make_unique<NEScharr3x3Kernel>();
     k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+
+    auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler = std::move(b);
 }
diff --git a/src/runtime/NEON/functions/NESelect.cpp b/src/runtime/NEON/functions/NESelect.cpp
index 8def123c5d..0d1f490767 100644
--- a/src/runtime/NEON/functions/NESelect.cpp
+++ b/src/runtime/NEON/functions/NESelect.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NESelect.h"
 
-#include "arm_compute/core/NEON/kernels/NESelectKernel.h"
 #include "arm_compute/core/Types.h"
+#include "src/core/NEON/kernels/NESelectKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NESlice.cpp b/src/runtime/NEON/functions/NESlice.cpp
index 2bacf2ee2a..dd56eaba8b 100644
--- a/src/runtime/NEON/functions/NESlice.cpp
+++ b/src/runtime/NEON/functions/NESlice.cpp
@@ -24,10 +24,10 @@
 #include "arm_compute/runtime/NEON/functions/NESlice.h"
 
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "src/core/NEON/kernels/NEStridedSliceKernel.h"
 #include "support/MemorySupport.h"
 
diff --git a/src/runtime/NEON/functions/NESobel3x3.cpp b/src/runtime/NEON/functions/NESobel3x3.cpp
index cfd68d70af..38d2dc227e 100644
--- a/src/runtime/NEON/functions/NESobel3x3.cpp
+++ b/src/runtime/NEON/functions/NESobel3x3.cpp
@@ -23,18 +23,23 @@
  */
 #include "arm_compute/runtime/NEON/functions/NESobel3x3.h"
 
-#include "arm_compute/core/NEON/kernels/NESobel3x3Kernel.h"
 #include "arm_compute/core/PixelValue.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NESobel3x3Kernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void NESobel3x3::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
 {
     auto k = arm_compute::support::cpp14::make_unique<NESobel3x3Kernel>();
     k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+
+    auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    b->configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+    _border_handler = std::move(b);
 }
+} // namespace arm_compute
\ No newline at end of file
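Note: NESobel5x5 and NESobel7x7 below are separable filters. A horizontal kernel writes intermediate results into memory-group-managed temporaries (_tmp_x/_tmp_y) and a vertical kernel consumes them, which is why both kernels plus the border handler become unique_ptr members at once. The two-pass idea in a standalone sketch (plain C++, no NEON, illustrative only):

    #include <array>
    #include <vector>

    // Apply a separable 5-tap filter: horizontal pass into tmp, vertical pass into dst.
    std::vector<int> separable_filter(const std::vector<int> &src, int w, int h,
                                      const std::array<int, 5> &row_k,
                                      const std::array<int, 5> &col_k)
    {
        std::vector<int> tmp(src.size(), 0), dst(src.size(), 0);
        for(int y = 0; y < h; ++y)      // horizontal pass
            for(int x = 2; x < w - 2; ++x)
                for(int k = -2; k <= 2; ++k)
                    tmp[y * w + x] += src[y * w + (x + k)] * row_k[k + 2];
        for(int y = 2; y < h - 2; ++y)  // vertical pass
            for(int x = 0; x < w; ++x)
                for(int k = -2; k <= 2; ++k)
                    dst[y * w + x] += tmp[(y + k) * w + x] * col_k[k + 2];
        return dst;
    }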
} +} // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/NEON/functions/NESobel5x5.cpp b/src/runtime/NEON/functions/NESobel5x5.cpp index 092c510bcf..e631fb3ed7 100644 --- a/src/runtime/NEON/functions/NESobel5x5.cpp +++ b/src/runtime/NEON/functions/NESobel5x5.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,8 +29,13 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/TensorAllocator.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/NEON/kernels/NESobel5x5Kernel.h" +#include "support/MemorySupport.h" -using namespace arm_compute; +namespace arm_compute +{ +NESobel5x5::~NESobel5x5() = default; NESobel5x5::NESobel5x5(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler() @@ -46,14 +51,18 @@ void NESobel5x5::configure(ITensor *input, ITensor *output_x, ITensor *output_y, TensorInfo tensor_info(input->info()->tensor_shape(), Format::S16); + _sobel_hor = arm_compute::support::cpp14::make_unique(); + _sobel_vert = arm_compute::support::cpp14::make_unique(); + _border_handler = arm_compute::support::cpp14::make_unique(); + if(run_sobel_x && run_sobel_y) { _tmp_x.allocator()->init(tensor_info); _tmp_y.allocator()->init(tensor_info); _memory_group.manage(&_tmp_x); _memory_group.manage(&_tmp_y); - _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); + _sobel_hor->configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert->configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); _tmp_y.allocator()->allocate(); } @@ -61,28 +70,29 @@ void NESobel5x5::configure(ITensor *input, ITensor *output_x, ITensor *output_y, { _tmp_x.allocator()->init(tensor_info); _memory_group.manage(&_tmp_x); - _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_hor->configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_vert->configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); } else if(run_sobel_y) { _tmp_y.allocator()->init(tensor_info); _memory_group.manage(&_tmp_y); - _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); + _sobel_hor->configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert->configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); _tmp_y.allocator()->allocate(); } - _border_handler.configure(input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(input, _sobel_hor->border_size(), border_mode, PixelValue(constant_border_value)); } void NESobel5x5::run() { - NEScheduler::get().schedule(&_border_handler, Window::DimZ); + NEScheduler::get().schedule(_border_handler.get(), Window::DimZ); MemoryGroupResourceScope scope_mg(_memory_group); - NEScheduler::get().schedule(&_sobel_hor, Window::DimY); - 
NEScheduler::get().schedule(&_sobel_vert, Window::DimY); + NEScheduler::get().schedule(_sobel_hor.get(), Window::DimY); + NEScheduler::get().schedule(_sobel_vert.get(), Window::DimY); } +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NESobel7x7.cpp b/src/runtime/NEON/functions/NESobel7x7.cpp index 87ec81f7b0..bc5f87c1ec 100644 --- a/src/runtime/NEON/functions/NESobel7x7.cpp +++ b/src/runtime/NEON/functions/NESobel7x7.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,8 +29,13 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/TensorAllocator.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/NEON/kernels/NESobel7x7Kernel.h" +#include "support/MemorySupport.h" -using namespace arm_compute; +namespace arm_compute +{ +NESobel7x7::~NESobel7x7() = default; NESobel7x7::NESobel7x7(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler() @@ -45,6 +50,9 @@ void NESobel7x7::configure(ITensor *input, ITensor *output_x, ITensor *output_y, const bool run_sobel_y = output_y != nullptr; TensorInfo tensor_info(input->info()->tensor_shape(), Format::S32); + _sobel_hor = arm_compute::support::cpp14::make_unique(); + _sobel_vert = arm_compute::support::cpp14::make_unique(); + _border_handler = arm_compute::support::cpp14::make_unique(); if(run_sobel_x && run_sobel_y) { @@ -52,8 +60,8 @@ void NESobel7x7::configure(ITensor *input, ITensor *output_x, ITensor *output_y, _tmp_y.allocator()->init(tensor_info); _memory_group.manage(&_tmp_x); _memory_group.manage(&_tmp_y); - _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); + _sobel_hor->configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert->configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); _tmp_y.allocator()->allocate(); } @@ -61,28 +69,29 @@ void NESobel7x7::configure(ITensor *input, ITensor *output_x, ITensor *output_y, { _tmp_x.allocator()->init(tensor_info); _memory_group.manage(&_tmp_x); - _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_hor->configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED); + _sobel_vert->configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED); _tmp_x.allocator()->allocate(); } else if(run_sobel_y) { _tmp_y.allocator()->init(tensor_info); _memory_group.manage(&_tmp_y); - _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); - _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); + _sobel_hor->configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED); + _sobel_vert->configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED); _tmp_y.allocator()->allocate(); } - _border_handler.configure(input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value)); + _border_handler->configure(input, _sobel_hor->border_size(), border_mode, PixelValue(constant_border_value)); } void NESobel7x7::run() { - 
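The hunks above all apply the same refactor: the runtime function classes stop holding their kernels and border handler by value and instead own them through std::unique_ptr, so the kernel headers can move out of the public arm_compute/ include tree into src/. A minimal sketch of the pattern, with invented MyKernel/MyFunction names (support::cpp14::make_unique is ACL's C++14 stand-in for std::make_unique):

    #include <memory>
    #include <utility>

    struct MyKernel // stand-in for e.g. NESobel5x5HorKernel
    {
        void configure(int arg) { (void)arg; /* compute execution window, etc. */ }
    };

    class MyFunction // stand-in for e.g. NESobel5x5
    {
    public:
        // In the real patch the `= default` destructor is defined in the .cpp
        // (e.g. `NESobel5x5::~NESobel5x5() = default;`) so the complete kernel
        // type is visible where the unique_ptr is destroyed.
        void configure(int arg)
        {
            auto k = std::make_unique<MyKernel>();
            k->configure(arg);
            _kernel = std::move(k); // scheduled later via _kernel.get()
        }

    private:
        std::unique_ptr<MyKernel> _kernel{ nullptr };
    };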
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 750992fca6..e79ab0ee2d 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -24,47 +24,24 @@
 #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
 
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NESoftmaxLayerKernel.h"
+#include "src/core/helpers/SoftmaxHelpers.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
 template <bool IS_LOG>
-NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _flat_or_reshape_ptr(nullptr), _fill_border_kernel(), _reshape(), _max(), _tmp(), _input_flattened(), _output_flattened(),
-      _needs_flattening(false)
-{
-}
+NESoftmaxLayerGeneric<IS_LOG>::~NESoftmaxLayerGeneric() = default;
 
 template <bool IS_LOG>
-void NESoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const ITensor *input, const ITensor *output, int32_t first_n_reduce_axes)
+NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _permute_input(), _permute_output(), _max_kernel(), _softmax_kernel(), _fill_border_kernel(), _max(), _tmp(), _input_permuted(), _output_permuted(),
+      _needs_permute(false)
 {
-    // Flatten the input
-    const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), first_n_reduce_axes);
-
-    // Initialize the flat input
-    _input_flattened.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten));
-
-    // Note that the "other cases" include both:
-    // 1. first_n_reduce_axes < 3: Reduce the first 1 (no need to reduce) or 2 dimensions (inclusive)
-    // 2. first_n_reduce_axes == 4: Reduce all 4 dimensions. This can only be handled by NEReshapeKernel instead of NEFlattenKernel.
-    if(first_n_reduce_axes == 3)
-    {
-        auto flatten_kernel_ptr = support::cpp14::make_unique<NEFlattenLayerKernel>();
-        flatten_kernel_ptr->configure(input, &_input_flattened);
-        _flat_or_reshape_ptr = std::move(flatten_kernel_ptr);
-    }
-    else
-    {
-        auto reshape_kernel_ptr = support::cpp14::make_unique<NEReshapeLayerKernel>();
-        reshape_kernel_ptr->configure(input, &_input_flattened);
-        _flat_or_reshape_ptr = std::move(reshape_kernel_ptr);
-    }
-
-    // We need to init the output tensor here. Indeed, the reshape kernel expects
-    // both tensors to be already initialized
-    auto_init_if_empty(*output->info(), *input->info()->clone());
 }
 
 template <bool IS_LOG>
@@ -74,36 +51,29 @@ void NESoftmaxLayerGeneric<IS_LOG>::configure(ITensor *input, ITensor *output, f
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(NESoftmaxLayerGeneric::validate(input->info(), output->info(), beta, axis));
 
-    // Convert reduce-before axis (inclusive) to first n axes to reduce
-    size_t first_n_reduce_axes = dim_index_2_num_dims(axis, static_cast<int32_t>(input->info()->num_dimensions()));
+    const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(input->info()->num_dimensions())));
 
-    // We only need flattening when the number of axes to reduce is greater than 1
-    _needs_flattening = first_n_reduce_axes > 1;
+    _needs_permute = actual_axis > 0;
 
-    // If we are dealing with a 4D tensor, we will:
-    // - Flatten the input, so that we end up with a [width*height*depth] * batches 2D tensor
-    // - Execute all the pipeline (reduction + normalization) on the flattened tensor
-    // - Reshape the flattened output into the real output
-    if(_needs_flattening)
+    if(_needs_permute)
     {
-        // Add to the memory manager _input_flattened
-        _memory_group.manage(&_input_flattened);
+        // Add to the memory manager _input_permuted
+        _memory_group.manage(&_input_permuted);
 
-        // Configure _flatten_kernel and _input_flattened
-        configure_reshape_input_kernel(input, output, first_n_reduce_axes);
+        _permute_input.configure(input, &_input_permuted, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
     }
 
-    // We want to deal with a 2D input. Either it is the flattened version of the original input (4D case)
+    // We want to deal with a 2D input. Either it is the permuted version of the original input (4D case)
     // or it is the original input case (2D case)
-    ITensor *input_2D = (_needs_flattening ? &_input_flattened : input);
+    ITensor *tmp_input = (_needs_permute ? &_input_permuted : input);
 
     // Create intermediate tensors shapes
-    const TensorInfo input_info    = input_2D->info()->clone()->reset_padding().set_is_resizable(true);
-    DataType         tmp_data_type = is_data_type_quantized_asymmetric(input_2D->info()->data_type()) ? DataType::F32 : input_2D->info()->data_type();
+    const TensorInfo input_info    = tmp_input->info()->clone()->reset_padding().set_is_resizable(true);
+    DataType         tmp_data_type = is_data_type_quantized_asymmetric(tmp_input->info()->data_type()) ? DataType::F32 : tmp_input->info()->data_type();
     TensorInfo       tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
 
     // Init intermediate tensors
-    TensorShape max_sum_shape = input_2D->info()->tensor_shape();
+    TensorShape max_sum_shape = tmp_input->info()->tensor_shape();
     max_sum_shape.set(0, 1);
     _max.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape));
     _tmp.allocator()->init(tensor_info_tmp);
@@ -112,28 +82,31 @@ void NESoftmaxLayerGeneric<IS_LOG>::configure(ITensor *input, ITensor *output, f
     _memory_group.manage(&_max);
     _memory_group.manage(&_tmp);
 
-    // Configure Kernels
-    _max_kernel.configure(input_2D, &_max);
-    if(_needs_flattening)
+    // Configure kernels
+    _max_kernel     = arm_compute::support::cpp14::make_unique<NELogits1DMaxKernel>();
+    _softmax_kernel = arm_compute::support::cpp14::make_unique<NELogits1DSoftmaxKernel<IS_LOG>>();
+    _max_kernel->configure(tmp_input, &_max);
+    if(_needs_permute)
     {
-        // Add to the memory manager _output_flattened
-        _memory_group.manage(&_output_flattened);
+        // Add to the memory manager _output_permuted
+        _memory_group.manage(&_output_permuted);
 
-        // The normalization kernel stores the result in a flat output tensor
-        _softmax_kernel.configure(input_2D, &_max, &_output_flattened, beta, &_tmp);
-        _input_flattened.allocator()->allocate();
+        // The normalization kernel stores the result in a permuted output tensor
+        _softmax_kernel->configure(tmp_input, &_max, &_output_permuted, beta, &_tmp);
+        _input_permuted.allocator()->allocate();
 
-        // Reshape the flat output into the requested (4D) output
-        _reshape.configure(&_output_flattened, output);
+        // Re-permute the permuted output into the requested (4D) output
+        _permute_output.configure(&_output_permuted, output, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
 
-        // Allocate the intermediate flat tensors
-        _output_flattened.allocator()->allocate();
+        // Allocate the intermediate permuted tensors
+        _output_permuted.allocator()->allocate();
     }
     else
     {
         // Softmax 2D case
-        _fill_border_kernel.configure(input_2D, _max_kernel.border_size(), BorderMode::REPLICATE);
-        _softmax_kernel.configure(input_2D, &_max, output, beta, &_tmp);
+        _fill_border_kernel = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+        _fill_border_kernel->configure(tmp_input, _max_kernel->border_size(), BorderMode::REPLICATE);
+        _softmax_kernel->configure(tmp_input, &_max, output, beta, &_tmp);
     }
 
     // Allocate intermediate buffers
@@ -148,12 +121,8 @@ Status NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const I
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Only up to 4 dimensions are supported");
     ARM_COMPUTE_UNUSED(beta);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 0, "Only axis 0 supported");
     ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-input->num_dimensions()) || static_cast<int32_t>(input->num_dimensions()) <= axis);
 
-    // Convert reduce-before axis (inclusive) to first n axes to reduce
-    size_t first_n_reduce_axes = dim_index_2_num_dims(axis, static_cast<int32_t>(input->num_dimensions()));
-
     // Create intermediate tensor info
     DataType tmp_data_type = input->data_type();
     const TensorInfo tensor_info_tmp(input->clone()->set_data_type(tmp_data_type).set_is_resizable(true));
@@ -163,21 +132,18 @@ Status NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const I
     const TensorInfo tensor_info_max_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(input->quantization_info()).set_is_resizable(true));
     const TensorInfo dont_care;
 
-    const bool needs_flattening = (first_n_reduce_axes > 1);
+    const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(input->num_dimensions())));
 
-    if(needs_flattening)
+    const bool needs_permute = actual_axis > 0;
+
+    if(needs_permute)
     {
-        const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input, first_n_reduce_axes);
-        TensorInfo        tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true));
-
-        if(first_n_reduce_axes == 3)
-        {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayer::validate(input, &tensor_info_flat));
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(input, &tensor_info_flat));
-        }
+        const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
+        const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(*input, permutation_vector);
+        TensorInfo input_permuted(input->clone()->set_tensor_shape(permuted_shape));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(input, &input_permuted, permutation_vector));
+        TensorInfo output_permuted(output->clone()->set_tensor_shape(permuted_shape));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(&output_permuted, output, permutation_vector));
     }
 
     ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DMaxKernel::validate(input, &tensor_info_max_sum));
@@ -191,18 +157,21 @@ void NESoftmaxLayerGeneric<IS_LOG>::run()
 {
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    if(_needs_flattening)
+    if(_needs_permute)
+    {
+        _permute_input.run();
+    }
+    else
     {
-        _flat_or_reshape_ptr->run();
+        NEScheduler::get().schedule(_fill_border_kernel.get(), Window::DimY);
     }
 
-    NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
-    NEScheduler::get().schedule(&_max_kernel, Window::DimY);
-    NEScheduler::get().schedule(&_softmax_kernel, Window::DimY);
+    NEScheduler::get().schedule(_max_kernel.get(), Window::DimY);
+    NEScheduler::get().schedule(_softmax_kernel.get(), Window::DimY);
 
-    if(_needs_flattening)
+    if(_needs_permute)
     {
-        _reshape.run();
+        _permute_output.run();
     }
 }
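The softmax rework above replaces the flatten/reshape path with a permute: the requested axis is first wrapped into [0, num_dimensions), and whenever the wrapped axis is non-zero the input is permuted so the reduction always runs along dimension 0. A small self-contained illustration of that wrapping (wrap_around re-stated here for the example only; not the library function):

    #include <cassert>

    // Mirrors the semantics used above: negative axes count from the back.
    static int wrap_axis(int axis, int num_dims)
    {
        return (axis < 0) ? axis + num_dims : axis;
    }

    int main()
    {
        assert(wrap_axis(0, 4) == 0);  // innermost axis: no permute, fill-border path runs
        assert(wrap_axis(-1, 4) == 3); // wrapped axis > 0: _needs_permute becomes true
        assert(wrap_axis(2, 4) == 2);
        return 0;
    }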
diff --git a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
index 97e793f6fb..516e8d604c 100644
--- a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
+++ b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,9 +29,14 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEMemsetKernel.h"
+#include "src/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NESpaceToBatchLayer::~NESpaceToBatchLayer() = default;
+
 NESpaceToBatchLayer::NESpaceToBatchLayer()
     : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false)
 {
@@ -43,10 +48,12 @@ void NESpaceToBatchLayer::configure(const ITensor *input, const ITensor *block_s
 
     if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
     {
-        _has_padding = true;
-        _memset_kernel.configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
+        _has_padding   = true;
+        _memset_kernel = arm_compute::support::cpp14::make_unique<NEMemsetKernel>();
+        _memset_kernel->configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
     }
-    _space_to_batch_kernel.configure(input, block_shape, paddings, output);
+    _space_to_batch_kernel = arm_compute::support::cpp14::make_unique<NESpaceToBatchLayerKernel>();
+    _space_to_batch_kernel->configure(input, block_shape, paddings, output);
 }
 
 void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output)
@@ -55,10 +62,12 @@ void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_
 
     if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
     {
-        _has_padding = true;
-        _memset_kernel.configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
+        _has_padding   = true;
+        _memset_kernel = arm_compute::support::cpp14::make_unique<NEMemsetKernel>();
+        _memset_kernel->configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
     }
-    _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+    _space_to_batch_kernel = arm_compute::support::cpp14::make_unique<NESpaceToBatchLayerKernel>();
+    _space_to_batch_kernel->configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
 }
 
 Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
@@ -81,8 +90,8 @@ void NESpaceToBatchLayer::run()
     // Zero out output only if we have paddings
     if(_has_padding)
     {
-        NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
+        NEScheduler::get().schedule(_memset_kernel.get(), Window::DimY);
     }
-    NEScheduler::get().schedule(&_space_to_batch_kernel, Window::DimY);
+    NEScheduler::get().schedule(_space_to_batch_kernel.get(), Window::DimY);
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp
index 3e1ec80687..a834600199 100644
--- a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp
+++ b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,9 +29,13 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NESpaceToDepthLayerKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NESpaceToDepthLayer::~NESpaceToDepthLayer() = default;
+
 NESpaceToDepthLayer::NESpaceToDepthLayer()
     : _space_to_depth_kernel()
 {
@@ -40,7 +44,8 @@ NESpaceToDepthLayer::NESpaceToDepthLayer()
 void NESpaceToDepthLayer::configure(const ITensor *input, ITensor *output, int32_t block_shape)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    _space_to_depth_kernel.configure(input, output, block_shape);
+    _space_to_depth_kernel = arm_compute::support::cpp14::make_unique<NESpaceToDepthLayerKernel>();
+    _space_to_depth_kernel->configure(input, output, block_shape);
 }
 
 Status NESpaceToDepthLayer::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
@@ -51,6 +56,6 @@ Status NESpaceToDepthLayer::validate(const ITensorInfo *input, const ITensorInfo
 
 void NESpaceToDepthLayer::run()
 {
-    NEScheduler::get().schedule(&_space_to_depth_kernel, Window::DimY);
+    NEScheduler::get().schedule(_space_to_depth_kernel.get(), Window::DimY);
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEStackLayer.cpp b/src/runtime/NEON/functions/NEStackLayer.cpp
index a99a95ab2a..e38ff6bee7 100644
--- a/src/runtime/NEON/functions/NEStackLayer.cpp
+++ b/src/runtime/NEON/functions/NEStackLayer.cpp
@@ -30,9 +30,13 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEStackLayerKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NEStackLayer::~NEStackLayer() = default;
+
 NEStackLayer::NEStackLayer() // NOLINT
     : _input(),
       _stack_kernels(),
@@ -50,7 +54,8 @@ void NEStackLayer::configure(const std::vector<ITensor *> &input, int axis, ITen
 
     for(unsigned int i = 0; i < _num_inputs; i++)
     {
-        _stack_kernels[i].configure(input[i], axis_u, i, _num_inputs, output);
+        _stack_kernels[i] = arm_compute::support::cpp14::make_unique<NEStackLayerKernel>();
+        _stack_kernels[i]->configure(input[i], axis_u, i, _num_inputs, output);
     }
 }
 
@@ -80,7 +85,7 @@ void NEStackLayer::run()
 {
     for(unsigned i = 0; i < _num_inputs; i++)
     {
-        NEScheduler::get().schedule(&_stack_kernels[i], Window::DimY);
+        NEScheduler::get().schedule(_stack_kernels[i].get(), Window::DimY);
     }
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEStridedSlice.cpp b/src/runtime/NEON/functions/NEStridedSlice.cpp
index 8bf81e8270..308b856ec6 100644
--- a/src/runtime/NEON/functions/NEStridedSlice.cpp
+++ b/src/runtime/NEON/functions/NEStridedSlice.cpp
@@ -24,8 +24,8 @@
 #include "arm_compute/runtime/NEON/functions/NEStridedSlice.h"
 
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h"
 #include "arm_compute/core/Types.h"
+#include "src/core/NEON/kernels/NEStridedSliceKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
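In the NESpaceToBatchLayer hunks above, the memset kernel is only created and scheduled when the output is strictly larger than the input, i.e. when padding introduces elements the space-to-batch copy will never write. A hedged sketch of that zero-then-scatter ordering, illustration only:

    #include <algorithm>
    #include <vector>

    // Zero the destination first when padded, then let the space-to-batch
    // kernel scatter the input into it.
    static void space_to_batch_sketch(const std::vector<float> &in, std::vector<float> &out)
    {
        const bool has_padding = in.size() != out.size(); // cf. _has_padding above
        if(has_padding)
        {
            std::fill(out.begin(), out.end(), 0.0f); // cf. NEMemsetKernel scheduled first in run()
        }
        // ... block-wise copy of `in` into `out` would follow here ...
    }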
diff --git a/src/runtime/NEON/functions/NETableLookup.cpp b/src/runtime/NEON/functions/NETableLookup.cpp
index b8d765f76b..9295bf0ece 100644
--- a/src/runtime/NEON/functions/NETableLookup.cpp
+++ b/src/runtime/NEON/functions/NETableLookup.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NETableLookup.h"
 
-#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h"
+#include "src/core/NEON/kernels/NETableLookupKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
diff --git a/src/runtime/NEON/functions/NEThreshold.cpp b/src/runtime/NEON/functions/NEThreshold.cpp
index e21511ed65..2f1e3047b5 100644
--- a/src/runtime/NEON/functions/NEThreshold.cpp
+++ b/src/runtime/NEON/functions/NEThreshold.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEThreshold.h"
 
-#include "arm_compute/core/NEON/kernels/NEThresholdKernel.h"
+#include "src/core/NEON/kernels/NEThresholdKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
diff --git a/src/runtime/NEON/functions/NETile.cpp b/src/runtime/NEON/functions/NETile.cpp
index 6fda3a5ba6..6a1e20ddf8 100644
--- a/src/runtime/NEON/functions/NETile.cpp
+++ b/src/runtime/NEON/functions/NETile.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NETile.h"
 
-#include "arm_compute/core/NEON/kernels/NETileKernel.h"
+#include "src/core/NEON/kernels/NETileKernel.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
diff --git a/src/runtime/NEON/functions/NETranspose.cpp b/src/runtime/NEON/functions/NETranspose.cpp
index 88d1672173..5af417f4ed 100644
--- a/src/runtime/NEON/functions/NETranspose.cpp
+++ b/src/runtime/NEON/functions/NETranspose.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NETranspose.h"
 
-#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
+#include "src/core/NEON/kernels/NETransposeKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
diff --git a/src/runtime/NEON/functions/NEUpsampleLayer.cpp b/src/runtime/NEON/functions/NEUpsampleLayer.cpp
index 58c050f904..aae58387e2 100644
--- a/src/runtime/NEON/functions/NEUpsampleLayer.cpp
+++ b/src/runtime/NEON/functions/NEUpsampleLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,10 +23,13 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEUpsampleLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h"
+#include "src/core/NEON/kernels/NEUpsampleLayerKernel.h"
+#include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+NEUpsampleLayer::~NEUpsampleLayer() = default;
+
 NEUpsampleLayer::NEUpsampleLayer()
     : _kernel(), _data_layout()
 {
@@ -41,12 +44,13 @@ Status NEUpsampleLayer::validate(const ITensorInfo *input, const ITensorInfo *ou
 void NEUpsampleLayer::configure(const ITensor *input, ITensor *output, const Size2D &info, const InterpolationPolicy &policy)
 {
     _data_layout = input->info()->data_layout();
-    _kernel.configure(input, output, info, policy);
+    _kernel = arm_compute::support::cpp14::make_unique<NEUpsampleLayerKernel>();
+    _kernel->configure(input, output, info, policy);
 }
 
 void NEUpsampleLayer::run()
 {
     const auto win = (_data_layout == DataLayout::NCHW) ? Window::DimZ : Window::DimX;
-    NEScheduler::get().schedule(&_kernel, win);
+    NEScheduler::get().schedule(_kernel.get(), win);
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEWarpAffine.cpp b/src/runtime/NEON/functions/NEWarpAffine.cpp
index ec2c6883ba..b5dbfe0d5c 100644
--- a/src/runtime/NEON/functions/NEWarpAffine.cpp
+++ b/src/runtime/NEON/functions/NEWarpAffine.cpp
@@ -24,8 +24,9 @@
 #include "arm_compute/runtime/NEON/functions/NEWarpAffine.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEWarpKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
@@ -58,5 +59,7 @@ void NEWarpAffine::configure(ITensor *input, ITensor *output, const std::array<
-    _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value);
+    auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    b->configure(input, _kernel->border_size(), border_mode, constant_border_value);
+    _border_handler = std::move(b);
 }
diff --git a/src/runtime/NEON/functions/NEWarpPerspective.cpp b/src/runtime/NEON/functions/NEWarpPerspective.cpp
index bf361b8ab9..8d42121005 100644
--- a/src/runtime/NEON/functions/NEWarpPerspective.cpp
+++ b/src/runtime/NEON/functions/NEWarpPerspective.cpp
@@ -24,14 +24,15 @@
 #include "arm_compute/runtime/NEON/functions/NEWarpPerspective.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
 #include "arm_compute/core/Validate.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEWarpKernel.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void NEWarpPerspective::configure(ITensor *input, ITensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
@@ -58,5 +59,8 @@ void NEWarpPerspective::configure(ITensor *input, ITensor *output, const std::ar
         ARM_COMPUTE_ERROR("Interpolation type not supported");
     }
 
-    _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value);
+    auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+    b->configure(input, _kernel->border_size(), border_mode, constant_border_value);
+    _border_handler = std::move(b);
 }
+} // namespace arm_compute
"support/MemorySupport.h" -#include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp" +#include "src/core/NEON/kernels/convolution/common/utils.hpp" #include "src/core/NEON/kernels/convolution/winograd/winograd.hpp" namespace arm_compute diff --git a/src/runtime/NEON/functions/NEYOLOLayer.cpp b/src/runtime/NEON/functions/NEYOLOLayer.cpp index 233afb727a..5cad53bffd 100644 --- a/src/runtime/NEON/functions/NEYOLOLayer.cpp +++ b/src/runtime/NEON/functions/NEYOLOLayer.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/NEON/functions/NEYOLOLayer.h" -#include "arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h" +#include "src/core/NEON/kernels/NEYOLOLayerKernel.h" #include "support/MemorySupport.h" namespace arm_compute diff --git a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp index 73a7caac8b..11e89cb23b 100644 --- a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp +++ b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp @@ -24,18 +24,21 @@ #include "arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h" -#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h" -#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp" -#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h" +#include "src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp" +#include "src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp" +#include "src/core/helpers/AutoConfiguration.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "support/MemorySupport.h" + #include namespace arm_compute diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp index 11448e595c..bf34b0114b 100644 --- a/src/runtime/OMP/OMPScheduler.cpp +++ b/src/runtime/OMP/OMPScheduler.cpp @@ -27,8 +27,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/runtime/CPUUtils.h" - +#include "src/runtime/CPUUtils.h" #include namespace arm_compute @@ -51,36 +50,8 @@ void OMPScheduler::set_num_threads(unsigned int num_threads) void OMPScheduler::schedule(ICPPKernel *kernel, const Hints &hints) { - ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel"); - ARM_COMPUTE_ERROR_ON_MSG(hints.strategy() == StrategyHint::DYNAMIC, - "Dynamic scheduling is not supported in OMPScheduler"); - - const Window &max_window = kernel->window(); - const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); - const unsigned int num_threads = std::min(num_iterations, _num_threads); - - if(!kernel->is_parallelisable() || num_threads == 1) - { - ThreadInfo info; - info.cpu_info = &_cpu_info; - kernel->run(max_window, info); - } - else - { - const unsigned int num_windows = num_threads; - std::vector workloads(num_windows); - for(unsigned int t = 0; t < num_windows; 
t++) - { - //Capture 't' by copy, all the other variables by reference: - workloads[t] = [t, &hints, &max_window, &num_windows, &kernel](const ThreadInfo & info) - { - Window win = max_window.split_window(hints.split_dimension(), t, num_windows); - win.validate(); - kernel->run(win, info); - }; - } - run_workloads(workloads); - } + ITensorPack tensors; + schedule_common(kernel, hints, tensors); } void OMPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, ITensorPack &tensors) diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp index 4063cc1c00..5b3010b173 100644 --- a/src/runtime/Scheduler.cpp +++ b/src/runtime/Scheduler.cpp @@ -67,7 +67,7 @@ std::map> init() } } // namespace -std::map> Scheduler::_schedulers = init(); +std::map> Scheduler::_schedulers{}; void Scheduler::set(Type t) { @@ -107,6 +107,11 @@ IScheduler &Scheduler::get() } else { + if(_schedulers.empty()) + { + _schedulers = init(); + } + auto it = _schedulers.find(_scheduler_type); if(it != _schedulers.end()) { diff --git a/src/runtime/SchedulerUtils.cpp b/src/runtime/SchedulerUtils.cpp new file mode 100644 index 0000000000..6f9a32c879 --- /dev/null +++ b/src/runtime/SchedulerUtils.cpp @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
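The Scheduler.cpp hunk above swaps an eagerly initialised static map (built by init() during static initialisation) for an empty map populated on first use inside get(). That sidesteps static-initialisation-order problems when Scheduler::get() is reachable from other static initialisers. A condensed sketch of the same pattern, with stub types standing in for the real schedulers:

    #include <map>
    #include <memory>

    struct IScheduler
    {
        virtual ~IScheduler() = default;
    };
    struct CPPSchedulerStub : IScheduler // stand-in for the real CPPScheduler
    {
    };

    enum class Type { CPP };

    static std::map<Type, std::unique_ptr<IScheduler>> schedulers{}; // starts empty

    IScheduler &get_scheduler(Type t)
    {
        if(schedulers.empty()) // deferred equivalent of `_schedulers = init();`
        {
            schedulers[Type::CPP] = std::make_unique<CPPSchedulerStub>();
        }
        return *schedulers.at(t);
    }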
diff --git a/src/runtime/SchedulerUtils.cpp b/src/runtime/SchedulerUtils.cpp
new file mode 100644
index 0000000000..6f9a32c879
--- /dev/null
+++ b/src/runtime/SchedulerUtils.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/SchedulerUtils.h"
+
+#include "arm_compute/core/Error.h"
+
+#include <cmath>
+
+namespace arm_compute
+{
+namespace scheduler_utils
+{
+#ifndef BARE_METAL
+std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std::size_t n)
+{
+    /*
+     * We want the same ratio of threads in M & N to the ratio of m and n problem size
+     *
+     * Therefore:    mt/nt == m/n    where mt*nt == max_threads
+     *
+     *             max_threads/nt = mt    &    (max_threads/nt) * (m/n) = nt
+     *          nt^2 = max_threads * (m/n)
+     *          nt = sqrt( max_threads * (m/n) )
+     */
+    //ratio of m to n in problem dimensions
+    double ratio = m / static_cast<double>(n);
+
+    // nt = sqrt(max_threads * (m / n) )
+    const unsigned adjusted = std::round(
+                                  std::sqrt(max_threads * ratio));
+
+    //find the nearest factor of max_threads
+    for(unsigned i = 0; i != adjusted; ++i)
+    {
+        //try down
+        const unsigned adj_down = adjusted - i;
+        if(max_threads % adj_down == 0)
+        {
+            return { adj_down, max_threads / adj_down };
+        }
+
+        //try up
+        const unsigned adj_up = adjusted + i;
+        if(max_threads % adj_up == 0)
+        {
+            return { adj_up, max_threads / adj_up };
+        }
+    }
+
+    //we didn't find anything so lets bail out with maxes biased to the largest dimension
+    if(m > n)
+    {
+        return { std::min<unsigned>(m, max_threads), 1 };
+    }
+    else
+    {
+        return { 1, std::min<unsigned>(n, max_threads) };
+    }
+}
+#endif /* #ifndef BARE_METAL */
+} // namespace scheduler_utils
+} // namespace arm_compute
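A worked example of split_2d() above: with max_threads = 8 on a 3000x1000 problem, the ideal M-side thread count is sqrt(8 * 3000/1000) ≈ 4.9; the neighbour search then finds 4 as the nearest factor of 8, giving the {4, 2} split. The re-statement below exists only for this example; its simplified fallback differs from the library's m/n-biased one.

    #include <cassert>
    #include <cmath>
    #include <utility>

    static std::pair<unsigned, unsigned> split_2d_sketch(unsigned max_threads, double m, double n)
    {
        const unsigned adjusted = static_cast<unsigned>(std::round(std::sqrt(max_threads * (m / n))));
        for(unsigned i = 0; i != adjusted; ++i)
        {
            if(max_threads % (adjusted - i) == 0) // try down first, as above
            {
                return { adjusted - i, max_threads / (adjusted - i) };
            }
            if(max_threads % (adjusted + i) == 0)
            {
                return { adjusted + i, max_threads / (adjusted + i) };
            }
        }
        return { max_threads, 1 }; // simplified fallback
    }

    int main()
    {
        // round(sqrt(24)) == 5; 8 % 5 != 0, 8 % 4 == 0 -> {4, 2}
        assert((split_2d_sketch(8, 3000.0, 1000.0) == std::pair<unsigned, unsigned>(4, 2)));
        return 0;
    }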
diff --git a/src/runtime/SchedulerUtils.h b/src/runtime/SchedulerUtils.h
new file mode 100644
index 0000000000..46644a369e
--- /dev/null
+++ b/src/runtime/SchedulerUtils.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_COMPUTE_SCHEDULER_UTILS_H
+#define SRC_COMPUTE_SCHEDULER_UTILS_H
+
+#include <cstddef>
+#include <utility>
+
+namespace arm_compute
+{
+namespace scheduler_utils
+{
+/** Given two dimensions and a maximum number of threads to utilise, calculate the best
+ * combination of threads that, multiplied together, fits within max_threads.
+ *
+ * This algorithm assumes that work in either of the dimensions is equally difficult
+ * to compute
+ *
+ * @returns [m_nthreads, n_nthreads] A pair of the threads that should be used in each dimension
+ */
+std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std::size_t n);
+} // namespace scheduler_utils
+} // namespace arm_compute
+#endif /* SRC_COMPUTE_SCHEDULER_UTILS_H */
diff --git a/src/runtime/TracePoint.cpp b/src/runtime/TracePoint.cpp
index a4228b2b21..6cb672c348 100644
--- a/src/runtime/TracePoint.cpp
+++ b/src/runtime/TracePoint.cpp
@@ -25,10 +25,10 @@
 #include
 #include
 
-#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"
 #include "arm_compute/runtime/Array.h"
 #include "arm_compute/runtime/Pyramid.h"
 #include "arm_compute/runtime/common/LSTMParams.h"
+#include "src/core/NEON/kernels/assembly/arm_gemm.hpp"
 #include "utils/TypePrinter.h"
 
 namespace arm_compute
diff --git a/src/runtime/Utils.cpp b/src/runtime/Utils.cpp
index 534b421f8a..15e9d43a49 100644
--- a/src/runtime/Utils.cpp
+++ b/src/runtime/Utils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/Utils.h"
+#include "src/runtime/Utils.h"
 
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
@@ -31,6 +31,8 @@
 namespace arm_compute
 {
+namespace utils
+{
 #ifndef DOXYGEN_SKIP_THIS
 static const std::string information =
 #include "arm_compute_version.embed"
@@ -78,4 +80,5 @@ unsigned int calculate_number_of_stages_only_x_axis(size_t input_x_dimension, un
     const unsigned int num_of_stages = num_of_wg / 128 + 2;
     return num_of_stages;
 }
+} // namespace utils
 } // namespace arm_compute
diff --git a/arm_compute/runtime/Utils.h b/src/runtime/Utils.h
similarity index 92%
rename from arm_compute/runtime/Utils.h
rename to src/runtime/Utils.h
index 6e36297704..f8775c9612 100644
--- a/arm_compute/runtime/Utils.h
+++ b/src/runtime/Utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_RUNTIME_UTILS_H
-#define ARM_COMPUTE_RUNTIME_UTILS_H
+#ifndef SRC_RUNTIME_UTILS_H
+#define SRC_RUNTIME_UTILS_H
 
 #include "arm_compute/runtime/IRuntimeContext.h"
 #include "arm_compute/runtime/Scheduler.h"
@@ -31,6 +31,8 @@
 namespace arm_compute
 {
+namespace utils
+{
 /** Convert a Scheduler::Type into a string.
  *
  * @param[in] t @ref Scheduler::Type to be translated to string.
@@ -53,5 +55,6 @@ void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel, const ISch
  * @param[in] axis axis to be used
  */
 unsigned int calculate_number_of_stages_only_x_axis(size_t input_x_dimension, unsigned int axis);
+} // namespace utils
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_RUNTIME_UTILS_H */
+#endif /* SRC_RUNTIME_UTILS_H */
diff --git a/arm_compute/core/utils/misc/CRTP.h b/support/CRTP.h
similarity index 97%
rename from arm_compute/core/utils/misc/CRTP.h
rename to support/CRTP.h
index d295500bef..eb358d0600 100644
--- a/arm_compute/core/utils/misc/CRTP.h
+++ b/support/CRTP.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/misc/Cast.h b/support/Cast.h
similarity index 98%
rename from arm_compute/core/utils/misc/Cast.h
rename to support/Cast.h
index 57c7e49942..53d5f68065 100644
--- a/arm_compute/core/utils/misc/Cast.h
+++ b/support/Cast.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/misc/ICloneable.h b/support/ICloneable.h
similarity index 97%
rename from arm_compute/core/utils/misc/ICloneable.h
rename to support/ICloneable.h
index cbb0b3c149..5d333c8442 100644
--- a/arm_compute/core/utils/misc/ICloneable.h
+++ b/support/ICloneable.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/misc/Iterable.h b/support/Iterable.h
similarity index 98%
rename from arm_compute/core/utils/misc/Iterable.h
rename to support/Iterable.h
index 34232088e8..a0bafaf4ce 100644
--- a/arm_compute/core/utils/misc/Iterable.h
+++ b/support/Iterable.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/misc/Random.h b/support/Random.h
similarity index 98%
rename from arm_compute/core/utils/misc/Random.h
rename to support/Random.h
index 6832c495e3..c8b767e505 100644
--- a/arm_compute/core/utils/misc/Random.h
+++ b/support/Random.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/misc/Requires.h b/support/Requires.h
similarity index 97%
rename from arm_compute/core/utils/misc/Requires.h
rename to support/Requires.h
index ba91039596..bc4932adc5 100644
--- a/arm_compute/core/utils/misc/Requires.h
+++ b/support/Requires.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/misc/Rounding.h b/support/Rounding.h
similarity index 98%
rename from arm_compute/core/utils/misc/Rounding.h
rename to support/Rounding.h
index 1ed4e64886..ba9266d323 100644
--- a/arm_compute/core/utils/misc/Rounding.h
+++ b/support/Rounding.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,8 +25,8 @@
 #define ARM_COMPUTE_UTILS_ROUNDING_H
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/utils/misc/Requires.h"
 #include "arm_compute/core/utils/misc/Traits.h"
+#include "support/Requires.h"
 #include "support/ToolchainSupport.h"
 
 #include
diff --git a/arm_compute/core/utils/misc/SaturateCast.h b/support/SaturateCast.h
similarity index 98%
rename from arm_compute/core/utils/misc/SaturateCast.h
rename to support/SaturateCast.h
index cbced83f89..a9982d8e96 100644
--- a/arm_compute/core/utils/misc/SaturateCast.h
+++ b/support/SaturateCast.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,9 +24,9 @@
 #ifndef ARM_COMPUTE_UTILS_CAST_SATURATE_CAST_H
 #define ARM_COMPUTE_UTILS_CAST_SATURATE_CAST_H
 
-#include "arm_compute/core/utils/misc/Rounding.h"
 #include "arm_compute/core/utils/misc/Traits.h"
 #include "arm_compute/core/utils/misc/Utility.h"
+#include "support/Rounding.h"
 
 namespace arm_compute
 {
diff --git a/support/Traits.h b/support/Traits.h
new file mode 100644
index 0000000000..e892c631f7
--- /dev/null
+++ b/support/Traits.h
@@ -0,0 +1,47 @@
+/*
+* Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SUPPORT_TRAITS_H
+#define SUPPORT_TRAITS_H
+
+#include <type_traits>
+
+namespace arm_compute
+{
+/** Disable bitwise operations by default */
+template <typename T>
+struct enable_bitwise_ops
+{
+    static constexpr bool value = false; /**< Disabled */
+};
+
+#ifndef DOXYGEN_SKIP_THIS
+template <typename T>
+typename std::enable_if<enable_bitwise_ops<T>::value, T>::type operator&(T lhs, T rhs)
+{
+    using underlying_type = typename std::underlying_type<T>::type;
+    return static_cast<T>(static_cast<underlying_type>(lhs) & static_cast<underlying_type>(rhs));
+}
+#endif /* DOXYGEN_SKIP_THIS */
+} // namespace arm_compute
+#endif /* SUPPORT_TRAITS_H */
diff --git a/tests/AssetsLibrary.h b/tests/AssetsLibrary.h
index d783b1f207..28a51d6ae9 100644
--- a/tests/AssetsLibrary.h
+++ b/tests/AssetsLibrary.h
@@ -31,7 +31,7 @@
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Random.h"
+#include "support/Random.h"
 #include "tests/RawTensor.h"
 #include "tests/TensorCache.h"
 #include "tests/Utils.h"
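The new support/Traits.h above gates operator& behind an enable_bitwise_ops trait, so the bitwise operator only exists for enum types that explicitly opt in. A self-contained usage sketch (MemoryHint is an invented enum, not part of the library):

    #include <type_traits>

    template <typename T>
    struct enable_bitwise_ops
    {
        static constexpr bool value = false; // disabled by default, as in support/Traits.h
    };

    enum class MemoryHint { None = 0, Reuse = 1, Persistent = 2 };

    template <>
    struct enable_bitwise_ops<MemoryHint> // explicit opt-in for this enum only
    {
        static constexpr bool value = true;
    };

    template <typename T>
    typename std::enable_if<enable_bitwise_ops<T>::value, T>::type operator&(T lhs, T rhs)
    {
        using underlying_type = typename std::underlying_type<T>::type;
        return static_cast<T>(static_cast<underlying_type>(lhs) & static_cast<underlying_type>(rhs));
    }

    int main()
    {
        const MemoryHint h = MemoryHint::Reuse & MemoryHint::Persistent; // compiles only via the opt-in
        return static_cast<int>(h); // 1 & 2 == 0
    }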
diff --git a/tests/CL/Helper.h b/tests/CL/Helper.h
index e0d584c5ce..e548af4938 100644
--- a/tests/CL/Helper.h
+++ b/tests/CL/Helper.h
@@ -24,13 +24,15 @@
 #ifndef ARM_COMPUTE_TEST_CL_HELPER_H
 #define ARM_COMPUTE_TEST_CL_HELPER_H
 
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLMemsetKernel.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 #include "arm_compute/runtime/IFunction.h"
+
+#include "src/core/CL/ICLKernel.h"
+
 #include "support/MemorySupport.h"
 
 namespace arm_compute
@@ -93,7 +95,7 @@ class CLSynthetizeFunctionWithZeroConstantBorder : public ICLSimpleFunction
         auto k = arm_compute::support::cpp14::make_unique<K>();
         k->configure(first, std::forward<Args>(args)...);
         _kernel = std::move(k);
-        _border_handler.configure(first, BorderSize(bordersize), BorderMode::CONSTANT, PixelValue());
+        _border_handler->configure(first, BorderSize(bordersize), BorderMode::CONSTANT, PixelValue());
     }
 };
diff --git a/tests/GLES_COMPUTE/GCAccessor.h b/tests/GLES_COMPUTE/GCAccessor.h
index 2a8733cbc5..65df0a5ddc 100644
--- a/tests/GLES_COMPUTE/GCAccessor.h
+++ b/tests/GLES_COMPUTE/GCAccessor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,8 +50,8 @@ class GCAccessor : public IAccessor
     GCAccessor &operator=(const GCAccessor &) = delete;
     /** Allow instances of this class to be move constructed */
     GCAccessor(GCAccessor &&) = default;
-    /** Allow instances of this class to be moved */
-    GCAccessor &operator=(GCAccessor &&) = default;
+    /** Prevent instances of this class from being moved */
+    GCAccessor &operator=(GCAccessor &&) = delete;
 
     /** Destructor that unmaps the GLES memory. */
     ~GCAccessor();
diff --git a/tests/NEON/Helper.h b/tests/NEON/Helper.h
index d1ae37ec78..ea47a416b1 100644
--- a/tests/NEON/Helper.h
+++ b/tests/NEON/Helper.h
@@ -26,6 +26,8 @@
 
 #include "arm_compute/runtime/Array.h"
 #include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
 #include "support/MemorySupport.h"
 #include "tests/Globals.h"
 
@@ -52,7 +54,7 @@ void fill_tensors(D &&dist, std::initializer_list<int> seeds, T &&tensor, Ts &&.
 
 /** This template synthetizes an INESimpleFunction which runs the given kernel K */
 template <typename K>
-class NESynthetizeFunction : public INESimpleFunction
+class NESynthetizeFunction : public INESimpleFunctionNoBorder
 {
 public:
     /** Configure the kernel.
@@ -93,7 +95,10 @@ class NESynthetizeFunctionWithZeroConstantBorder : public INESimpleFunction
         auto k = arm_compute::support::cpp14::make_unique<K>();
         k->configure(first, std::forward<Args>(args)...);
         _kernel = std::move(k);
-        _border_handler.configure(first, BorderSize(bordersize), BorderMode::CONSTANT, PixelValue());
+
+        auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+        b->configure(first, BorderSize(bordersize), BorderMode::CONSTANT, PixelValue());
+        _border_handler = std::move(b);
     }
 };
 
@@ -113,7 +118,10 @@ class NESynthetizeFunctionWithZeroConstantKernelBorder : public INESimpleFunctio
         auto k = arm_compute::support::cpp14::make_unique<K>();
         k->configure(first, std::forward<Args>(args)...);
         _kernel = std::move(k);
-        _border_handler.configure(first, BorderSize(_kernel->border_size()), BorderMode::CONSTANT, PixelValue());
+
+        auto b = arm_compute::support::cpp14::make_unique<NEFillBorderKernel>();
+        b->configure(first, BorderSize(_kernel->border_size()), BorderMode::CONSTANT, PixelValue());
+        _border_handler = std::move(b);
     }
 };
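The GCAccessor hunk above deletes move assignment rather than defaulting it: the accessor unmaps GLES memory in its destructor, and a member-wise default operator= would overwrite the target's mapping without unmapping it first. A hedged illustration of why the default is unsafe for such RAII types (this wrapper is invented, not the real GCAccessor):

    class MappedRegion
    {
    public:
        explicit MappedRegion(void *mapping) : _mapping(mapping) {}
        MappedRegion(MappedRegion &&) = default;           // kept defaulted, mirroring GCAccessor
        MappedRegion &operator=(MappedRegion &&) = delete; // a defaulted version would drop
                                                           // *this's mapping without unmapping it
        ~MappedRegion()
        {
            // unmap(_mapping) would happen here
        }

    private:
        void *_mapping;
    };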
-251,7 +251,7 @@ if test_env['benchmark_examples']: for file in Glob("%s/graph_*.cpp" % examples_folder ): example = "benchmark_" + os.path.basename(os.path.splitext(str(file))[0]) if env['os'] in ['android', 'bare_metal'] or env['standalone']: - prog = test_env.Program(example, [ test_env.Object(source=file, target=example), graph_utils, graph_params]+ files_benchmark_examples, LIBS = test_env["LIBS"], LINKFLAGS=test_env["LINKFLAGS"]+['-Wl,--whole-archive',arm_compute_lib,'-Wl,--no-whole-archive'] + bm_link_flags) + prog = test_env.Program(example, [ test_env.Object(source=file, target=example), graph_utils, graph_params]+ files_benchmark_examples, LIBS = test_env["LIBS"], LINKFLAGS=test_env["LINKFLAGS"]+['-Wl,--whole-archive',arm_compute_lib,'-Wl,--no-whole-archive'] + bm_link_flags + extra_link_flags) arm_compute_benchmark_examples += [ prog ] else: #-Wl,--allow-shlib-undefined: Ignore dependencies of dependencies diff --git a/tests/benchmark/CL/Scale.cpp b/tests/benchmark/CL/Scale.cpp index 58727edcae..8a1ceb663e 100644 --- a/tests/benchmark/CL/Scale.cpp +++ b/tests/benchmark/CL/Scale.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,6 +26,7 @@ #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" #include "arm_compute/runtime/CL/functions/CLScale.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "tests/CL/CLAccessor.h" #include "tests/benchmark/fixtures/ScaleFixture.h" #include "tests/datasets/BorderModeDataset.h" diff --git a/tests/benchmark/fixtures/ScaleFixture.h b/tests/benchmark/fixtures/ScaleFixture.h index 1fea66fbe2..953872ea64 100644 --- a/tests/benchmark/fixtures/ScaleFixture.h +++ b/tests/benchmark/fixtures/ScaleFixture.h @@ -75,7 +75,7 @@ class ScaleFixture : public framework::Fixture dst = create_tensor(shape_scaled, data_type); // Create and configure function - scale_func.configure(&src, &dst, ScaleKernelInfo{ policy, border_mode, constant_border_value, sampling_policy }); + scale_func.configure(&src, &dst, ScaleKernelInfo{ policy, border_mode, constant_border_value, sampling_policy, false }); // Allocate tensors src.allocator()->allocate(); diff --git a/tests/datasets/BorderModeDataset.h b/tests/datasets/BorderModeDataset.h index 84a7a4cfb1..bb90ad2214 100644 --- a/tests/datasets/BorderModeDataset.h +++ b/tests/datasets/BorderModeDataset.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,8 +27,6 @@ #include "arm_compute/core/Types.h" #include "tests/framework/datasets/ContainerDataset.h" -#include "utils/TypePrinter.h" - namespace arm_compute { namespace test diff --git a/tests/datasets/DepthwiseConvolutionLayerDataset.h b/tests/datasets/DepthwiseConvolutionLayerDataset.h index 5d516b5dd1..ed596d6d45 100644 --- a/tests/datasets/DepthwiseConvolutionLayerDataset.h +++ b/tests/datasets/DepthwiseConvolutionLayerDataset.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
diff --git a/tests/datasets/BorderModeDataset.h b/tests/datasets/BorderModeDataset.h
index 84a7a4cfb1..bb90ad2214 100644
--- a/tests/datasets/BorderModeDataset.h
+++ b/tests/datasets/BorderModeDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -27,8 +27,6 @@
 #include "arm_compute/core/Types.h"
 #include "tests/framework/datasets/ContainerDataset.h"
-#include "utils/TypePrinter.h"
-
 namespace arm_compute
 {
 namespace test
diff --git a/tests/datasets/DepthwiseConvolutionLayerDataset.h b/tests/datasets/DepthwiseConvolutionLayerDataset.h
index 5d516b5dd1..ed596d6d45 100644
--- a/tests/datasets/DepthwiseConvolutionLayerDataset.h
+++ b/tests/datasets/DepthwiseConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -162,7 +162,7 @@ class SmallDepthwiseConvolutionLayerDataset3x3 final : public DepthwiseConvoluti
     SmallDepthwiseConvolutionLayerDataset3x3()
     {
         add_config(TensorShape(3U, 3U, 2U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 0));
-        add_config(TensorShape(7U, 7U, 3U, 2U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 0));
+        add_config(TensorShape(7U, 8U, 3U, 2U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 0));
         add_config(TensorShape(21U, 31U, 9U, 4U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 0));
         // Asymmetric padding
         add_config(TensorShape(33U, 27U, 11U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR));
diff --git a/tests/datasets/LocallyConnectedDataset.h b/tests/datasets/LocallyConnectedDataset.h
deleted file mode 100644
index 5d2017dba7..0000000000
--- a/tests/datasets/LocallyConnectedDataset.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_LOCALLYCONNECTED_DATASET
-#define ARM_COMPUTE_TEST_LOCALLYCONNECTED_DATASET
-
-#include "utils/TypePrinter.h"
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-
-#include "tests/datasets/ConvolutionLayerDataset.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace datasets
-{
-class SmallLocallyConnectedDataset final : public ConvolutionLayerDataset
-{
-public:
-    SmallLocallyConnectedDataset()
-    {
-        // Batch size 1
-        add_config(TensorShape(23U, 27U, 5U), TensorShape(3U, 3U, 5U, 21U, 275U), TensorShape(21U, 275U), TensorShape(11U, 25U, 21U), PadStrideInfo(2, 1, 0, 0));
-        add_config(TensorShape(17U, 31U, 2U), TensorShape(5U, 5U, 2U, 19U, 225U), TensorShape(19U, 225U), TensorShape(15U, 15U, 19U), PadStrideInfo(1, 2, 1, 1));
-        add_config(TensorShape(17U, 31U, 2U), TensorShape(5U, 3U, 2U, 19U, 240U), TensorShape(19U, 240U), TensorShape(15U, 16U, 19U), PadStrideInfo(1, 2, 1, 1));
-        // Batch size 4
-        add_config(TensorShape(23U, 27U, 5U, 4U), TensorShape(3U, 3U, 5U, 21U, 275U), TensorShape(21U, 275U), TensorShape(11U, 25U, 21U, 4U), PadStrideInfo(2, 1, 0, 0));
-        add_config(TensorShape(17U, 31U, 2U, 4U), TensorShape(5U, 5U, 2U, 19U, 225U), TensorShape(19U, 225U), TensorShape(15U, 15U, 19U, 4U), PadStrideInfo(1, 2, 1, 1));
-        add_config(TensorShape(17U, 31U, 2U, 4U), TensorShape(5U, 3U, 2U, 19U, 240U), TensorShape(19U, 240U), TensorShape(15U, 16U, 19U, 4U), PadStrideInfo(1, 2, 1, 1));
-        // FC convolution
-        add_config(TensorShape(1U, 1U, 1024U), TensorShape(1U, 1U, 1024U, 1001U, 1U), TensorShape(1001U, 1U), TensorShape(1U, 1U, 1001U), PadStrideInfo(1, 1, 0, 0));
-    }
-};
-
-class LargeLocallyConnectedDataset final : public ConvolutionLayerDataset
-{
-public:
-    LargeLocallyConnectedDataset()
-    {
-        // Batch size 1
-        add_config(TensorShape(23U, 27U, 5U), TensorShape(3U, 1U, 5U, 21U, 297U), TensorShape(21U, 297U), TensorShape(11U, 27U, 21U), PadStrideInfo(2, 1, 0, 0));
-        add_config(TensorShape(33U, 27U, 7U), TensorShape(5U, 5U, 7U, 16U, 132U), TensorShape(16U, 132U), TensorShape(11U, 12U, 16U), PadStrideInfo(3, 2, 1, 0));
-        add_config(TensorShape(33U, 27U, 7U), TensorShape(5U, 7U, 7U, 16U, 121U), TensorShape(16U, 121U), TensorShape(11U, 11U, 16U), PadStrideInfo(3, 2, 1, 0));
-        // Batch size 4
-        add_config(TensorShape(23U, 27U, 5U, 4U), TensorShape(3U, 1U, 5U, 21U, 297U), TensorShape(21U, 297U), TensorShape(11U, 27U, 21U, 4U), PadStrideInfo(2, 1, 0, 0));
-        add_config(TensorShape(33U, 27U, 7U, 4U), TensorShape(5U, 5U, 7U, 16U, 132U), TensorShape(16U, 132U), TensorShape(11U, 12U, 16U, 4U), PadStrideInfo(3, 2, 1, 0));
-        add_config(TensorShape(33U, 27U, 7U, 4U), TensorShape(5U, 7U, 7U, 16U, 121U), TensorShape(16U, 121U), TensorShape(11U, 11U, 16U, 4U), PadStrideInfo(3, 2, 1, 0));
-        // Arbitrary batch size
-        add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U, 121U), TensorShape(16U, 121U), TensorShape(11U, 11U, 16U, 5U), PadStrideInfo(3, 2, 1, 0));
-    }
-};
-
-} // namespace datasets
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_LOCALLYCONNECTED_DATASET */
diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h
index ccd0756c2d..1f7bdb2232 100644
--- a/tests/datasets/ShapeDatasets.h
+++ b/tests/datasets/ShapeDatasets.h
@@ -121,6 +121,7 @@ class Small3DShapes final : public ShapeDataset
     TensorShape{ 2U, 5U, 4U },
     TensorShape{ 7U, 7U, 5U },
+    TensorShape{ 16U, 16U, 5U },
     TensorShape{ 27U, 13U, 37U },
 })
 {
@@ -179,6 +180,7 @@ class SmallShapes final : public ShapeDataset
 {
     // Batch size 1
     TensorShape{ 11U, 11U },
+    TensorShape{ 16U, 16U },
     TensorShape{ 27U, 13U, 7U },
     TensorShape{ 31U, 27U, 17U, 2U },
     // Batch size 4
diff --git a/tests/framework/instruments/SchedulerTimer.cpp b/tests/framework/instruments/SchedulerTimer.cpp
index aa69bc297d..b4d1c597e7 100644
--- a/tests/framework/instruments/SchedulerTimer.cpp
+++ b/tests/framework/instruments/SchedulerTimer.cpp
@@ -26,8 +26,8 @@
 #include "Instruments.h"
 #include "WallClockTimer.h"
 #include "arm_compute/core/CPP/ICPPKernel.h"
-#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/graph/INode.h"
+#include "support/Cast.h"
 namespace arm_compute
 {
diff --git a/tests/validate_examples/cl_gemm.cpp b/tests/validate_examples/cl_gemm.cpp
index 34895840e1..99f7513624 100644
--- a/tests/validate_examples/cl_gemm.cpp
+++ b/tests/validate_examples/cl_gemm.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -26,10 +26,26 @@
 #endif /* ARM_COMPUTE_CL */
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/CL/CLFunctions.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-
+#include "arm_compute/runtime/CL/functions/CLGEMM.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
+#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
+#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
+#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
+#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
+#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
+#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
+#include "src/core/CL/kernels/CLIm2ColKernel.h"
+#include "src/core/CL/kernels/CLWeightsReshapeKernel.h"
 #include "tests/AssetsLibrary.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/Globals.h"
@@ -59,40 +75,6 @@ RelativeTolerance tolerance_f32(0.001f); /**< F32 Toleran
 RelativeTolerance tolerance_f16(half(0.2)); /**< F16 Tolerance value for comparing reference's output against implementation's output for floating point data types */
 constexpr float tolerance_num_f16 = 0.02f; /**< F16 Tolerance number */
-namespace arm_compute
-{
-DataType data_type_from_name(const std::string &name)
-{
-    static const std::map data_types =
-    {
-        { "f16", DataType::F16 },
-        { "f32", DataType::F32 },
-        { "qasymm8", DataType::QASYMM8 },
-    };
-
-#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
-    try
-    {
-#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
-        return data_types.at(utility::tolower(name));
-
-#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
-    }
-    catch(const std::out_of_range &)
-    {
-        throw std::invalid_argument(name);
-    }
-#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
-}
-
-inline ::std::istream &operator>>(::std::istream &stream, DataType &data_type)
-{
-    std::string value;
-    stream >> value;
-    data_type = data_type_from_name(value);
-    return stream;
-}
-} // namespace arm_compute
 namespace
 {
 class GEMMCommandLineOptions final
diff --git a/tests/validation/CL/AbsoluteDifference.cpp b/tests/validation/CL/AbsoluteDifference.cpp
index b2f0280fdc..8d7d2fd3ce 100644
--- a/tests/validation/CL/AbsoluteDifference.cpp
+++ b/tests/validation/CL/AbsoluteDifference.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -57,29 +57,6 @@ template
 using CLAbsoluteDifferenceFixture = AbsoluteDifferenceValidationFixture;
 TEST_SUITE(U8)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), AbsoluteDifferenceU8Dataset),
-               shape, data_type0, data_type1, output_data_type)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor(shape, data_type0);
-    CLTensor ref_src2 = create_tensor(shape, data_type1);
-    CLTensor dst = create_tensor(shape, output_data_type);
-
-    // Create and Configure function
-    CLAbsoluteDifference abs_diff;
-    abs_diff.configure(&ref_src1, &ref_src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLAbsoluteDifferenceFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), AbsoluteDifferenceU8Dataset))
 {
     // Validate output
@@ -93,29 +70,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLAbsoluteDifferenceFixture, framework
 TEST_SUITE_END() // U8
 TEST_SUITE(S16)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), AbsoluteDifferenceS16Dataset),
-               shape, data_type0, data_type1, output_data_type)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor(shape, data_type0);
-    CLTensor ref_src2 = create_tensor(shape, data_type1);
-    CLTensor dst = create_tensor(shape, output_data_type);
-
-    // Create and Configure function
-    CLAbsoluteDifference abs_diff;
-    abs_diff.configure(&ref_src1, &ref_src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLAbsoluteDifferenceFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), AbsoluteDifferenceS16Dataset))
 {
     // Validate output
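From here on, many CL test files drop their DATA_TEST_CASE(Configuration, ...) blocks. Those blocks asserted the exact padding that the old fixed-window kernels requested; with the padding-less kernel rework the expected values are no longer meaningful. As a reminder of the rule the deleted checks encoded, a self-contained sketch (my paraphrase of the PaddingCalculator behaviour, not the framework's code):

#include <cstddef>

// With kernels that process `step` elements per iteration over a row of
// `width` elements, the old code padded the row up to the next multiple
// of `step`; the deleted tests asserted exactly this amount.
constexpr std::size_t required_right_padding(std::size_t width, std::size_t step)
{
    return ((width + step - 1) / step) * step - width;
}

static_assert(required_right_padding(27, 16) == 5, "27 rounds up to 32");
static_assert(required_right_padding(32, 16) == 0, "32 is already aligned");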
diff --git a/tests/validation/CL/Accumulate.cpp b/tests/validation/CL/Accumulate.cpp
index ee2d2521cb..3e0175ea78 100644
--- a/tests/validation/CL/Accumulate.cpp
+++ b/tests/validation/CL/Accumulate.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -53,27 +53,6 @@ TEST_SUITE(CL)
 TEST_SUITE(Accumulate)
 TEST_SUITE(U8)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), AccumulateS16Dataset),
-               shape, data_type, output_data_type)
-{
-    // Create tensors
-    CLTensor ref_src = create_tensor(shape, data_type);
-    CLTensor dst = create_tensor(shape, output_data_type);
-
-    // Create and Configure function
-    CLAccumulate accum;
-    accum.configure(&ref_src, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(ref_src.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 template
 using CLAccumulateFixture = AccumulateValidationFixture;
@@ -94,32 +73,6 @@ TEST_SUITE_END()
 TEST_SUITE(AccumulateWeighted)
 TEST_SUITE(U8)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), AccumulateU8Dataset),
-               shape, data_type, output_data_type)
-{
-    // Generate a random alpha value
-    std::mt19937 gen(library->seed());
-    std::uniform_real_distribution<> float_dist(0, 1);
-    const float alpha = float_dist(gen);
-
-    // Create tensors
-    CLTensor ref_src = create_tensor(shape, data_type);
-    CLTensor dst = create_tensor(shape, output_data_type);
-
-    // Create and Configure function
-    CLAccumulateWeighted accum_weight;
-    accum_weight.configure(&ref_src, alpha, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(ref_src.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 template
 using CLAccumulateWeightedFixture = AccumulateWeightedValidationFixture;
@@ -140,32 +93,6 @@ TEST_SUITE_END()
 TEST_SUITE(AccumulateSquared)
 TEST_SUITE(U8)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), AccumulateS16Dataset),
-               shape, data_type, output_data_type)
-{
-    // Generate a random shift value
-    std::mt19937 gen(library->seed());
-    std::uniform_int_distribution int_dist(0, 15);
-    const uint32_t shift = int_dist(gen);
-
-    // Create tensors
-    CLTensor ref_src = create_tensor(shape, data_type);
-    CLTensor dst = create_tensor(shape, output_data_type);
-
-    // Create and Configure function
-    CLAccumulateSquared accum_square;
-    accum_square.configure(&ref_src, shift, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(ref_src.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 template
 using CLAccumulateSquaredFixture = AccumulateSquaredValidationFixture;
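The deleted Accumulate configuration tests only probed padding; the fixtures that remain still verify the arithmetic. For orientation, the update rules behind the random alpha and shift values, stated from the usual OpenVX-style definitions rather than copied from the library:

// accumulate:          out += in                    (S16 accumulator)
// accumulate weighted: out = (1 - alpha) * out + alpha * in, alpha in [0, 1]
// accumulate squared:  out += (in * in) >> shift,   shift in [0, 15]
float accumulate_weighted(float out, float in, float alpha)
{
    return (1.0f - alpha) * out + alpha * in;
}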
diff --git a/tests/validation/CL/ActivationLayer.cpp b/tests/validation/CL/ActivationLayer.cpp
index 1fef384a8e..9b725a44e7 100644
--- a/tests/validation/CL/ActivationLayer.cpp
+++ b/tests/validation/CL/ActivationLayer.cpp
@@ -89,61 +89,16 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
 /** Input data sets.
 */
 const auto ActivationDataset = combine(combine(framework::dataset::make("InPlace", { false, true }), datasets::ActivationFunctions()), framework::dataset::make("AlphaBeta", { 0.5f, 1.f }));
+
 } // namespace
 TEST_SUITE(CL)
 TEST_SUITE(ActivationLayer)
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), CNNDataTypes), framework::dataset::make("InPlace", { false, true })),
-               shape, data_type, in_place)
-{
-    // Create context
-    auto ctx = parameters->get_ctx();
-
-    // Create tensors
-    CLTensor src = create_tensor(shape, data_type, 1, QuantizationInfo(), DataLayout::NCHW, ctx);
-    CLTensor dst = create_tensor(shape, data_type, 1, QuantizationInfo(), DataLayout::NCHW, ctx);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLActivationLayer act_layer(ctx);
-
-    if(in_place)
-    {
-        act_layer.configure(&src, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ABS));
-    }
-    else
-    {
-        act_layer.configure(&src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ABS));
-    }
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src.info()->valid_region(), valid_region);
-
-    if(!in_place)
-    {
-        validate(dst.info()->valid_region(), valid_region);
-    }
-
-    // Validate padding
-    const int step = 16 / arm_compute::data_size_from_type(data_type);
-    const PaddingSize padding = PaddingCalculator(shape.x(), step).required_padding();
-    validate(src.info()->padding(), padding);
-
-    if(!in_place)
-    {
-        validate(dst.info()->padding(), padding);
-    }
-}
-
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
     framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Mismatching data types
-                                            TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Window shrink
+                                            TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),
                                             TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::QASYMM8), // Invalid quantization info
@@ -172,11 +127,12 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                         ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
                                         ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SQRT),
                                     })),
-    framework::dataset::make("Expected", { false, false, true, true, false, false, true, true, false })),
+    framework::dataset::make("Expected", { false, true, true, true, false, false, true, true, false })),
     input_info, output_info, act_info, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLActivationLayer::validate(&input_info.clone()->set_is_resizable(false), (output_info.total_size() == 0) ? nullptr : &output_info.clone()->set_is_resizable(false), act_info)) == expected, framework::LogLevel::ERRORS);
 }
+
 // clang-format on
 // *INDENT-ON*
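Worth noting in the Validate dataset above: the second expected value flips from false to true. The 27-element-wide F32 case used to be rejected because the kernel's fixed execution window could not shrink to fit; with padding-less kernels it now validates. A sketch of the equivalent standalone check (activation function chosen arbitrarily for illustration):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"

bool window_shrink_case_is_valid()
{
    using namespace arm_compute;
    const TensorInfo input(TensorShape(27U, 13U, 2U), 1, DataType::F32);
    const TensorInfo output(TensorShape(27U, 13U, 2U), 1, DataType::F32);
    const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::RELU);
    // Returned an error status before this change; returns an OK status after it.
    return bool(CLActivationLayer::validate(&input, &output, act));
}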
diff --git a/tests/validation/CL/ArgMinMax.cpp b/tests/validation/CL/ArgMinMax.cpp
index 7dcd22e795..2508c63524 100644
--- a/tests/validation/CL/ArgMinMax.cpp
+++ b/tests/validation/CL/ArgMinMax.cpp
@@ -22,12 +22,11 @@
 * SOFTWARE.
 */
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
 #include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"
 #include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/datasets/ShapeDatasets.h"
 #include "tests/datasets/SplitDataset.h"
@@ -54,7 +53,8 @@ const auto ArgMinMaxSmallDataset = framework::dataset::make("Shape",
     TensorShape{ 2560, 2U, 2U, 2U },
 });
-const auto ArgMinMaxLargeDataset = framework::dataset::make("Shape", { TensorShape{ 517U, 123U, 13U, 2U } });
+const auto ArgMinMaxLargeDataset = framework::dataset::make("Shape",
+{ TensorShape{ 517U, 123U, 13U, 2U } });
 } // namespace
 TEST_SUITE(CL)
 TEST_SUITE(ArgMinMax)
diff --git a/tests/validation/CL/ArithmeticAddition.cpp b/tests/validation/CL/ArithmeticAddition.cpp
index 93faa7e63a..c74f6a3b23 100644
--- a/tests/validation/CL/ArithmeticAddition.cpp
+++ b/tests/validation/CL/ArithmeticAddition.cpp
@@ -78,23 +78,20 @@ TEST_SUITE(ArithmeticAddition)
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
     framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                              TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                             TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8), // Window shrink
                                              TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid data type combination
                                              TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching shapes
                                            }),
     framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                            TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                             TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                           })),
     framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                            TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                           })),
-    framework::dataset::make("Expected", { true, true, false, false, false})),
+    framework::dataset::make("Expected", { true, true, false, false})),
     input1_info, input2_info, output_info, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLArithmeticAddition::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), ConvertPolicy::WRAP)) == expected, framework::LogLevel::ERRORS);
@@ -172,6 +169,18 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionQuantizedFixture,
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
+template
+using CLArithmeticAdditionBroadcastQuantizedFixture = ArithmeticAdditionValidationQuantizedBroadcastFixture;
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CLArithmeticAdditionBroadcastQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
+                       ArithmeticAdditionQASYMM8Dataset),
+                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                       framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
+                       framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                       framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
 TEST_SUITE_END() // QASYMM8
 TEST_SUITE(QASYMM8_SIGNED)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
diff --git a/tests/validation/CL/ArithmeticDivision.cpp b/tests/validation/CL/ArithmeticDivision.cpp
index 82a0ec5b6a..36567dc02a 100644
--- a/tests/validation/CL/ArithmeticDivision.cpp
+++ b/tests/validation/CL/ArithmeticDivision.cpp
@@ -46,7 +46,6 @@ namespace
 RelativeTolerance tolerance_fp32(0.000001f);
 RelativeTolerance tolerance_fp16(0.001f);
-constexpr unsigned int num_elems_processed_per_iteration = 16;
 /** Input data sets **/
 const auto ArithmeticDivisionFP16Dataset = combine(combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("DataType", DataType::F16));
@@ -69,23 +68,20 @@ TEST_SUITE(ArithmeticDivision)
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
     framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                              TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
-                                             TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Window shrink
                                              TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Invalid data type combination
                                              TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching shapes
                                            }),
     framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
-                                            TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                             TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                           })),
     framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
-                                            TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                           })),
-    framework::dataset::make("Expected", { true, false, false, false, false})),
+    framework::dataset::make("Expected", { true, false, false, false})),
     input1_info, input2_info, output_info, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLArithmeticDivision::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false))) == expected, framework::LogLevel::ERRORS);
@@ -113,29 +109,6 @@ FIXTURE_DATA_TEST_CASE(RunWithActivation, CLArithmeticDivisionFloatFixture
 TEST_SUITE_END()
 TEST_SUITE(FP32)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(),
-               shape)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor(shape, DataType::F32);
-    CLTensor ref_src2 = create_tensor(shape, DataType::F32);
-    CLTensor dst = create_tensor(shape, DataType::F32);
-
-    // Create and Configure function
-    CLArithmeticDivision add;
-    add.configure(&ref_src1, &ref_src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticDivisionFloatFixture, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticDivisionFP32Dataset),
                        EmptyActivationFunctionsDataset))
 {
diff --git a/tests/validation/CL/ArithmeticSubtraction.cpp b/tests/validation/CL/ArithmeticSubtraction.cpp
index 52d787febb..2709fcaedb 100644
--- a/tests/validation/CL/ArithmeticSubtraction.cpp
+++ b/tests/validation/CL/ArithmeticSubtraction.cpp
@@ -43,7 +43,6 @@ namespace validation
 {
 namespace
 {
-constexpr unsigned int num_elems_processed_per_iteration = 16;
 /** Input data sets **/
 const auto ArithmeticSubtractionU8Dataset = combine(combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U8)), framework::dataset::make("DataType",
@@ -82,23 +81,20 @@ TEST_SUITE(ArithmeticSubtraction)
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
     framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                              TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                             TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8), // Window shrink
                                              TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid data type combination
                                              TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching shapes
                                            }),
     framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                            TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                             TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                           })),
     framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                            TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                           })),
-    framework::dataset::make("Expected", { true, true, false, false, false})),
+    framework::dataset::make("Expected", { true, true, false, false})),
     input1_info, input2_info, output_info, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLArithmeticSubtraction::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), ConvertPolicy::WRAP)) == expected, framework::LogLevel::ERRORS);
@@ -163,32 +159,9 @@ using CLArithmeticSubtractionFixture = ArithmeticSubtractionValidationFixture
 TEST_SUITE(U8)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-               shape, policy)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor(shape, DataType::U8);
-    CLTensor ref_src2 = create_tensor(shape, DataType::U8);
-    CLTensor dst = create_tensor(shape, DataType::U8);
-
-    // Create and Configure function
-    CLArithmeticSubtraction add;
-    add.configure(&ref_src1, &ref_src2, &dst, policy);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
-FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionU8Dataset),
-                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                       OutOfPlaceDataSet))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionU8Dataset),
+                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -196,30 +169,6 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFixture, framew
 TEST_SUITE_END() // U8
 TEST_SUITE(S16)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::U8, DataType::S16 })),
-               framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-               shape, data_type, policy)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor(shape, data_type);
-    CLTensor ref_src2 = create_tensor(shape, DataType::S16);
-    CLTensor dst = create_tensor(shape, DataType::S16);
-
-    // Create and Configure function
-    CLArithmeticSubtraction add;
-    add.configure(&ref_src1, &ref_src2, &dst, policy);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionS16Dataset),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                        OutOfPlaceDataSet))
@@ -243,29 +192,6 @@ using CLArithmeticSubtractionQuantizedFixture = ArithmeticSubtractionValidationQ
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-               shape, policy)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor(shape, DataType::QASYMM8);
-    CLTensor ref_src2 = create_tensor(shape, DataType::QASYMM8);
-    CLTensor dst = create_tensor(shape, DataType::QASYMM8);
-
-    // Create and Configure function
-    CLArithmeticSubtraction add;
-    add.configure(&ref_src1, &ref_src2, &dst, policy);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
                        ArithmeticSubtractionQASYMM8Dataset),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
@@ -279,29 +205,6 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionQuantizedFixture
 TEST_SUITE(QASYMM8_SIGNED)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-               shape, policy)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor(shape, DataType::QASYMM8_SIGNED);
-    CLTensor ref_src2 = create_tensor(shape, DataType::QASYMM8_SIGNED);
-    CLTensor dst = create_tensor(shape, DataType::QASYMM8_SIGNED);
-
-    // Create and Configure function
-    CLArithmeticSubtraction sub;
-    sub.configure(&ref_src1, &ref_src2, &dst, policy);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
                        ArithmeticSubtractionQASYMM8SignedDataset),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
@@ -315,29 +218,6 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionQuantizedFixture
 }
 TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE(QSYMM16)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-               shape, policy)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor(shape, DataType::QSYMM16);
-    CLTensor ref_src2 = create_tensor(shape, DataType::QSYMM16);
-    CLTensor dst = create_tensor(shape, DataType::QSYMM16);
-
-    // Create and Configure function
-    CLArithmeticSubtraction sub;
-    sub.configure(&ref_src1, &ref_src2, &dst, policy);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
                        ArithmeticSubtractionQSYMM16Dataset),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
@@ -377,29 +257,6 @@ FIXTURE_DATA_TEST_CASE(RunWithActivation, CLArithmeticSubtractionFloatFixture
 TEST_SUITE(FP32)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-               shape, policy)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor(shape, DataType::F32);
-    CLTensor ref_src2 = create_tensor(shape, DataType::F32);
-    CLTensor dst = create_tensor(shape, DataType::F32);
-
-    // Create and Configure function
-    CLArithmeticSubtraction add;
-    add.configure(&ref_src1, &ref_src2, &dst, policy);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFloatFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionFP32Dataset),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
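All of the deleted subtraction configuration blocks funnel into the same call, which the surviving fixtures still exercise across U8, S16, quantized and float types. A minimal sketch of that call and the policy choice it carries (the header location follows this release's layout, an assumption worth checking):

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"

void configure_sub(arm_compute::CLTensor &a, arm_compute::CLTensor &b, arm_compute::CLTensor &out)
{
    using namespace arm_compute;
    CLArithmeticSubtraction sub;
    // SATURATE clamps on overflow; WRAP lets the integer result wrap around.
    sub.configure(&a, &b, &out, ConvertPolicy::SATURATE);
}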
diff --git a/tests/validation/CL/BatchConcatenateLayer.cpp b/tests/validation/CL/BatchConcatenateLayer.cpp
index 6fd189b716..522a6ab8ee 100644
--- a/tests/validation/CL/BatchConcatenateLayer.cpp
+++ b/tests/validation/CL/BatchConcatenateLayer.cpp
@@ -80,30 +80,10 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
     bool is_valid = bool(CLConcatenateLayer::validate(inputs_vector_info_raw, &output_info.clone()->set_is_resizable(false), 3));
     ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
 }
+
 // clang-format on
 // *INDENT-ON*
-TEST_CASE(Configuration, framework::DatasetMode::ALL)
-{
-    // Create tensors
-    CLTensor src1 = create_tensor(TensorShape(128U, 32U, 32U), DataType::F32, 1);
-    CLTensor src2 = create_tensor(TensorShape(128U, 32U, 32U), DataType::F32, 1);
-    CLTensor src3 = create_tensor(TensorShape(128U, 32U, 32U), DataType::F32, 1);
-    CLTensor dst;
-
-    ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(src3.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLConcatenateLayer concat_layer;
-    std::vector inputs;
-    inputs.emplace_back(&src1);
-    inputs.emplace_back(&src2);
-    inputs.emplace_back(&src3);
-    concat_layer.configure(inputs, &dst, 3);
-}
 template
 using CLBatchConcatenateLayerFixture = ConcatenateLayerValidationFixture;
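The removed Configuration smoke test above still documents the shape of the concatenation API: a vector of inputs plus the concatenation axis, with 3 selecting the batch dimension. A sketch reconstructed from that deleted code; the exact element type of the vector is an assumption, since the original declaration was damaged in extraction:

#include <vector>
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"

void configure_batch_concat(arm_compute::CLTensor &src1, arm_compute::CLTensor &src2, arm_compute::CLTensor &dst)
{
    using namespace arm_compute;
    CLConcatenateLayer concat;
    std::vector<const ICLTensor *> inputs; // element type assumed
    inputs.emplace_back(&src1);
    inputs.emplace_back(&src2);
    concat.configure(inputs, &dst, 3); // axis 3 = batch concatenation
}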
diff --git a/tests/validation/CL/BatchNormalizationLayer.cpp b/tests/validation/CL/BatchNormalizationLayer.cpp
index cb17204279..8b3bdbc3ea 100644
--- a/tests/validation/CL/BatchNormalizationLayer.cpp
+++ b/tests/validation/CL/BatchNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -58,10 +58,12 @@ const auto act_infos = framework::dataset::make("Activat
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 8.f, 2.f),
 });
-const auto common_fusion_dataset = combine(combine(combine(framework::dataset::make("UseBias", { false, true }),
-                                           framework::dataset::make("UseBeta", { false, true })),
-                                           framework::dataset::make("UseGamma", { false, true })),
-                                           framework::dataset::make("Epsilon", { 0.001f }));
+const auto common_fusion_dataset = combine(combine(combine(framework::dataset::make("UseBias",
+{ false, true }),
+framework::dataset::make("UseBeta", { false, true })),
+framework::dataset::make("UseGamma", { false, true })),
+framework::dataset::make("Epsilon", { 0.001f }));
+
 } // namespace
 TEST_SUITE(CL)
@@ -70,38 +72,6 @@ TEST_SUITE(BatchNormalizationLayer)
 template
 using CLBatchNormalizationLayerFixture = BatchNormalizationLayerValidationFixture;
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallRandomBatchNormalizationLayerDataset(),
-               combine(framework::dataset::make("UseBeta", { false, true }),
-                       framework::dataset::make("UseGamma", { false, true }))),
-               framework::dataset::make("DataType", { DataType::F16, DataType::F32 })),
-               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-               shape0, shape1, epsilon, use_gamma, use_beta, dt, data_layout)
-{
-    TensorShape src_dst_shapes = shape0;
-    if(data_layout == DataLayout::NHWC)
-    {
-        permute(src_dst_shapes, PermutationVector(2U, 0U, 1U));
-    }
-
-    // Create tensors
-    CLTensor src = create_tensor(src_dst_shapes, dt, 1, QuantizationInfo(), data_layout);
-    CLTensor dst = create_tensor(src_dst_shapes, dt, 1, QuantizationInfo(), data_layout);
-    CLTensor mean = create_tensor(shape1, dt, 1);
-    CLTensor var = create_tensor(shape1, dt, 1);
-    CLTensor beta = create_tensor(shape1, dt, 1);
-    CLTensor gamma = create_tensor(shape1, dt, 1);
-
-    // Create and Configure function
-    CLBatchNormalizationLayer norm;
-    CLTensor *beta_ptr = use_beta ? &beta : nullptr;
-    CLTensor *gamma_ptr = use_gamma ? &gamma : nullptr;
-    norm.configure(&src, &dst, &mean, &var, beta_ptr, gamma_ptr, epsilon);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(src_dst_shapes);
-    validate(dst.info()->valid_region(), valid_region);
-}
-
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
@@ -153,8 +123,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(Random, CLBatchNormalizationLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallRandomBatchNormalizationLayerDataset(),
-                       combine(framework::dataset::make("UseBeta", { false, true }),
-                               framework::dataset::make("UseGamma", { false, true }))),
+                       combine(framework::dataset::make("UseBeta", { false, true }), framework::dataset::make("UseGamma", { false, true }))),
                        act_infos),
                        framework::dataset::make("DataType", DataType::F32)),
                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
@@ -166,9 +135,9 @@ TEST_SUITE_END() //FP32
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(Random, CLBatchNormalizationLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallRandomBatchNormalizationLayerDataset(),
-                       combine(framework::dataset::make("UseBeta", { false, true }),
-                               framework::dataset::make("UseGamma", { false, true }))),
-                       framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f))),
+                       combine(framework::dataset::make("UseBeta", { false, true }), framework::dataset::make("UseGamma", { false, true }))),
+                       framework::dataset::make("ActivationInfo",
+                                                ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f))),
                        framework::dataset::make("DataType", DataType::F16)),
                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
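The deleted block above also captured a useful property of the batch-normalization API: beta and gamma are optional, and passing nullptr falls back to the defaults (0 for beta, 1 for gamma, per the library's documentation). A condensed sketch of that optional-parameter pattern, using the same configure signature as the removed code:

#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"

void configure_bn(arm_compute::CLTensor &src, arm_compute::CLTensor &dst,
                  arm_compute::CLTensor &mean, arm_compute::CLTensor &var,
                  arm_compute::CLTensor *beta, arm_compute::CLTensor *gamma)
{
    arm_compute::CLBatchNormalizationLayer norm;
    // beta/gamma may be nullptr, as in the removed test's use_beta/use_gamma paths.
    norm.configure(&src, &dst, &mean, &var, beta, gamma, 0.001f);
}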
diff --git a/tests/validation/CL/BatchToSpaceLayer.cpp b/tests/validation/CL/BatchToSpaceLayer.cpp
index f553787729..e90ac921c5 100644
--- a/tests/validation/CL/BatchToSpaceLayer.cpp
+++ b/tests/validation/CL/BatchToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
diff --git a/tests/validation/CL/BitwiseAnd.cpp b/tests/validation/CL/BitwiseAnd.cpp
index 76db5bb627..4fc3d9ca87 100644
--- a/tests/validation/CL/BitwiseAnd.cpp
+++ b/tests/validation/CL/BitwiseAnd.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -43,34 +43,6 @@ namespace validation
 TEST_SUITE(CL)
 TEST_SUITE(BitwiseAnd)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), shape, data_type)
-{
-    // Create tensors
-    CLTensor src1 = create_tensor(shape, data_type);
-    CLTensor src2 = create_tensor(shape, data_type);
-    CLTensor dst = create_tensor(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLBitwiseAnd bitwise_and;
-    bitwise_and.configure(&src1, &src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src1.info()->valid_region(), valid_region);
-    validate(src2.info()->valid_region(), valid_region);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(src1.info()->padding(), padding);
-    validate(src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 template
 using CLBitwiseAndFixture = BitwiseAndValidationFixture;
diff --git a/tests/validation/CL/BitwiseNot.cpp b/tests/validation/CL/BitwiseNot.cpp
index d3b1c5ddfd..dc884873cc 100644
--- a/tests/validation/CL/BitwiseNot.cpp
+++ b/tests/validation/CL/BitwiseNot.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -43,30 +43,6 @@ namespace validation
 TEST_SUITE(CL)
 TEST_SUITE(BitwiseNot)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), shape, data_type)
-{
-    // Create tensors
-    CLTensor src = create_tensor(shape, data_type);
-    CLTensor dst = create_tensor(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLBitwiseNot bitwise_not;
-    bitwise_not.configure(&src, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src.info()->valid_region(), valid_region);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(src.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 template
 using CLBitwiseNotFixture = BitwiseNotValidationFixture;
diff --git a/tests/validation/CL/BitwiseOr.cpp b/tests/validation/CL/BitwiseOr.cpp
index 585170fc31..5a483ad7bf 100644
--- a/tests/validation/CL/BitwiseOr.cpp
+++ b/tests/validation/CL/BitwiseOr.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -43,34 +43,6 @@ namespace validation
 TEST_SUITE(CL)
 TEST_SUITE(BitwiseOr)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), shape, data_type)
-{
-    // Create tensors
-    CLTensor src1 = create_tensor(shape, data_type);
-    CLTensor src2 = create_tensor(shape, data_type);
-    CLTensor dst = create_tensor(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLBitwiseOr bitwise_or;
-    bitwise_or.configure(&src1, &src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src1.info()->valid_region(), valid_region);
-    validate(src2.info()->valid_region(), valid_region);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(src1.info()->padding(), padding);
-    validate(src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 template
 using CLBitwiseOrFixture = BitwiseOrValidationFixture;
diff --git a/tests/validation/CL/BitwiseXor.cpp b/tests/validation/CL/BitwiseXor.cpp
index bd7ff5ba43..607cdbe574 100644
--- a/tests/validation/CL/BitwiseXor.cpp
+++ b/tests/validation/CL/BitwiseXor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -43,34 +43,6 @@ namespace validation
 TEST_SUITE(CL)
 TEST_SUITE(BitwiseXor)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), shape, data_type)
-{
-    // Create tensors
-    CLTensor src1 = create_tensor(shape, data_type);
-    CLTensor src2 = create_tensor(shape, data_type);
-    CLTensor dst = create_tensor(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLBitwiseXor bitwise_xor;
-    bitwise_xor.configure(&src1, &src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src1.info()->valid_region(), valid_region);
-    validate(src2.info()->valid_region(), valid_region);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(src1.info()->padding(), padding);
-    validate(src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 template
 using CLBitwiseXorFixture = BitwiseXorValidationFixture;
diff --git a/tests/validation/CL/BoundingBoxTransform.cpp b/tests/validation/CL/BoundingBoxTransform.cpp
index 82dfa31606..2a7f1667d6 100644
--- a/tests/validation/CL/BoundingBoxTransform.cpp
+++ b/tests/validation/CL/BoundingBoxTransform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
diff --git a/tests/validation/CL/Box3x3.cpp b/tests/validation/CL/Box3x3.cpp
index 8d79189f4c..f4eff823cf 100644
--- a/tests/validation/CL/Box3x3.cpp
+++ b/tests/validation/CL/Box3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -50,41 +50,6 @@ constexpr BorderSize border_size(filter_size / 2); /* Border size of the kerne
 TEST_SUITE(CL)
 TEST_SUITE(Box3x3)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)),
-               datasets::BorderModes()),
-               shape, data_type, border_mode)
-{
-    // Create tensors
-    CLTensor src = create_tensor(shape, data_type);
-    CLTensor dst = create_tensor(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLBox3x3 box3x3;
-    box3x3.configure(&src, &dst, border_mode);
-
-    // Validate valid region
-    const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), border_size);
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 8);
-    calculator.set_border_size(1);
-    calculator.set_border_mode(border_mode);
-
-    const PaddingSize dst_padding = calculator.required_padding();
-
-    calculator.set_accessed_elements(16);
-    calculator.set_access_offset(-1);
-
-    const PaddingSize src_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), src_padding);
-    validate(dst.info()->padding(), dst_padding);
-}
-
 template
 using CLBox3x3Fixture = Box3x3ValidationFixture;
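The Box3x3 constants referenced in the hunk header above encode the usual filter/border relationship: an N x N filter needs floor(N / 2) pixels of border on each side. A compile-time restatement of that relationship:

#include "arm_compute/core/Types.h"

constexpr unsigned int filter_size = 3;
constexpr arm_compute::BorderSize border_size(filter_size / 2);
static_assert(border_size.top == 1 && border_size.left == 1,
              "a 3x3 filter implies a 1-pixel border on every side");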
diff --git a/tests/validation/CL/CannyEdge.cpp b/tests/validation/CL/CannyEdge.cpp
index f8cf9f0654..10da5f4363 100644
--- a/tests/validation/CL/CannyEdge.cpp
+++ b/tests/validation/CL/CannyEdge.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -48,58 +48,14 @@ namespace
 /* Allowed ratio of mismatches between target and reference (1.0 = 100%) */
 const float allowed_mismatch_ratio = 0.1f;
-const auto data = combine(framework::dataset::make("GradientSize", { 3, 5, 7 }),
-                          combine(framework::dataset::make("Normalization", { MagnitudeType::L1NORM, MagnitudeType::L2NORM }), datasets::BorderModes()));
+const auto data = combine(framework::dataset::make("GradientSize",
+{ 3, 5, 7 }),
+combine(framework::dataset::make("Normalization", { MagnitudeType::L1NORM, MagnitudeType::L2NORM }), datasets::BorderModes()));
 } // namespace
 TEST_SUITE(CL)
 TEST_SUITE(CannyEdge)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::Small2DShapes(), data), framework::dataset::make("Format", Format::U8)),
-               shape, gradient_size, normalization, border_mode, format)
-{
-    CannyEdgeParameters params = canny_edge_parameters();
-    // Convert normalisation type to integer
-    const auto norm_type = static_cast(normalization) + 1;
-
-    // Create tensors
-    CLTensor src = create_tensor(shape, data_type_from_format(format));
-    CLTensor dst = create_tensor(shape, data_type_from_format(format));
-    src.info()->set_format(format);
-    dst.info()->set_format(format);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create Canny edge configure function
-    CLCannyEdge canny_edge;
-    canny_edge.configure(&src, &dst, params.upper_thresh, params.lower_thresh, gradient_size, norm_type, border_mode, params.constant_border_value);
-
-    // Validate valid region
-    validate(src.info()->valid_region(), shape_to_valid_region(shape, (BorderMode::UNDEFINED == border_mode)));
-
-    //TODO(COMPMID-568): dst region validation fails when Shape=7x7 and GradientSize=7 and BorderMode=UNDEFINED (integer underflow)
-    if(!(shape == TensorShape{ 7u, 7u } && gradient_size == 7 && border_mode == BorderMode::UNDEFINED))
-    {
-        validate(dst.info()->valid_region(), shape_to_valid_region(shape, (BorderMode::UNDEFINED == border_mode), BorderSize(gradient_size / 2 + 1)));
-    }
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 1);
-    calculator.set_border_mode(border_mode);
-    calculator.set_border_size(1);
-    const PaddingSize dst_padding = calculator.required_padding();
-
-    calculator.set_border_size(gradient_size / 2);
-    calculator.set_access_offset(-gradient_size / 2);
-    calculator.set_accessed_elements(16);
-    calculator.set_processed_elements(8);
-    const PaddingSize src_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), src_padding);
-    validate(dst.info()->padding(), dst_padding);
-}
-
 template
 using CLCannyEdgeFixture = CannyEdgeValidationFixture;
diff --git a/tests/validation/CL/Cast.cpp b/tests/validation/CL/Cast.cpp
index a283aec226..2ca8b58040 100644
--- a/tests/validation/CL/Cast.cpp
+++ b/tests/validation/CL/Cast.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -18,7 +18,7 @@
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONCLCTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
 #include "arm_compute/core/Types.h"
@@ -142,22 +142,6 @@ using CLCastToF32Fixture = CastValidationFixture
-        CLTensor src = create_tensor(shape, idt, 1);                                      \
-        CLTensor dst = create_tensor(shape, odt, 1);                                      \
-                                                                                          \
-        CLCast cast;                                                                      \
-        cast.configure(&src, &dst, policy);                                               \
-                                                                                          \
-        const ValidRegion valid_region = shape_to_valid_region(shape);                    \
-        validate(dst.info()->valid_region(), valid_region);                               \
-                                                                                          \
-        const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();  \
-        validate(src.info()->padding(), padding);                                         \
-        validate(dst.info()->padding(), padding);                                         \
-    }                                                                                     \
 FIXTURE_DATA_TEST_CASE(RunSmall, type, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), dataset), \
                        datasets::ConvertPolicies()))                                      \
 {                                                                                         \
diff --git a/tests/validation/CL/ChannelCombine.cpp b/tests/validation/CL/ChannelCombine.cpp
index 6187e72960..7ef8414d7e 100644
--- a/tests/validation/CL/ChannelCombine.cpp
+++ b/tests/validation/CL/ChannelCombine.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
diff --git a/tests/validation/CL/ChannelExtract.cpp b/tests/validation/CL/ChannelExtract.cpp
index 7657d5a7ea..7a0dcf3e7d 100644
--- a/tests/validation/CL/ChannelExtract.cpp
+++ b/tests/validation/CL/ChannelExtract.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
diff --git a/tests/validation/CL/ChannelShuffle.cpp b/tests/validation/CL/ChannelShuffle.cpp
index 8c06e6b6d4..2061570a11 100644
--- a/tests/validation/CL/ChannelShuffle.cpp
+++ b/tests/validation/CL/ChannelShuffle.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -70,22 +70,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallRandomChannelShuffleLayerDataset(), framework::dataset::make("DataType", { DataType::S8, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32 })),
-               shape, num_groups, data_type)
-{
-    // Create tensors
-    CLTensor ref_src = create_tensor(shape, data_type);
-    CLTensor dst = create_tensor(shape, data_type);
-
-    // Create and Configure function
-    CLChannelShuffleLayer channel_shuffle_func;
-    channel_shuffle_func.configure(&ref_src, &dst, num_groups);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-}
-
 template
 using CLChannelShuffleLayerFixture = ChannelShuffleLayerValidationFixture;
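The CAST_SUITE macro above loses only its padding assertions; the conversion call it wraps is unchanged and still covered by the RunSmall fixtures. A sketch of that call, with tensor setup elided (the policy governs how out-of-range values convert):

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLCast.h"

void configure_cast(arm_compute::CLTensor &src, arm_compute::CLTensor &dst)
{
    arm_compute::CLCast cast;
    cast.configure(&src, &dst, arm_compute::ConvertPolicy::SATURATE);
}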
  */
-#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
 #include "arm_compute/core/Types.h"
-
+#include "src/core/CL/kernels/CLCol2ImKernel.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/framework/Asserts.h"
diff --git a/tests/validation/CL/ColorConvert.cpp b/tests/validation/CL/ColorConvert.cpp
index 37957cd95e..e81875a694 100644
--- a/tests/validation/CL/ColorConvert.cpp
+++ b/tests/validation/CL/ColorConvert.cpp
@@ -71,61 +71,6 @@ const auto ColorConvert_YUYVDataset_to_NVDataset = combine(YUYVDataset,
 const auto ColorConvert_NVDataset_to_YUVDataset = combine(framework::dataset::make("FormatType", { Format::NV12, Format::NV21 }),
                                                           framework::dataset::make("FormatType", { Format::IYUV, Format::YUV444 }));
-inline void validate_configuration(const TensorShape &shape, Format src_format, Format dst_format)
-{
-    const unsigned int src_num_planes = num_planes_from_format(src_format);
-    const unsigned int dst_num_planes = num_planes_from_format(dst_format);
-
-    TensorShape input = adjust_odd_shape(shape, src_format);
-    input             = adjust_odd_shape(input, src_format);
-
-    // Create tensors
-    CLMultiImage ref_src = create_multi_image<CLMultiImage>(input, src_format);
-    CLMultiImage ref_dst = create_multi_image<CLMultiImage>(input, dst_format);
-
-    // Create and Configure function
-    CLColorConvert color_convert;
-
-    if(1U == src_num_planes)
-    {
-        const CLTensor *src_plane = ref_src.cl_plane(0);
-
-        if(1U == dst_num_planes)
-        {
-            CLTensor *dst_plane = ref_dst.cl_plane(0);
-            color_convert.configure(src_plane, dst_plane);
-        }
-        else
-        {
-            color_convert.configure(src_plane, &ref_dst);
-        }
-    }
-    else
-    {
-        if(1U == dst_num_planes)
-        {
-            CLTensor *dst_plane = ref_dst.cl_plane(0);
-            color_convert.configure(&ref_src, dst_plane);
-        }
-        else
-        {
-            color_convert.configure(&ref_src, &ref_dst);
-        }
-    }
-
-    for(unsigned int plane_idx = 0; plane_idx < src_num_planes; ++plane_idx)
-    {
-        const CLTensor *src_plane = ref_src.cl_plane(plane_idx);
-
-        ARM_COMPUTE_EXPECT(src_plane->info()->is_resizable(), framework::LogLevel::ERRORS);
-    }
-    for(unsigned int plane_idx = 0; plane_idx < dst_num_planes; ++plane_idx)
-    {
-        const CLTensor *dst_plane = ref_dst.cl_plane(plane_idx);
-
-        ARM_COMPUTE_EXPECT(dst_plane->info()->is_resizable(), framework::LogLevel::ERRORS);
-    }
-}
 } // namespace
 TEST_SUITE(CL)
@@ -134,56 +79,6 @@ TEST_SUITE(ColorConvert)
 template <typename T>
 using CLColorConvertFixture = ColorConvertValidationFixture<CLMultiImage, CLTensor, CLAccessor, CLColorConvert, T>;
-TEST_SUITE(Configuration)
-DATA_TEST_CASE(RGBA, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ColorConvert_RGBA_to_RGB),
-               shape, src_format, dst_format)
-{
-    validate_configuration(shape, src_format, dst_format);
-}
-
-DATA_TEST_CASE(RGB, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ColorConvert_RGB_to_RGBA),
-               shape, src_format, dst_format)
-{
-    validate_configuration(shape, src_format, dst_format);
-}
-
-DATA_TEST_CASE(RGBtoU8, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ColorConvert_RGB_to_U8),
-               shape, src_format, dst_format)
-{
-    validate_configuration(shape, src_format, dst_format);
-}
-
-DATA_TEST_CASE(YUV, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ColorConvert_YUYV_to_RGBDataset),
-               shape, src_format, dst_format)
-{
-    validate_configuration(shape, src_format, dst_format);
-}
-
-DATA_TEST_CASE(YUVPlanar, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ColorConvert_YUVPlanar_to_RGBDataset),
-               shape, src_format, dst_format)
-{
-    validate_configuration(shape, src_format, dst_format);
-}
-
-DATA_TEST_CASE(NV, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ColorConvert_RGBDataset_to_NVDataset),
-               shape, src_format, dst_format)
-{
-    validate_configuration(shape, src_format, dst_format);
-}
-
-DATA_TEST_CASE(YUYVtoNV, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ColorConvert_YUYVDataset_to_NVDataset),
-               shape, src_format, dst_format)
-{
-    validate_configuration(shape, src_format, dst_format);
-}
-
-DATA_TEST_CASE(NVtoYUV, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ColorConvert_NVDataset_to_YUVDataset),
-               shape, src_format, dst_format)
-{
-    validate_configuration(shape, src_format, dst_format);
-}
-TEST_SUITE_END()
-
 TEST_SUITE(RGBA)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLColorConvertFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), ColorConvert_RGBA_to_RGB))
 {
diff --git a/tests/validation/CL/Comparisons.cpp b/tests/validation/CL/Comparisons.cpp
index fb8935ba80..d015528b0e 100644
--- a/tests/validation/CL/Comparisons.cpp
+++ b/tests/validation/CL/Comparisons.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -87,31 +87,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, configure_dataset,
-               shape, data_type)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor<CLTensor>(shape, data_type);
-    CLTensor ref_src2 = create_tensor<CLTensor>(shape, data_type);
-    CLTensor dst      = create_tensor<CLTensor>(shape, DataType::U8);
-
-    // Create and Configure function
-    CLComparison compare;
-    compare.configure(&ref_src1, &ref_src2, &dst, ComparisonOperation::Equal);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    const int num_elems_processed_per_iteration = 16 / ref_src1.info()->element_size();
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 template <typename T>
 using CLComparisonFixture = ComparisonValidationFixture<CLTensor, CLAccessor, CLComparison, T>;
diff --git a/tests/validation/CL/ConvertFullyConnectedWeights.cpp b/tests/validation/CL/ConvertFullyConnectedWeights.cpp
index a5065fb217..70d7b2c767 100644
--- a/tests/validation/CL/ConvertFullyConnectedWeights.cpp
+++ b/tests/validation/CL/ConvertFullyConnectedWeights.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Convolution.cpp b/tests/validation/CL/Convolution.cpp
index 8804d34123..1608e7c66d 100644
--- a/tests/validation/CL/Convolution.cpp
+++ b/tests/validation/CL/Convolution.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,45 +44,6 @@ namespace validation
 TEST_SUITE(CL)
 TEST_SUITE(CustomConvolution)
 TEST_SUITE(Square3x3)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::U8, DataType::S16 })),
-                                                                               datasets::BorderModes()),
-                                                                       framework::dataset::make("filter_size", { 3 })),
-               shape, output_data_type, border_mode, filter_size)
-{
-    // Create tensors
-    CLTensor src = create_tensor<CLTensor>(shape, DataType::U8);
-    CLTensor dst = create_tensor<CLTensor>(shape, output_data_type);
-
-    // Create conv matrix
-    std::array<int16_t, 9> conv = { 0 };
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLConvolution3x3 convolution;
-    convolution.configure(&src, &dst, conv.data(), 0, border_mode);
-
-    // Validate valid region
-    const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), BorderSize(filter_size / 2));
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 8);
-    calculator.set_border_size(1);
-    calculator.set_border_mode(border_mode);
-
-    const PaddingSize dst_padding = calculator.required_padding();
-
-    calculator.set_accessed_elements(16);
-    calculator.set_access_offset(-1);
-
-    const PaddingSize src_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), src_padding);
-    validate(dst.info()->padding(), dst_padding);
-}
-
 template <typename T>
 using CLConvolutionFixture = ConvolutionSquareValidationFixture<CLTensor, CLAccessor, CLConvolution3x3, T>;
@@ -112,45 +73,6 @@ TEST_SUITE_END() // S16
 TEST_SUITE_END() // Square 3x3
 TEST_SUITE(Square5x5)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::U8, DataType::S16 })),
-                                                                               datasets::BorderModes()),
-                                                                       framework::dataset::make("filter_size", { 5 })),
-               shape, output_data_type, border_mode, filter_size)
-{
-    // Create tensors
-    CLTensor src = create_tensor<CLTensor>(shape, DataType::U8);
-    CLTensor dst = create_tensor<CLTensor>(shape, output_data_type);
-
-    // Create conv matrix
-    std::array<int16_t, 25> conv = { 0 };
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLConvolution5x5 convolution;
-    convolution.configure(&src, &dst, conv.data(), 0, border_mode);
-
-    // Validate valid region
-    const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), BorderSize(filter_size / 2));
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 8);
-    calculator.set_border_size(2);
-    calculator.set_border_mode(border_mode);
-
-    const PaddingSize dst_padding = calculator.required_padding();
-
-    calculator.set_accessed_elements(16);
-    calculator.set_access_offset(-2);
-
-    const PaddingSize src_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), src_padding);
-    validate(dst.info()->padding(), dst_padding);
-}
-
 template <typename T>
 using CLConvolutionFixture = ConvolutionSquareValidationFixture<CLTensor, CLAccessor, CLConvolution5x5, T>;
@@ -180,45 +102,6 @@ TEST_SUITE_END() // S16
 TEST_SUITE_END() // Square5x5
 TEST_SUITE(Square7x7)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::U8, DataType::S16 })),
-                                                                               datasets::BorderModes()),
-                                                                       framework::dataset::make("filter_size", { 7 })),
-               shape, output_data_type, border_mode, filter_size)
-{
-    // Create tensors
-    CLTensor src = create_tensor<CLTensor>(shape, DataType::U8);
-    CLTensor dst = create_tensor<CLTensor>(shape, output_data_type);
-
-    // Create conv matrix
-    std::array<int16_t, 49> conv = { 0 };
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLConvolution7x7 convolution;
-    convolution.configure(&src, &dst, conv.data(), 0, border_mode);
-
-    // Validate valid region
-    const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), BorderSize(filter_size / 2));
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 8);
-    calculator.set_border_size(3);
-    calculator.set_border_mode(border_mode);
-
-    const PaddingSize dst_padding = calculator.required_padding();
-
-    calculator.set_accessed_elements(16);
-    calculator.set_access_offset(-3);
-
-    const PaddingSize src_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), src_padding);
-    validate(dst.info()->padding(), dst_padding);
-}
-
 template <typename T>
 using CLConvolutionFixture = ConvolutionSquareValidationFixture<CLTensor, CLAccessor, CLConvolution7x7, T>;
@@ -248,44 +131,6 @@ TEST_SUITE_END() // S16
 TEST_SUITE_END() // Square7x7
 TEST_SUITE(Square9x9)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::U8, DataType::S16 })),
-                                                                               datasets::BorderModes()),
-                                                                       framework::dataset::make("filter_size", { 9 })),
-               shape, output_data_type, border_mode, filter_size)
-{
-    // Create tensors
-    CLTensor src = create_tensor<CLTensor>(shape, DataType::U8);
-    CLTensor dst = create_tensor<CLTensor>(shape, output_data_type);
-
-    // Create conv matrix
-    std::array<int16_t, 81> conv = { 0 };
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLConvolution9x9 convolution;
-    convolution.configure(&src, &dst, conv.data(), 0, border_mode);
-
-    // Validate valid region
-    const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), BorderSize(filter_size / 2));
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 8);
-    calculator.set_border_size(4);
-    calculator.set_border_mode(border_mode);
-
-    const PaddingSize dst_padding = calculator.required_padding();
-
-    calculator.set_accessed_elements(16);
-    calculator.set_access_offset(-4);
-
-    const PaddingSize src_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), src_padding);
-    validate(dst.info()->padding(), dst_padding);
-}
 template <typename T>
 using CLConvolutionFixture = ConvolutionSquareValidationFixture<CLTensor, CLAccessor, CLConvolution9x9, T>;
@@ -316,51 +161,6 @@ TEST_SUITE_END() // S16
 TEST_SUITE_END() // Square9x9
 TEST_SUITE(Rectangle)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-{ DataType::U8, DataType::S16 })),
-datasets::BorderModes()),
-framework::dataset::make("filter_width", { 3, 5, 7, 9 })),
-framework::dataset::make("filter_height", { 3, 5, 7, 9 })),
-shape, output_data_type, border_mode, filter_width, filter_height)
-{
-    // Create tensors
-    CLTensor src = create_tensor<CLTensor>(shape, DataType::U8);
-    CLTensor dst = create_tensor<CLTensor>(shape, output_data_type);
-
-    // Create conv matrix
-    std::vector<int16_t> conv(filter_width * filter_height);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLConvolutionRectangle convolution;
-    convolution.configure(&src, &dst, conv.data(), filter_width, filter_height, 1, border_mode);
-
-    // Validate valid region
-    const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), BorderSize(filter_height / 2, filter_width / 2));
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 8);
-    calculator.set_border_size(filter_width / 2);
-    calculator.set_border_mode(border_mode);
-
-    const PaddingSize dst_padding = calculator.required_padding();
-
-    calculator.set_accessed_elements(16);
-    calculator.set_access_offset(-(filter_width / 2));
-
-    const PaddingSize width_padding = calculator.required_padding();
-
-    calculator.set_border_size(filter_height / 2);
-    calculator.set_access_offset(-(filter_height / 2));
-    const PaddingSize height_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), width_padding, height_padding);
-    validate(dst.info()->padding(), dst_padding);
-}
-
 template <typename T>
 using CLConvolutionFixture = ConvolutionRectangleValidationFixture<CLTensor, CLAccessor, CLConvolutionRectangle, T>;
diff --git a/tests/validation/CL/ConvolutionLayer.cpp b/tests/validation/CL/ConvolutionLayer.cpp
index 8c40b7e366..b66cfd97e7 100644
--- a/tests/validation/CL/ConvolutionLayer.cpp
+++ b/tests/validation/CL/ConvolutionLayer.cpp
@@ -45,6 +45,16 @@ namespace validation
 {
 namespace
 {
+class SmallConvolutionLayerDatasetCases final : public datasets::ConvolutionLayerDataset
+{
+public:
+    SmallConvolutionLayerDatasetCases()
+    {
+        // 1D Kernel
+        add_config(TensorShape(1U, 130U, 2000U), TensorShape(1U, 1U, 2000U, 2000U), TensorShape(2000U), TensorShape(1U, 130U, 2000U), PadStrideInfo(1, 1, 0, 0));
+    }
+};
+
 RelativeTolerance<float> tolerance_f32(0.1f);                              /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
 RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.2)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
 constexpr AbsoluteTolerance<float> tolerance_qasymm8(1);                   /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
@@ -234,6 +244,18 @@ const auto QuantizationData = framework::dataset::make("QuantizationInfo",
 });
 TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmallCases, CLGEMMConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(SmallConvolutionLayerDatasetCases(),
+                                                               framework::dataset::make("ReshapeWeights", { true })),
+                                                       framework::dataset::make("DataType", DataType::QASYMM8)),
+                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                       QuantizationData),
+                               QuantizedActivationFunctionsSmallDataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
                        combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
                                                                framework::dataset::make("ReshapeWeights", { true })),
diff --git a/tests/validation/CL/Copy.cpp b/tests/validation/CL/Copy.cpp
index 07af24352e..0b2a15146b 100644
--- a/tests/validation/CL/Copy.cpp
+++ b/tests/validation/CL/Copy.cpp
@@ -48,15 +48,13 @@ TEST_SUITE(Copy)
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
     framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid data type combination
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Mismatching shapes
-                                            TensorInfo(TensorShape(14U, 13U, 2U), 1, DataType::U8), // Window shrink
-                                            TensorInfo(TensorShape(32U, 32U, 2U), 1, DataType::U8),
+                                            TensorInfo(TensorShape(14U, 13U, 2U), 1, DataType::U8),
     }),
     framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                             TensorInfo(TensorShape(32U, 11U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(14U, 13U, 2U), 1, DataType::U8),
-                                            TensorInfo(TensorShape(32U, 32U, 2U), 1, DataType::U8),
     })),
-    framework::dataset::make("Expected", { false, false, false, true })),
+    framework::dataset::make("Expected", { false, false, true })),
     input_info, output_info, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLCopy::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false))) == expected, framework::LogLevel::ERRORS);
diff --git a/tests/validation/CL/CropResize.cpp b/tests/validation/CL/CropResize.cpp
index 636db1728f..f1fae3d5cc 100644
--- a/tests/validation/CL/CropResize.cpp
+++ b/tests/validation/CL/CropResize.cpp
@@ -25,7 +25,6 @@
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
 #include "arm_compute/runtime/CL/functions/CLCropResize.h"
-
 #include "tests/CL/CLAccessor.h"
 #include "tests/datasets/CropResizeDataset.h"
 #include "tests/framework/Asserts.h"
diff --git a/tests/validation/CL/DeconvolutionLayer.cpp b/tests/validation/CL/DeconvolutionLayer.cpp
index c677f5ae96..c284cdcee3 100644
--- a/tests/validation/CL/DeconvolutionLayer.cpp
+++ b/tests/validation/CL/DeconvolutionLayer.cpp
@@ -21,7 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
diff --git a/tests/validation/CL/DepthConcatenateLayer.cpp b/tests/validation/CL/DepthConcatenateLayer.cpp
index 4f5bd118bf..621de279d3 100644
--- a/tests/validation/CL/DepthConcatenateLayer.cpp
+++ b/tests/validation/CL/DepthConcatenateLayer.cpp
@@ -80,27 +80,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 // clang-format on
 // *INDENT-ON*
-TEST_CASE(Configuration, framework::DatasetMode::ALL)
-{
-    // Create tensors
-    CLTensor src1 = create_tensor<CLTensor>(TensorShape(128U, 32U, 32U), DataType::F32, 1);
-    CLTensor src2 = create_tensor<CLTensor>(TensorShape(128U, 32U, 32U), DataType::F32, 1);
-    CLTensor src3 = create_tensor<CLTensor>(TensorShape(128U, 32U, 32U), DataType::F32, 1);
-    CLTensor dst;
-
-    ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(src3.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLConcatenateLayer concat_layer;
-    std::vector<const ICLTensor *> inputs;
-    inputs.emplace_back(&src1);
-    inputs.emplace_back(&src2);
-    inputs.emplace_back(&src3);
-    concat_layer.configure(inputs, &dst, 2);
-}
 template <typename T>
 using CLDepthConcatenateLayerFixture = ConcatenateLayerValidationFixture<CLTensor, ICLTensor, CLAccessor, CLConcatenateLayer, T>;
diff --git a/tests/validation/CL/DepthConvertLayer.cpp b/tests/validation/CL/DepthConvertLayer.cpp
index c6595e4a61..a823b278fc 100644
--- a/tests/validation/CL/DepthConvertLayer.cpp
+++ b/tests/validation/CL/DepthConvertLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -109,28 +109,6 @@ template <typename T>
 using CLDepthConvertLayerToF32Fixture = DepthConvertLayerValidationFixture<CLTensor, CLAccessor, CLDepthConvertLayer, T, float>;
 TEST_SUITE(U8_to_U16)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                   DepthConvertLayerShiftDatasetNightly),
-               shape, policy, shift)
-{
-    // Create tensors
-    CLTensor src = create_tensor<CLTensor>(shape, DataType::U8, 1);
-    CLTensor dst = create_tensor<CLTensor>(shape, DataType::U16, 1);
-
-    // Create and Configure function
-    CLDepthConvertLayer depth_convert;
-    depth_convert.configure(&src, &dst, policy, shift);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(src.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthConvertLayerToU16Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU8toU16Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                       DepthConvertLayerShiftDatasetPrecommit))
@@ -149,28 +127,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthConvertLayerToU16Fixture<uint8_t>, frame
 TEST_SUITE_END() // U8_to_U16
 TEST_SUITE(U8_to_S16)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                   DepthConvertLayerShiftDatasetNightly),
-               shape, policy, shift)
-{
-    // Create tensors
-    CLTensor src = create_tensor<CLTensor>(shape, DataType::U8, 1);
-    CLTensor dst = create_tensor<CLTensor>(shape, DataType::S16, 1);
-
-    // Create and Configure function
-    CLDepthConvertLayer depth_convert;
-    depth_convert.configure(&src, &dst, policy, shift);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(src.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthConvertLayerToS16Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU8toS16Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                       DepthConvertLayerShiftDatasetPrecommit))
@@ -188,28 +144,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthConvertLayerToS16Fixture<uint8_t>, frame
 }
 TEST_SUITE_END() // U8_to_S16
 TEST_SUITE(U8_to_S32)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                   DepthConvertLayerShiftDatasetNightly),
-               shape, policy, shift)
-{
-    // Create tensors
-    CLTensor src = create_tensor<CLTensor>(shape, DataType::U8, 1);
-    CLTensor dst = create_tensor<CLTensor>(shape, DataType::S32, 1);
-
-    // Create and Configure function
-    CLDepthConvertLayer depth_convert;
-    depth_convert.configure(&src, &dst, policy, shift);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(src.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthConvertLayerToS32Fixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU8toS32Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                       DepthConvertLayerShiftDatasetPrecommit))
@@ -228,28 +162,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthConvertLayerToS32Fixture<uint8_t>, frame
 TEST_SUITE_END() // U8_to_S32
 TEST_SUITE(U16_to_U8)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                   DepthConvertLayerShiftDatasetNightly),
-               shape, policy, shift)
-{
-    // Create tensors
-    CLTensor src = create_tensor<CLTensor>(shape, DataType::U16, 1);
-    CLTensor dst = create_tensor<CLTensor>(shape, DataType::U8, 1);
-
-    // Create and Configure function
-    CLDepthConvertLayer depth_convert;
-    depth_convert.configure(&src, &dst, policy, shift);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(src.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthConvertLayerToU8Fixture<uint16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU16toU8Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                       DepthConvertLayerShiftDatasetPrecommit))
@@ -267,28 +179,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthConvertLayerToU8Fixture<uint16_t>, frame
 TEST_SUITE_END() // U16_to_U8
 TEST_SUITE(U16_to_U32)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                   DepthConvertLayerShiftDatasetNightly),
-               shape, policy, shift)
-{
-    // Create tensors
-    CLTensor src = create_tensor<CLTensor>(shape, DataType::U16, 1);
-    CLTensor dst = create_tensor<CLTensor>(shape, DataType::U32, 1);
-
-    // Create and Configure function
-    CLDepthConvertLayer depth_convert;
-    depth_convert.configure(&src, &dst, policy, shift);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(src.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthConvertLayerToU32Fixture<uint16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerU16toU32Dataset),
                                                                                                                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                        DepthConvertLayerShiftDatasetPrecommit))
@@ -306,28 +196,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthConvertLayerToU32Fixture<uint16_t>, fram
 TEST_SUITE_END() // U16_to_U32
 TEST_SUITE(S16_to_U8)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                   DepthConvertLayerShiftDatasetNightly),
-               shape, policy, shift)
-{
-    // Create tensors
-    CLTensor src = create_tensor<CLTensor>(shape, DataType::S16, 1);
-    CLTensor dst = create_tensor<CLTensor>(shape, DataType::U8, 1);
-
-    // Create and Configure function
-    CLDepthConvertLayer depth_convert;
-    depth_convert.configure(&src, &dst, policy, shift);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(src.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthConvertLayerToU8Fixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerS16toU8Dataset),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                      DepthConvertLayerShiftDatasetPrecommit))
@@ -345,28 +213,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthConvertLayerToU8Fixture<int16_t>, framew
 TEST_SUITE_END() // S16_to_U8
 TEST_SUITE(S16_to_S32)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                   DepthConvertLayerShiftDatasetNightly),
-               shape, policy, shift)
-{
-    // Create tensors
-    CLTensor src = create_tensor<CLTensor>(shape, DataType::S16, 1);
-    CLTensor dst = create_tensor<CLTensor>(shape, DataType::S32, 1);
-
-    // Create and Configure function
-    CLDepthConvertLayer depth_convert;
-    depth_convert.configure(&src, &dst, policy, shift);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(src.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthConvertLayerToS32Fixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), DepthConvertLayerS16toS32Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                       DepthConvertLayerShiftDatasetPrecommit))
diff --git a/tests/validation/CL/DepthToSpaceLayer.cpp b/tests/validation/CL/DepthToSpaceLayer.cpp
index fd570ad753..7cee4b7129 100644
--- a/tests/validation/CL/DepthToSpaceLayer.cpp
+++ b/tests/validation/CL/DepthToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/DepthwiseConvolutionLayer.cpp b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
index c779092eec..351819ae55 100644
--- a/tests/validation/CL/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
@@ -55,7 +55,8 @@ const auto large_depth_multipliers = framework::dataset::make("DepthMultiplier",
 const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
 {
     ActivationLayerInfo(),
-    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f, 0.f)
 });
 } // namespace
diff --git a/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp b/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp
index 058d9b3ecc..b1cd379574 100644
--- a/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp
+++ b/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp
@@ -21,12 +21,12 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/PaddingCalculator.h"
diff --git a/tests/validation/CL/DequantizationLayer.cpp b/tests/validation/CL/DequantizationLayer.cpp
index fa283c9b5b..cff9659ce4 100644
--- a/tests/validation/CL/DequantizationLayer.cpp
+++ b/tests/validation/CL/DequantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -94,32 +94,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
 // clang-format on
 // *INDENT-ON*
-DATA_TEST_CASE(Configuration,
-               framework::DatasetMode::ALL,
-               combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::F16, DataType::F32 })),
-               shape, data_type)
-{
-    // Create tensors
-    CLTensor src = create_tensor<CLTensor>(shape, DataType::QASYMM8, 1, QuantizationInfo(0.5f, -10));
-    CLTensor dst = create_tensor<CLTensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLDequantizationLayer dequant_layer;
-    dequant_layer.configure(&src, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src.info()->valid_region(), valid_region);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    validate(src.info()->padding(), PaddingSize());
-    validate(dst.info()->padding(), PaddingSize());
-}
-
 template <typename T>
 using CLDequantizationLayerFixture = DequantizationValidationFixture<CLTensor, CLAccessor, CLDequantizationLayer, T>;
diff --git a/tests/validation/CL/Derivative.cpp b/tests/validation/CL/Derivative.cpp
index 284da206c7..8f351e14e3 100644
--- a/tests/validation/CL/Derivative.cpp
+++ b/tests/validation/CL/Derivative.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,42 +44,6 @@ TEST_SUITE(Derivative)
 using CLDerivativeFixture = DerivativeValidationFixture<CLTensor, CLAccessor, CLDerivative, uint8_t, int16_t>;
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
-                                                                   Format::U8)),
-               shape, border_mode, format)
-{
-    // Generate a random constant value
-    std::mt19937 gen(library->seed());
-    std::uniform_int_distribution<uint8_t> int_dist(0, 255);
-    const uint8_t constant_border_value = int_dist(gen);
-
-    // Create tensors
-    CLTensor src   = create_tensor<CLTensor>(shape, data_type_from_format(format));
-    CLTensor dst_x = create_tensor<CLTensor>(shape, DataType::S16);
-    CLTensor dst_y = create_tensor<CLTensor>(shape, DataType::S16);
-
-    src.info()->set_format(format);
-    dst_x.info()->set_format(Format::S16);
-    dst_y.info()->set_format(Format::S16);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst_x.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst_y.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create Derivative configure function
-    CLDerivative derivative;
-    derivative.configure(&src, &dst_x, &dst_y, border_mode, constant_border_value);
-
-    // Validate valid region
-    constexpr BorderSize border_size{ 1 };
-    const ValidRegion dst_valid_region = shape_to_valid_region(shape, border_mode == BorderMode::UNDEFINED, border_size);
-
-    validate(dst_x.info()->valid_region(), dst_valid_region);
-    validate(dst_y.info()->valid_region(), dst_valid_region);
-
-    // TODO(COMPMID-415) Validate padding after fixing x-access input bug in CL kernel
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLDerivativeFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format",
                                                                                                  Format::U8)),
                                                                                                  datasets::GradientDimensions()))
diff --git a/tests/validation/CL/Dilate.cpp b/tests/validation/CL/Dilate.cpp
index e6605bbd5a..8bd4bba297 100644
--- a/tests/validation/CL/Dilate.cpp
+++ b/tests/validation/CL/Dilate.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,41 +50,6 @@ constexpr BorderSize border_size(filter_size / 2); /* Border size of the kerne
 TEST_SUITE(CL)
 TEST_SUITE(Dilate)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)),
-                                                                   datasets::BorderModes()),
-               shape, data_type, border_mode)
-{
-    // Create tensors
-    CLTensor src = create_tensor<CLTensor>(shape, data_type);
-    CLTensor dst = create_tensor<CLTensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLDilate dilate;
-    dilate.configure(&src, &dst, border_mode);
-
-    // Validate valid region
-    const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), border_size);
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 8);
-    calculator.set_border_size(1);
-    calculator.set_border_mode(border_mode);
-
-    const PaddingSize dst_padding = calculator.required_padding();
-
-    calculator.set_accessed_elements(16);
-    calculator.set_access_offset(-1);
-
-    const PaddingSize src_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), src_padding);
-    validate(dst.info()->padding(), dst_padding);
-}
-
 template <typename T>
 using CLDilateFixture = DilateValidationFixture<CLTensor, CLAccessor, CLDilate, T>;
diff --git a/tests/validation/CL/DilatedConvolutionLayer.cpp b/tests/validation/CL/DilatedConvolutionLayer.cpp
index 20ba113cd8..9a9df2c7e4 100644
--- a/tests/validation/CL/DilatedConvolutionLayer.cpp
+++ b/tests/validation/CL/DilatedConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -113,49 +113,6 @@ TEST_SUITE_END()
 TEST_SUITE(GEMMDilatedConvolutionLayer)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallDilatedConvolutionLayerDataset(),
-                                                                   CNNDataTypes),
-               input_shape, weights_shape, bias_shape, output_shape, info, dilation, data_type)
-{
-    auto bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
-
-    // Create tensors
-    CLTensor src     = create_tensor<CLTensor>(input_shape, data_type, 1, QuantizationInfo(2.f / 255.f, 127));
-    CLTensor weights = create_tensor<CLTensor>(weights_shape, data_type, 1, QuantizationInfo(2.f / 255.f, 127));
-    CLTensor bias    = create_tensor<CLTensor>(bias_shape, bias_data_type, 1, QuantizationInfo(2.f / 255.f, 127));
-    CLTensor dst     = create_tensor<CLTensor>(output_shape, data_type, 1, QuantizationInfo(2.f / 255.f, 127));
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    const QuantizationInfo src_quantization_info     = src.info()->quantization_info();
-    const QuantizationInfo weights_quantization_info = weights.info()->quantization_info();
-
-    // Create and configure function
-    CLGEMMConvolutionLayer conv;
-    conv.configure(&src, &weights, &bias, &dst, info, WeightsInfo(), dilation);
-
-    // Validate valid region
-    const ValidRegion src_valid_region     = shape_to_valid_region(input_shape);
-    const ValidRegion weights_valid_region = shape_to_valid_region(weights_shape);
-    const ValidRegion bias_valid_region    = shape_to_valid_region(bias_shape);
-    const ValidRegion dst_valid_region     = shape_to_valid_region(output_shape);
-
-    validate(src.info()->valid_region(), src_valid_region);
-    validate(weights.info()->valid_region(), weights_valid_region);
-    validate(bias.info()->valid_region(), bias_valid_region);
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate QuantizationInfo
-    ARM_COMPUTE_EXPECT(src.info()->quantization_info() == src_quantization_info, framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(weights.info()->quantization_info() == weights_quantization_info, framework::LogLevel::ERRORS);
-
-    // Validate padding
-    //TODO(COMPMID-415) Need to validate padding?
-}
-
 template <typename T>
 using CLGEMMDilatedConvolutionLayerFixture = ConvolutionValidationFixture<CLTensor, CLAccessor, CLGEMMConvolutionLayer, T>;
diff --git a/tests/validation/CL/DirectConvolutionLayer.cpp b/tests/validation/CL/DirectConvolutionLayer.cpp
index 090bd22ed9..ae2f22dd1e 100644
--- a/tests/validation/CL/DirectConvolutionLayer.cpp
+++ b/tests/validation/CL/DirectConvolutionLayer.cpp
@@ -277,7 +277,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
                        combine(combine(combine(combine(data_precommit_9x9,
                                                        framework::dataset::make("DataType", DataType::QASYMM8)),
-                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, 10) })),
+                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(3.f / 255, 10), QuantizationInfo(1.1f, 10) })),
                                        QuantizedActivationFunctionsDataset),
                                framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
@@ -296,7 +296,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLDirectConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
                        combine(combine(combine(combine(data_nightly_9x9,
                                                        framework::dataset::make("DataType", DataType::QASYMM8)),
-                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, 10) })),
+                                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(3.f / 255, 10), QuantizationInfo(1.1f, 10) })),
                                        QuantizedActivationFunctionsDataset),
                                framework::dataset::make("DataLayout", { DataLayout::NCHW })))
 {
diff --git a/tests/validation/CL/ElementwiseMax.cpp b/tests/validation/CL/ElementwiseMax.cpp
index bdc47ee10b..b9444b2795 100644
--- a/tests/validation/CL/ElementwiseMax.cpp
+++ b/tests/validation/CL/ElementwiseMax.cpp
@@ -46,7 +46,6 @@ namespace
 RelativeTolerance<float> tolerance_fp32(0.000001f);
 RelativeTolerance<half> tolerance_fp16(0.001f);
-constexpr unsigned int num_elems_processed_per_iteration = 16;
 /** Input data sets **/
 const auto ElementwiseMaxU8Dataset = combine(combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U8)), framework::dataset::make("DataType", DataType::U8));
@@ -82,23 +81,20 @@ TEST_SUITE(ElementwiseMax)
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
     framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                              TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                             TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8), // Window shrink
                                              TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid data type combination
                                              TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching shapes
     }),
     framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                            TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                             TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
     })),
    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                            TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                           TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8),
                                            TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                            TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
     })),
-    framework::dataset::make("Expected", { true, true, false, false, false})),
+    framework::dataset::make("Expected", { true, true, false, false})),
     input1_info, input2_info, output_info, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLElementwiseMax::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false))) == expected, framework::LogLevel::ERRORS);
@@ -111,29 +107,6 @@ using CLElementwiseMaxFixture = ElementwiseMaxValidationFixture<CLTensor, CLAccessor, CLElementwiseMax, T>
 TEST_SUITE(Integer)
 TEST_SUITE(U8)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(),
-               shape)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor<CLTensor>(shape, DataType::U8);
-    CLTensor ref_src2 = create_tensor<CLTensor>(shape, DataType::U8);
-    CLTensor dst      = create_tensor<CLTensor>(shape, DataType::U8);
-
-    // Create and Configure function
-    CLElementwiseMax max;
-    max.configure(&ref_src1, &ref_src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMaxFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), ElementwiseMaxU8Dataset))
 {
     // Validate output
@@ -142,29 +115,6 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMaxFixture, framework::Da
 TEST_SUITE_END()
 TEST_SUITE(S16)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::U8, DataType::S16 })),
-               shape, data_type)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor<CLTensor>(shape, data_type);
-    CLTensor ref_src2 = create_tensor<CLTensor>(shape, DataType::S16);
-    CLTensor dst      = create_tensor<CLTensor>(shape, DataType::S16);
-
-    // Create and Configure function
-    CLElementwiseMax max;
-    max.configure(&ref_src1, &ref_src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMaxFixture<int16_t>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseMaxS16Dataset))
 {
     // Validate output
@@ -178,29 +128,6 @@ using CLElementwiseMaxQuantizedFixture = ElementwiseMaxValidationQuantizedFixtur
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(),
-               shape)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor<CLTensor>(shape, DataType::QASYMM8);
-    CLTensor ref_src2 = create_tensor<CLTensor>(shape, DataType::QASYMM8);
-    CLTensor dst      = create_tensor<CLTensor>(shape, DataType::QASYMM8);
-
-    // Create and Configure function
-    CLElementwiseMax max;
-    max.configure(&ref_src1, &ref_src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMaxQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(), ElementwiseMaxQASYMM8Dataset),
                                                                                                                        framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
@@ -212,29 +139,6 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMaxQuantizedFixture, fram
 }
 TEST_SUITE_END()
 TEST_SUITE(QASYMM8_SIGNED)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(),
-               shape)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor<CLTensor>(shape, DataType::QASYMM8_SIGNED);
-    CLTensor ref_src2 = create_tensor<CLTensor>(shape, DataType::QASYMM8_SIGNED);
-    CLTensor dst      = create_tensor<CLTensor>(shape, DataType::QASYMM8_SIGNED);
-
-    // Create and Configure function
-    CLElementwiseMax max;
-    max.configure(&ref_src1, &ref_src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMaxQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(), ElementwiseMaxQASYMM8SignedDataset),
                                                                                                                       framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
@@ -246,29 +150,6 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMaxQuantizedFixture, frame
 }
 TEST_SUITE_END()
 TEST_SUITE(QSYMM16)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(),
-               shape)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor<CLTensor>(shape, DataType::QSYMM16);
-    CLTensor ref_src2 = create_tensor<CLTensor>(shape, DataType::QSYMM16);
-    CLTensor dst      = create_tensor<CLTensor>(shape, DataType::QSYMM16);
-
-    // Create and Configure function
-    CLElementwiseMax max;
-    max.configure(&ref_src1, &ref_src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMaxQuantizedFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(), ElementwiseMaxQSYMM16Dataset),
                                                                                                                        framework::dataset::make("Src0QInfo", { QuantizationInfo(1.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
@@ -300,29 +181,6 @@ FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwiseMaxFloatFixture, fr
 TEST_SUITE_END()
 TEST_SUITE(FP32)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(),
-               shape)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor<CLTensor>(shape, DataType::F32);
-    CLTensor ref_src2 = create_tensor<CLTensor>(shape, DataType::F32);
-    CLTensor dst      = create_tensor<CLTensor>(shape, DataType::F32);
-
-    // Create and Configure function
-    CLElementwiseMax max;
-    max.configure(&ref_src1, &ref_src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMaxFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMaxFP32Dataset),
                                                                                                            EmptyActivationFunctionsDataset))
 {
diff --git a/tests/validation/CL/ElementwiseMin.cpp b/tests/validation/CL/ElementwiseMin.cpp
index a7caac3841..8f53b241ab 100644
--- a/tests/validation/CL/ElementwiseMin.cpp
+++ b/tests/validation/CL/ElementwiseMin.cpp
@@ -46,7 +46,6 @@ namespace
 RelativeTolerance<float> tolerance_fp32(0.000001f);
 RelativeTolerance<half> tolerance_fp16(0.001f);
-constexpr unsigned int num_elems_processed_per_iteration = 16;
 /** Input data sets **/
 const auto ElementwiseMinU8Dataset = combine(combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U8)), framework::dataset::make("DataType", DataType::U8));
@@ -82,23 +81,20 @@ TEST_SUITE(ElementwiseMin)
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
     framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                              TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                             TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8), // Window shrink
                                              TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid data type combination
                                              TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching shapes
     }),
     framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                            TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                             TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
     })),
    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                            TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                           TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8),
                                            TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                            TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
     })),
-    framework::dataset::make("Expected", { true, true, false, false, false})),
+    framework::dataset::make("Expected", { true, true, false, false})),
     input1_info, input2_info, output_info, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLElementwiseMin::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false))) == expected, framework::LogLevel::ERRORS);
@@ -111,29 +107,6 @@ using CLElementwiseMinFixture = ElementwiseMinValidationFixture<CLTensor, CLAccessor, CLElementwiseMin, T>
 TEST_SUITE(Integer)
 TEST_SUITE(U8)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(),
-               shape)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor<CLTensor>(shape, DataType::U8);
-    CLTensor ref_src2 = create_tensor<CLTensor>(shape, DataType::U8);
-    CLTensor dst      = create_tensor<CLTensor>(shape, DataType::U8);
-
-    // Create and Configure function
-    CLElementwiseMin min;
-    min.configure(&ref_src1, &ref_src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMinFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), ElementwiseMinU8Dataset))
 {
     // Validate output
@@ -142,29 +115,6 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMinFixture, framework::Da
 TEST_SUITE_END()
 TEST_SUITE(S16)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::U8, DataType::S16 })), - shape, data_type) -{ - // Create tensors - CLTensor ref_src1 = create_tensor(shape, data_type); - CLTensor ref_src2 = create_tensor(shape, DataType::S16); - CLTensor dst = create_tensor(shape, DataType::S16); - - // Create and Configure function - CLElementwiseMin min; - min.configure(&ref_src1, &ref_src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding(); - validate(ref_src1.info()->padding(), padding); - validate(ref_src2.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMinFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseMinS16Dataset)) { // Validate output @@ -178,29 +128,6 @@ using CLElementwiseMinQuantizedFixture = ElementwiseMinValidationQuantizedFixtur TEST_SUITE(Quantized) TEST_SUITE(QASYMM8) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(), - shape) -{ - // Create tensors - CLTensor ref_src1 = create_tensor(shape, DataType::QASYMM8); - CLTensor ref_src2 = create_tensor(shape, DataType::QASYMM8); - CLTensor dst = create_tensor(shape, DataType::QASYMM8); - - // Create and Configure function - CLElementwiseMin min; - min.configure(&ref_src1, &ref_src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding(); - validate(ref_src1.info()->padding(), padding); - validate(ref_src2.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMinQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(), ElementwiseMinQASYMM8Dataset), framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })), @@ -212,29 +139,6 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMinQuantizedFixture, fram } TEST_SUITE_END() TEST_SUITE(QASYMM8_SIGNED) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(), - shape) -{ - // Create tensors - CLTensor ref_src1 = create_tensor(shape, DataType::QASYMM8_SIGNED); - CLTensor ref_src2 = create_tensor(shape, DataType::QASYMM8_SIGNED); - CLTensor dst = create_tensor(shape, DataType::QASYMM8_SIGNED); - - // Create and Configure function - CLElementwiseMin min; - min.configure(&ref_src1, &ref_src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding(); - validate(ref_src1.info()->padding(), padding); - validate(ref_src2.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMinQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(), ElementwiseMinQASYMM8SignedDataset), 
framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })), @@ -246,29 +150,6 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMinQuantizedFixture, frame } TEST_SUITE_END() TEST_SUITE(QSYMM16) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(), - shape) -{ - // Create tensors - CLTensor ref_src1 = create_tensor(shape, DataType::QSYMM16); - CLTensor ref_src2 = create_tensor(shape, DataType::QSYMM16); - CLTensor dst = create_tensor(shape, DataType::QSYMM16); - - // Create and Configure function - CLElementwiseMin min; - min.configure(&ref_src1, &ref_src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding(); - validate(ref_src1.info()->padding(), padding); - validate(ref_src2.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMinQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(), ElementwiseMinQSYMM16Dataset), framework::dataset::make("SrcQInfo0", { QuantizationInfo(1.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })), @@ -300,29 +181,6 @@ FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwiseMinFloatFixture, fr TEST_SUITE_END() TEST_SUITE(FP32) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(), - shape) -{ - // Create tensors - CLTensor ref_src1 = create_tensor(shape, DataType::F32); - CLTensor ref_src2 = create_tensor(shape, DataType::F32); - CLTensor dst = create_tensor(shape, DataType::F32); - - // Create and Configure function - CLElementwiseMin min; - min.configure(&ref_src1, &ref_src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding(); - validate(ref_src1.info()->padding(), padding); - validate(ref_src2.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseMinFloatFixture, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseMinFP32Dataset), EmptyActivationFunctionsDataset)) { diff --git a/tests/validation/CL/ElementwisePower.cpp b/tests/validation/CL/ElementwisePower.cpp index 2cafdbb84e..a2d3ba6c09 100644 --- a/tests/validation/CL/ElementwisePower.cpp +++ b/tests/validation/CL/ElementwisePower.cpp @@ -67,23 +67,20 @@ TEST_SUITE(ElementwisePower) DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16), - TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Window shrink TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid data type combination TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching shapes }), framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16), - TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16), 
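The ElementwiseMin hunks above all follow one pattern: the deleted Configuration cases existed only to check the padding the old CL elementwise kernels requested through PaddingCalculator (16 elements per iteration), and they become meaningless once the kernels run any window without implicit padding. A minimal standalone sketch of the property the reworked kernels guarantee follows; it is an illustration, not part of this patch, and uses only the public CLTensor/CLElementwiseMin API.

#include <cassert>
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"

// After this patch, configuring an elementwise function must leave every
// tensor with empty padding, whatever the shape's x-extent is.
void expect_no_padding(const arm_compute::TensorShape &shape)
{
    using namespace arm_compute;
    CLTensor src1, src2, dst;
    src1.allocator()->init(TensorInfo(shape, 1, DataType::U8));
    src2.allocator()->init(TensorInfo(shape, 1, DataType::U8));
    dst.allocator()->init(TensorInfo(shape, 1, DataType::U8));

    CLElementwiseMin min;
    min.configure(&src1, &src2, &dst);

    assert(src1.info()->padding().empty());
    assert(src2.info()->padding().empty());
    assert(dst.info()->padding().empty());
}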
diff --git a/tests/validation/CL/ElementwisePower.cpp b/tests/validation/CL/ElementwisePower.cpp
index 2cafdbb84e..a2d3ba6c09 100644
--- a/tests/validation/CL/ElementwisePower.cpp
+++ b/tests/validation/CL/ElementwisePower.cpp
@@ -67,23 +67,20 @@ TEST_SUITE(ElementwisePower)
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
         framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                  TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
-                                                 TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Window shrink
                                                  TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid data type combination
                                                  TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching shapes
                                                }),
         framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
-                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                                 TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                               })),
         framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
-                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                 TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                               })),
-        framework::dataset::make("Expected", { true, true, false, false, false})),
+        framework::dataset::make("Expected", { true, true, false, false})),
         input1_info, input2_info, output_info, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLElementwisePower::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false))) == expected, framework::LogLevel::ERRORS);
diff --git a/tests/validation/CL/ElementwiseSquaredDiff.cpp b/tests/validation/CL/ElementwiseSquaredDiff.cpp
index 58eca3fe0b..0a4ab6627b 100644
--- a/tests/validation/CL/ElementwiseSquaredDiff.cpp
+++ b/tests/validation/CL/ElementwiseSquaredDiff.cpp
@@ -47,7 +47,6 @@
 RelativeTolerance<float> tolerance_fp32(0.000001f);
 RelativeTolerance<float> tolerance_fp16(0.001f);
 AbsoluteTolerance<int16_t> tolerance_qsymm16(1);
-constexpr unsigned int num_elems_processed_per_iteration = 16;
 
 /** Input data sets **/
 const auto ElementwiseSquaredDiffU8Dataset = combine(combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U8)), framework::dataset::make("DataType", DataType::U8));
@@ -81,23 +80,20 @@ TEST_SUITE(ElementwiseSquaredDiff)
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
         framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                  TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                 TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8), // Window shrink
                                                  TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid data type combination
                                                  TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching shapes
                                                }),
         framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                                 TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                               })),
         framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                 TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                               })),
-        framework::dataset::make("Expected", { true, true, false, false, false})),
+        framework::dataset::make("Expected", { true, true, false, false})),
         input1_info, input2_info, output_info, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLElementwiseSquaredDiff::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false))) == expected, framework::LogLevel::ERRORS);
@@ -110,29 +106,6 @@ using CLElementwiseSquaredDiffFixture = ElementwiseSquaredDiffValidationFixture<
 
 TEST_SUITE(Integer)
 TEST_SUITE(U8)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(),
-               shape)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor<CLTensor>(shape, DataType::U8);
-    CLTensor ref_src2 = create_tensor<CLTensor>(shape, DataType::U8);
-    CLTensor dst      = create_tensor<CLTensor>(shape, DataType::U8);
-
-    // Create and Configure function
-    CLElementwiseSquaredDiff sqdiff;
-    sqdiff.configure(&ref_src1, &ref_src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseSquaredDiffFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), ElementwiseSquaredDiffU8Dataset))
 {
     // Validate output
@@ -141,29 +114,6 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseSquaredDiffFixture<uint8_t>, frame
 TEST_SUITE_END()
 
 TEST_SUITE(S16)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::U8, DataType::S16 })),
-               shape, data_type)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor<CLTensor>(shape, data_type);
-    CLTensor ref_src2 = create_tensor<CLTensor>(shape, DataType::S16);
-    CLTensor dst      = create_tensor<CLTensor>(shape, DataType::S16);
-
-    // Create and Configure function
-    CLElementwiseSquaredDiff sqdiff;
-    sqdiff.configure(&ref_src1, &ref_src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseSquaredDiffFixture<int16_t>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseSquaredDiffS16Dataset))
 {
     // Validate output
@@ -177,29 +127,6 @@ using CLElementwiseSquaredDiffQuantizedFixture = ElementwiseSquaredDiffValidatio
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(),
-               shape)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor<CLTensor>(shape, DataType::QASYMM8);
-    CLTensor ref_src2 = create_tensor<CLTensor>(shape, DataType::QASYMM8);
-    CLTensor dst      = create_tensor<CLTensor>(shape, DataType::QASYMM8);
-
-    // Create and Configure function
-    CLElementwiseSquaredDiff sqdiff;
-    sqdiff.configure(&ref_src1, &ref_src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseSquaredDiffQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(), ElementwiseSquaredDiffQASYMM8Dataset),
                        framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
@@ -211,29 +138,6 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseSquaredDiffQuantizedFixture<uint8_
 }
 TEST_SUITE_END()
 TEST_SUITE(QSYMM16)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(),
-               shape)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor<CLTensor>(shape, DataType::QSYMM16);
-    CLTensor ref_src2 = create_tensor<CLTensor>(shape, DataType::QSYMM16);
-    CLTensor dst      = create_tensor<CLTensor>(shape, DataType::QSYMM16);
-
-    // Create and Configure function
-    CLElementwiseSquaredDiff sqdiff;
-    sqdiff.configure(&ref_src1, &ref_src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseSquaredDiffQuantizedFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(), ElementwiseSquaredDiffQSYMM16Dataset),
                        framework::dataset::make("Src0QInfo", { QuantizationInfo(1.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
@@ -266,29 +170,6 @@ FIXTURE_DATA_TEST_CASE(RunWithActivation, CLElementwiseSquaredDiffFloatFixture<f
 TEST_SUITE_END()
 
 TEST_SUITE(FP32)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(),
-               shape)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor<CLTensor>(shape, DataType::F32);
-    CLTensor ref_src2 = create_tensor<CLTensor>(shape, DataType::F32);
-    CLTensor dst      = create_tensor<CLTensor>(shape, DataType::F32);
-
-    // Create and Configure function
-    CLElementwiseSquaredDiff sqdiff;
-    sqdiff.configure(&ref_src1, &ref_src2, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLElementwiseSquaredDiffFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ElementwiseSquaredDiffFP32Dataset), EmptyActivationFunctionsDataset))
 {
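In each Validate case above, the "Window shrink" row is deleted from three zipped TensorInfo datasets at once, so the "Expected" list shrinks from five entries to four in the same hunk. A toy model of why those lists must move together (plain C++, illustration only, not framework code):

#include <array>
#include <cassert>

int main()
{
    // Parallel rows of a zipped dataset: one input description per expected verdict.
    std::array<const char *, 4> inputs   = { "ok", "ok", "bad type", "bad shape" };
    std::array<bool, 4>         expected = { true, true, false, false };
    assert(inputs.size() == expected.size()); // a removed row must be removed everywhere
    return 0;
}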
diff --git a/tests/validation/CL/EqualizeHistogram.cpp b/tests/validation/CL/EqualizeHistogram.cpp
index 6dbe01a509..18047e6ffa 100644
--- a/tests/validation/CL/EqualizeHistogram.cpp
+++ b/tests/validation/CL/EqualizeHistogram.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,30 +39,6 @@ namespace validation
 TEST_SUITE(CL)
 TEST_SUITE(EqualizeHistogram)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), framework::dataset::make("DataType", DataType::U8)), shape, data_type)
-{
-    // Create tensors
-    CLTensor src = create_tensor<CLTensor>(shape, data_type);
-    CLTensor dst = create_tensor<CLTensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLEqualizeHistogram equalize_histogram;
-    equalize_histogram.configure(&src, &dst);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src.info()->valid_region(), valid_region);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 8).required_padding();
-    validate(src.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 template <typename T>
 using CLEqualizeHistogramFixture = EqualizeHistogramValidationFixture<CLTensor, CLAccessor, CLEqualizeHistogram, T>;
diff --git a/tests/validation/CL/Erode.cpp b/tests/validation/CL/Erode.cpp
index 76d30afb50..303e65f727 100644
--- a/tests/validation/CL/Erode.cpp
+++ b/tests/validation/CL/Erode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,41 +50,6 @@ constexpr BorderSize border_size(filter_size / 2); /* Border size of the kerne
 TEST_SUITE(CL)
 TEST_SUITE(Erode)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)),
-                                                                   datasets::BorderModes()),
-               shape, data_type, border_mode)
-{
-    // Create tensors
-    CLTensor src = create_tensor<CLTensor>(shape, data_type);
-    CLTensor dst = create_tensor<CLTensor>(shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLErode erode;
-    erode.configure(&src, &dst, border_mode);
-
-    // Validate valid region
-    const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), border_size);
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 8);
-    calculator.set_border_size(1);
-    calculator.set_border_mode(border_mode);
-
-    const PaddingSize dst_padding = calculator.required_padding();
-
-    calculator.set_accessed_elements(16);
-    calculator.set_access_offset(-1);
-
-    const PaddingSize src_padding = calculator.required_padding();
-
-    validate(src.info()->padding(), src_padding);
-    validate(dst.info()->padding(), dst_padding);
-}
-
 template <typename T>
 using CLErodeFixture = ErodeValidationFixture<CLTensor, CLAccessor, CLErode, T>;
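The Erode diff above also kept a reminder of how filter kernels size their borders: border_size is filter_size / 2, and the deleted Configuration case derived the source padding from that border plus the vector read pattern (16 elements read starting at offset -1). The underlying rule is easy to restate; the snippet below is an illustration only, not part of the patch.

// For an odd k x k filter, each output pixel reads (k / 2) pixels beyond
// the edge on every side; for the 3x3 erode above that is a 1-pixel border.
constexpr unsigned int filter_size = 3;
constexpr unsigned int border_size = filter_size / 2;
static_assert(border_size == 1, "3x3 erode reads one pixel past each edge");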
diff --git a/tests/validation/CL/FFT.cpp b/tests/validation/CL/FFT.cpp
index 12d53b522e..1115ddcd8b 100644
--- a/tests/validation/CL/FFT.cpp
+++ b/tests/validation/CL/FFT.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -71,30 +71,6 @@ constexpr float tolerance_num = 0.07f; /**< Tolerance number */
 TEST_SUITE(CL)
 TEST_SUITE(FFT1D)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(shapes_1d, data_types),
-               shape, data_type)
-{
-    // Create tensors
-    CLTensor src = create_tensor<CLTensor>(shape, data_type, 2);
-    CLTensor dst = create_tensor<CLTensor>(shape, data_type, 2);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLFFT1D fft1d;
-    fft1d.configure(&src, &dst, FFT1DInfo());
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src.info()->valid_region(), valid_region);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    validate(src.info()->padding(), PaddingSize());
-    validate(dst.info()->padding(), PaddingSize());
-}
-
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
@@ -141,30 +117,6 @@ TEST_SUITE_END() // FFT1D
 
 TEST_SUITE(FFT2D)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(shapes_2d, data_types),
-               shape, data_type)
-{
-    // Create tensors
-    CLTensor src = create_tensor<CLTensor>(shape, data_type, 2);
-    CLTensor dst = create_tensor<CLTensor>(shape, data_type, 2);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLFFT2D fft2d;
-    fft2d.configure(&src, &dst, FFT2DInfo());
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(src.info()->valid_region(), valid_region);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    validate(src.info()->padding(), PaddingSize());
-    validate(dst.info()->padding(), PaddingSize());
-}
-
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
diff --git a/tests/validation/CL/FastCorners.cpp b/tests/validation/CL/FastCorners.cpp
index 37ffb51c77..a7b29839b5 100644
--- a/tests/validation/CL/FastCorners.cpp
+++ b/tests/validation/CL/FastCorners.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,8 +44,6 @@ namespace validation
{
 namespace
 {
-/* Radius of the Bresenham circle around the candidate point */
-const unsigned int bresenham_radius = 3;
 /* Tolerance used to compare corner strengths */
 const AbsoluteTolerance<float> tolerance(0.5f);
 } // namespace
@@ -53,42 +51,6 @@ const AbsoluteTolerance<float> tolerance(0.5f);
 TEST_SUITE(CL)
 TEST_SUITE(FastCorners)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(concat(datasets::Small2DShapes(), datasets::Large2DShapes()),
-                                                                                   framework::dataset::make("Format", Format::U8)),
-                                                                           framework::dataset::make("SuppressNonMax", { false, true })),
-                                                                   framework::dataset::make("BorderMode", BorderMode::UNDEFINED)),
-               shape, format, suppress_nonmax, border_mode)
-{
-    std::mt19937 gen(library->seed());
-    std::uniform_int_distribution<uint8_t> int_dist(0, 255);
-    std::uniform_real_distribution<float>  real_dist(0, 255);
-
-    const uint8_t constant_border_value = int_dist(gen);
-    const float   threshold             = real_dist(gen);
-
-    // Create tensors
-    CLTensor src = create_tensor<CLTensor>(shape, data_type_from_format(format));
-    src.info()->set_format(format);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    CLKeyPointArray corners;
-    unsigned int    num_corners;
-
-    // Create and configure function
-    CLFastCorners fast_corners;
-    fast_corners.configure(&src, threshold, suppress_nonmax, &corners, &num_corners, border_mode, constant_border_value);
-
-    // Validate padding
-    PaddingCalculator calculator(shape.x(), 1); // elems_processed
-
-    calculator.set_border_size(bresenham_radius);
-    calculator.set_access_offset(-bresenham_radius);
-    calculator.set_accessed_elements(7); // elems_read
-
-    validate(src.info()->padding(), calculator.required_padding());
-}
-
 template <typename T>
 using CLFastCornersFixture = FastCornersValidationFixture<CLTensor, CLAccessor, CLFastCorners, T>;
diff --git a/tests/validation/CL/Fill.cpp b/tests/validation/CL/Fill.cpp
index b86dae10fd..38950079da 100644
--- a/tests/validation/CL/Fill.cpp
+++ b/tests/validation/CL/Fill.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/FillBorder.cpp b/tests/validation/CL/FillBorder.cpp
index e0b283b56b..e2afd6494e 100644
--- a/tests/validation/CL/FillBorder.cpp
+++ b/tests/validation/CL/FillBorder.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/Globals.h"
 #include "tests/datasets/BorderModeDataset.h"
diff --git a/tests/validation/CL/Flatten.cpp b/tests/validation/CL/Flatten.cpp
index a00041b0a4..04f720f7e5 100644
--- a/tests/validation/CL/Flatten.cpp
+++ b/tests/validation/CL/Flatten.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Floor.cpp b/tests/validation/CL/Floor.cpp
index 58645b9d85..2961cfa3f2 100644
--- a/tests/validation/CL/Floor.cpp
+++ b/tests/validation/CL/Floor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/FuseBatchNormalization.cpp b/tests/validation/CL/FuseBatchNormalization.cpp
index 0736250727..548feab2ed 100644
--- a/tests/validation/CL/FuseBatchNormalization.cpp
+++ b/tests/validation/CL/FuseBatchNormalization.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/GEMM.cpp b/tests/validation/CL/GEMM.cpp
index c9540c352a..392eeb1510 100644
--- a/tests/validation/CL/GEMM.cpp
+++ b/tests/validation/CL/GEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
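The GEMMLowp.cpp diff that follows does two things: it drops another Configuration-only case, and it folds the three type-specific fixed-point output-stage suites into a single QuantizeDownInt32ScaleByFixedPoint suite keyed by output data type. It also corrects the signed clamp ceiling from 128 to 127, the actual int8 maximum. The resulting nesting, sketched with the framework's suite macros (structure only, no test bodies, assuming the test framework headers of this repository):

TEST_SUITE(QuantizeDownInt32ScaleByFixedPoint)
TEST_SUITE(QASYMM8)        // was QuantizeDownInt32ToUint8ScaleByFixedPoint
TEST_SUITE_END()           // QASYMM8
TEST_SUITE(QASYMM8_SIGNED) // was QuantizeDownInt32ToInt8ScaleByFixedPoint
TEST_SUITE_END()           // QASYMM8_SIGNED
TEST_SUITE(QSYMM16)        // was QuantizeDownInt32ToInt16ScaleByFixedPoint
TEST_SUITE_END()           // QSYMM16
TEST_SUITE_END()           // QuantizeDownInt32ScaleByFixedPoint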
diff --git a/tests/validation/CL/GEMMLowp.cpp b/tests/validation/CL/GEMMLowp.cpp
index 29649d8c9f..5a1971b54c 100644
--- a/tests/validation/CL/GEMMLowp.cpp
+++ b/tests/validation/CL/GEMMLowp.cpp
@@ -54,27 +54,6 @@ TEST_SUITE(GEMMLowp)
 TEST_SUITE(MatrixMultiplyCore)
 using CLGEMMLowpMatrixMultiplyCoreFixture = GEMMLowpMatrixMultiplyCoreValidationFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore>;
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset(),
-               shape_a, shape_b, shape_c, a_offset, b_offset)
-{
-    // Create tensors
-    CLTensor a = create_tensor<CLTensor>(shape_a, DataType::QASYMM8);
-    CLTensor b = create_tensor<CLTensor>(shape_b, DataType::QASYMM8);
-    CLTensor c = create_tensor<CLTensor>(shape_c, DataType::S32);
-
-    a.info()->set_quantization_info(QuantizationInfo(1.0f / 255, a_offset));
-    b.info()->set_quantization_info(QuantizationInfo(1.0f / 255, b_offset));
-
-    ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLGEMMLowpMatrixMultiplyCore gemmlowp_mm;
-    // TODO (giaiod01) COMPMID-1672 - Extending the test to validate add bias in offset contribution
-    gemmlowp_mm.configure(&a, &b, nullptr, &c);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
 {
     // Validate output
@@ -206,7 +185,10 @@ TEST_SUITE_END() // BoundedReLu
 TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE_END() // QuantizeDownInt32Scale
 
-TEST_SUITE(QuantizeDownInt32ToUint8ScaleByFixedPoint)
+TEST_SUITE(QuantizeDownInt32ScaleByFixedPoint)
+
+TEST_SUITE(QASYMM8)
+
 const auto quantize_down_int32_to_uint8_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1, 2)
                                                                     * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", 0) * framework::dataset::make("max", 255) * framework::dataset::make("addBias", { false, true });
@@ -217,47 +199,6 @@ const auto quantize_down_int32_to_uint8_scale_by_fixedpoint_relu_cases = framewo
 
 using CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture = GEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointValidationFixture<CLTensor, CLAccessor, CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint>;
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_by_fixedpoint_cases),
-               shape, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max, add_bias)
-{
-    TensorShape shape_bias(shape[0]);
-
-    // Create tensors
-    CLTensor in   = create_tensor<CLTensor>(shape, DataType::S32);
-    CLTensor bias = create_tensor<CLTensor>(shape_bias, DataType::S32);
-    CLTensor out  = create_tensor<CLTensor>(shape, DataType::QASYMM8);
-
-    ARM_COMPUTE_EXPECT(in.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(out.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint output_stage;
-    output_stage.configure(&in, add_bias ? &bias : nullptr, &out, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
-
-    // Validate valid region input and output
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(in.info()->valid_region(), valid_region);
-    validate(out.info()->valid_region(), valid_region);
-
-    // Validate valid region bias
-    if(add_bias)
-    {
-        const ValidRegion valid_region_bias = shape_to_valid_region(shape_bias);
-        validate(bias.info()->valid_region(), valid_region_bias);
-    }
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 4).required_padding();
-    validate(in.info()->padding(), padding);
-    validate(out.info()->padding(), padding);
-
-    if(add_bias)
-    {
-        validate(bias.info()->padding(), padding);
-    }
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
                        quantize_down_int32_to_uint8_scale_by_fixedpoint_cases))
 {
@@ -287,57 +228,16 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedP
     validate(CLAccessor(_target), _reference);
 }
 TEST_SUITE_END() // BoundedReLu
-TEST_SUITE_END() // QuantizeDownInt32ToUint8ScaleByFixedPoint
 
-TEST_SUITE(QuantizeDownInt32ToInt8ScaleByFixedPoint)
+TEST_SUITE_END() // QASYMM8
+TEST_SUITE(QASYMM8_SIGNED)
 const auto quantize_down_int32_to_int8_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1, 2)
-                                                                   * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", -128) * framework::dataset::make("max", 128) * framework::dataset::make("addBias", { false, true });
+                                                                   * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", -128) * framework::dataset::make("max", 127) * framework::dataset::make("addBias", { false, true });
 
 const auto quantize_down_int32_to_int8_scale_by_fixedpoint_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1, 2)
                                                                         * framework::dataset::make("result_offset_after_shift", 2, 3) * framework::dataset::make("min", -128, -126) * framework::dataset::make("max", 110, 112) * framework::dataset::make("addBias", { false, true });
 
 using CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture = GEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointValidationFixture<CLTensor, CLAccessor, CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint>;
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_int8_scale_by_fixedpoint_cases),
-               shape, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max, add_bias)
-{
-    TensorShape shape_bias(shape[0]);
-
-    // Create tensors
-    CLTensor in   = create_tensor<CLTensor>(shape, DataType::S32);
-    CLTensor bias = create_tensor<CLTensor>(shape_bias, DataType::S32);
-    CLTensor out  = create_tensor<CLTensor>(shape, DataType::QASYMM8_SIGNED);
-
-    ARM_COMPUTE_EXPECT(in.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(out.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint output_stage;
-    output_stage.configure(&in, add_bias ? &bias : nullptr, &out, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
-
-    // Validate valid region input and output
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(in.info()->valid_region(), valid_region);
-    validate(out.info()->valid_region(), valid_region);
-
-    // Validate valid region bias
-    if(add_bias)
-    {
-        const ValidRegion valid_region_bias = shape_to_valid_region(shape_bias);
-        validate(bias.info()->valid_region(), valid_region_bias);
-    }
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 4).required_padding();
-    validate(in.info()->padding(), padding);
-    validate(out.info()->padding(), padding);
-
-    if(add_bias)
-    {
-        validate(bias.info()->padding(), padding);
-    }
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
                        quantize_down_int32_to_int8_scale_by_fixedpoint_cases))
 {
@@ -354,8 +254,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPo
 }
 TEST_SUITE_END() // BoundedReLu
-TEST_SUITE_END() // QuantizeDownInt32ToInt8ScaleByFixedPoint
 
-TEST_SUITE(QuantizeDownInt32ToInt16ScaleByFixedPoint)
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE(QSYMM16)
 const auto quantize_down_int32_to_int16_scale_by_fixedpoint_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1, 2)
@@ -380,37 +280,6 @@ const auto quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_relu_case
 
 using CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture = GEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointValidationFixture<CLTensor, CLAccessor, CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint>;
 
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
-    framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::S32),
-                                             TensorInfo(TensorShape(21U, 13U), 1, DataType::S32), // Wrong output data type
-                                           }),
-    framework::dataset::make("InputBInfo",{ TensorInfo(TensorShape(21U), 1, DataType::S32),
-                                            TensorInfo(TensorShape(21U), 1, DataType::S32),
-                                          })),
-    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(21U, 13U), 1, DataType::QSYMM16),
-                                            TensorInfo(TensorShape(20U, 13U), 1, DataType::S32),
-                                          })),
-    framework::dataset::make("Min",{ -205,
-                                     -180,
-                                   })),
-    framework::dataset::make("Max",{ 205,
-                                     180,
-                                   })),
-    framework::dataset::make("Expected", { true, false })),
-    a_info, b_info, output_info, min, max, expected)
-{
-    // Lock tensors
-    Status status = CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(&a_info.clone()->set_is_resizable(true),
-                                                                                  &b_info.clone()->set_is_resizable(true),
-                                                                                  &output_info.clone()->set_is_resizable(true),
-                                                                                  min,
-                                                                                  max);
-    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
-}
-// clang-format on
-// *INDENT-ON*
 TEST_SUITE(NoRelu)
 TEST_SUITE(MultSmallerEq1)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(),
@@ -447,7 +316,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedP
 }
 TEST_SUITE_END() // MultGreater1
 TEST_SUITE_END() // BoundedReLu
-TEST_SUITE_END() // QuantizeDownInt32ToInt16ScaleByFixedPoint
+TEST_SUITE_END() // QSYMM16
+TEST_SUITE_END() // QuantizeDownInt32ScaleByFixedPoint
 
 TEST_SUITE(QuantizeDownInt32ScaleByFloat)
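Every GEMM test file below carries the same mechanical change: kernel headers left the public arm_compute/core/CL/kernels/ tree for the internal src/core/CL/kernels/ tree, so tests that still instantiate kernels directly now include them by their in-tree path (the validation suite builds inside the source tree, so it can reach into src/). The pattern, in one line each; the commented line is the pre-patch form:

// Before: the kernel header was part of the installed public API.
// #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
// After: the header is internal and is included by its in-tree path.
#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"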
diff --git a/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp b/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp
index ce000bd8e1..1cfeac59af 100644
--- a/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp
+++ b/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp
@@ -21,9 +21,9 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/framework/Asserts.h"
diff --git a/tests/validation/CL/GEMMLowpMatrixMultiplyReshaped.cpp b/tests/validation/CL/GEMMLowpMatrixMultiplyReshaped.cpp
index 16e4a137eb..0c651cddc2 100644
--- a/tests/validation/CL/GEMMLowpMatrixMultiplyReshaped.cpp
+++ b/tests/validation/CL/GEMMLowpMatrixMultiplyReshaped.cpp
@@ -21,11 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
+#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
+#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/framework/Asserts.h"
diff --git a/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp b/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp
index d8618bd881..fa256280ca 100644
--- a/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp
+++ b/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp
@@ -21,12 +21,12 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
+#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/PaddingCalculator.h"
diff --git a/tests/validation/CL/GEMMMatrixMultiply.cpp b/tests/validation/CL/GEMMMatrixMultiply.cpp
index e521dd5a02..5d2e211d91 100644
--- a/tests/validation/CL/GEMMMatrixMultiply.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiply.cpp
@@ -21,12 +21,12 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/PaddingCalculator.h"
diff --git a/tests/validation/CL/GEMMMatrixMultiplyInterleavedTransposed.cpp b/tests/validation/CL/GEMMMatrixMultiplyInterleavedTransposed.cpp
index fcbf8ce110..b2701e7f6c 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyInterleavedTransposed.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyInterleavedTransposed.cpp
@@ -21,14 +21,14 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
+#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
+#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/PaddingCalculator.h"
diff --git a/tests/validation/CL/GEMMMatrixMultiplyNative.cpp b/tests/validation/CL/GEMMMatrixMultiplyNative.cpp
index 6ba5012d15..ec6b87fbae 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyNative.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyNative.cpp
@@ -21,12 +21,12 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h"
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/PaddingCalculator.h"
@@ -186,55 +186,6 @@ void validate_configuration(unsigned int m_value, unsigned int n_value, unsigned
     CLGEMMMatrixMultiplyNative gemm;
     gemm.configure(&lhs, &rhs, &bias, &dst, 1.0f, 1.0f, lhs_info, rhs_info, kernel_info);
 }
-/** Zero padding test */
-bool validate_zero_padding(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value, unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, bool broadcast_bias, DataType data_type, const ActivationLayerInfo &act_info)
-{
-    const unsigned int M = m_value;
-    const unsigned int N = n_value;
-    const unsigned int K = k_value;
-
-    GEMMLHSMatrixInfo lhs_info;
-    lhs_info.m0 = m0_value;
-    lhs_info.k0 = k0_value;
-
-    GEMMRHSMatrixInfo rhs_info;
-    rhs_info.n0 = n0_value;
-    rhs_info.k0 = k0_value;
-
-    GEMMKernelInfo kernel_info;
-    kernel_info.m               = M;
-    kernel_info.n               = N;
-    kernel_info.k               = K;
-    kernel_info.broadcast_bias  = broadcast_bias;
-    kernel_info.activation_info = act_info;
-
-    const TensorShape lhs_shape(K, M, b_value);
-    const TensorShape rhs_shape(N, K, b_value);
-    const TensorShape bias_shape(N,
-                                 broadcast_bias? 1 : M,
-                                 broadcast_bias? 1 : b_value);
-    const TensorShape dst_shape = compute_mm_shape(TensorInfo(lhs_shape, 1, data_type),
-                                                   TensorInfo(rhs_shape, 1, data_type),
-                                                   kernel_info);
-
-    // Create tensors
-    CLTensor lhs  = create_tensor<CLTensor>(lhs_shape, data_type);
-    CLTensor rhs  = create_tensor<CLTensor>(rhs_shape, data_type);
-    CLTensor bias = create_tensor<CLTensor>(bias_shape, data_type);
-    CLTensor dst  = create_tensor<CLTensor>(dst_shape, data_type);
-
-    ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLGEMMMatrixMultiplyNative gemm;
-    gemm.configure(&lhs, &rhs, &bias, &dst, 1.0f, 1.0f, lhs_info, rhs_info, kernel_info);
-
-    // Padding can be added along rhs and bias's X dimension
-    return dst.info()->padding().empty() && lhs.info()->padding().empty() && bias.info()->padding().bottom == 0 && bias.info()->padding().top == 0;
-}
 } // namespace
 
 TEST_SUITE(CL)
@@ -256,29 +207,6 @@ m_value, n_value, k_value, b_value, m0_value, n0_value, k0_value, broadcast_bias
     validate_configuration(m_value, n_value, k_value, b_value, m0_value, n0_value, k0_value, broadcast_bias, DataType::F32, act_value);
 }
 
-/** Validate zero padding tests
- *
- * A series of validation tests to check that no padding is added as part of configuration for 4 different scenarios.
- *
- * Checks performed in order:
- *     - No partial blocks in both x and y dimensions
- *     - Partial blocks in x dimension
- *     - Partial blocks in y dimension
- *     - Partial blocks in both x and y dimensions
- *     - No blocks in both x and y dimensions, scalar store (N0==1)
- *     - Special case: partial_n0 == 5 (vstore1 should be invoked instead of vstore_partial_1)
- */
-DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(zip(zip(
-framework::dataset::make("M", { 24, 64, 101, 1, 50, 256, }),
-framework::dataset::make("N", { 48, 29, 16, 122, 20, 21, })),
-framework::dataset::make("M0", { 4, 8, 7, 2, 1, 8, })),
-framework::dataset::make("N0", { 4, 4, 16, 3, 1, 8, })),
-m_value, n_value, m0_value, n0_value)
-{
-    bool status = validate_zero_padding(m_value, n_value, 23, 1, m0_value, n0_value, 4, false, DataType::F32, ActivationLayerInfo());
-    ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmallBoundaryHandlingPartialInXPartialInY, CLGEMMMatrixMultiplyNativeFixture<float>, framework::DatasetMode::ALL,
                 combine(combine(
                                 framework::dataset::make("M", 3),
diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp
index d7853f3ea7..52afb716e4 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp
@@ -21,14 +21,14 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
+#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
+#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/PaddingCalculator.h"
@@ -139,13 +139,13 @@ const auto a_values_nightly = framework::dataset::make("alpha", {1.0f} );
 const auto beta_values_nightly = framework::dataset::make("beta", {1.0f} );
 
 /** M0 values to test - Nightly */
-const auto m0_values_nightly = framework::dataset::make("M0", { 2, 3, 4, 8 });
+const auto m0_values_nightly = framework::dataset::make("M0", { 8 });
 
 /** N0 values to test - Nightly */
-const auto n0_values_nightly = framework::dataset::make("N0", { 2, 3, 4, 8 });
+const auto n0_values_nightly = framework::dataset::make("N0", { 8 });
 
 /** K0 values to test - Nightly */
-const auto k0_values_nightly = framework::dataset::make("K0", { 2, 3, 4, 8 });
+const auto k0_values_nightly = framework::dataset::make("K0", { 4 });
 
 /** N0 values to test with export to OpenCL image object - Nightly */
 const auto n0_export_to_cl_image_values_nightly = framework::dataset::make("N0", { 4, 8, 16 });
@@ -154,10 +154,10 @@ const auto n0_export_to_cl_image_values_nightly = framework::dataset::make("N0",
 const auto k0_export_to_cl_image_values_nightly = framework::dataset::make("K0", { 4, 8, 16 });
 
 /** V0 values to test - Nightly */
-const auto v0_values_nightly = framework::dataset::make("V0", 1, 4);
+const auto v0_values_nightly = framework::dataset::make("V0", 1, 3);
 
 /** H0 values to test - Nightly */
-const auto h0_values_nightly = framework::dataset::make("H0", 1, 4);
+const auto h0_values_nightly = framework::dataset::make("H0", 1, 3);
 
 /** Interleave values to test with LHS matrix */
 const auto i_values_lhs = framework::dataset::make("interleave_lhs", { true, false });
@@ -171,100 +171,11 @@ const auto broadcast_bias_values = framework::dataset::make("broadcast_bias", {
 
 /** LHS transposed values */
 const auto lhs_transpose_values = framework::dataset::make("lhs_transpose", { false, true } );
 
-/** Zero padding test */
-bool validate_zero_padding(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value,
-                           unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, unsigned int h0_value,
-                           bool i_value_rhs, bool t_value_rhs, bool export_to_cl_image, bool broadcast_bias, unsigned int depth_output_gemm3d, const ActivationLayerInfo &act_info,
-                           DataType dt_input0, DataType dt_input1, DataType dt_input2, DataType dt_output, float alpha, float beta)
-{
-    const unsigned int M = m_value;
-    const unsigned int N = n_value;
-    const unsigned int K = k_value;
-
-    GEMMLHSMatrixInfo lhs_info;
-    lhs_info.m0 = m0_value;
-    lhs_info.k0 = k0_value;
-
-    GEMMRHSMatrixInfo rhs_info;
-    rhs_info.n0                 = n0_value;
-    rhs_info.k0                 = k0_value;
-    rhs_info.h0                 = h0_value;
-    rhs_info.interleave         = i_value_rhs;
-    rhs_info.transpose          = t_value_rhs;
-    rhs_info.export_to_cl_image = export_to_cl_image;
-
-    GEMMKernelInfo kernel_info;
-    kernel_info.m                       = M;
-    kernel_info.n                       = N;
-    kernel_info.k                       = K;
-    kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
-    kernel_info.reinterpret_input_as_3d = false;
-    kernel_info.broadcast_bias          = broadcast_bias;
-    kernel_info.activation_info         = act_info;
-
-    const TensorShape lhs_shape(K, M, b_value);
-    const TensorShape rhs_shape(N, K, b_value);
-    const TensorShape lhs_shape_reshaped = compute_lhs_reshaped_shape(TensorInfo(lhs_shape, 1, dt_input0),
-                                                                      lhs_info);
-    const TensorShape rhs_shape_reshaped = compute_rhs_reshaped_shape(TensorInfo(rhs_shape, 1, dt_input1),
-                                                                      rhs_info);
-
-    const TensorShape dst_shape = compute_mm_shape(TensorInfo(lhs_shape_reshaped, 1, dt_input0),
-                                                   TensorInfo(rhs_shape_reshaped, 1, dt_input1),
-                                                   kernel_info);
-
-    const TensorShape bias_shape(N,
-                                 M, // Correct calculation should be: broadcast_bias? 1 : M, it's wrong here on purpose just for validation test
-                                 broadcast_bias? 1 : b_value);
-
-    // Create tensors
-    CLTensor lhs_reshaped = create_tensor<CLTensor>(lhs_shape_reshaped, dt_input0);
-    CLTensor rhs_reshaped = create_tensor<CLTensor>(rhs_shape_reshaped, dt_input1);
-    CLTensor bias         = create_tensor<CLTensor>(bias_shape, dt_input2);
-    CLTensor dst          = create_tensor<CLTensor>(dst_shape, dt_output);
-
-    ARM_COMPUTE_EXPECT(lhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(rhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Validate zero-padding
-    CLGEMMMatrixMultiplyReshaped gemm;
-
-    gemm.configure(&lhs_reshaped, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info);
-
-    // Padding can be added along rhs and bias's X/Y dimension
-    return dst.info()->padding().empty() && lhs_reshaped.info()->padding().empty();
-}
 } // namespace
 
 TEST_SUITE(CL)
 TEST_SUITE(GEMMMatrixMultiplyReshaped)
 
-/** Validate zero padding tests
- *
- * A series of validation tests to check the zero padding requirement
- *
- * Checks performed in order:
- *     - No partial blocks in both x and y dimensions
- *     - Partial blocks in x dimension
- *     - Partial blocks in y dimension
- *     - Partial blocks in both x and y dimensions
- *     - Special case: partial_n0 == 9 (vstore1 should be invoked instead of vstore_partial_1)
- */
-DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(zip(zip(
-framework::dataset::make("M", { 24, 64, 101, 1, 103 }),
-framework::dataset::make("N", { 48, 29, 16, 121, 41 })),
-framework::dataset::make("M0", { 4, 8, 4, 2, 4 })),
-framework::dataset::make("N0", { 4, 4, 16, 2, 16 })),
-m_value, n_value, m0_value, n0_value)
-{
-    constexpr DataType dt = DataType::F32;
-
-    bool status = validate_zero_padding(m_value, n_value, 23, 1, m0_value, n0_value, 4, 1, false, false, false, 0, 0, ActivationLayerInfo(), dt, dt, dt, dt, 1.0f, 1.0f);
-    ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
-}
-
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
@@ -340,6 +251,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi
                                                  false /**< reinterpret the input as 3D */,
                                                  true  /**< Flag used to broadcast the bias addition */,
                                                  false /**< wider accumm */,
+                                                 false /**< has pad y */,
                                                  ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
                                                  1 /**< Multiplication factor for the width of the 1xW transposed block */,
                                                  1 /**< Multiplication factor for the height of the 4x4 interleaved block */,
@@ -354,6 +266,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi
                                                  false /**< reinterpret the input as 3D */,
                                                  true  /**< Flag used to broadcast the bias addition */,
                                                  false /**< wider accumm */,
+                                                 false /**< has pad y */,
                                                  ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
                                                  1 /**< Multiplication factor for the width of the 1xW transposed block */,
                                                  1 /**< Multiplication factor for the height of the 4x4 interleaved block */,
@@ -371,6 +284,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi
                                                  false /**< reinterpret the input as 3D */,
                                                  false /**< Flag used to broadcast the bias addition */,
                                                  false /**< wider accumm */,
+                                                 false /**< has pad y */,
                                                  ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
                                                  1 /**< Multiplication factor for the width of the 1xW transposed block */,
                                                  1 /**< Multiplication factor for the height of the 4x4 interleaved block */,
@@ -386,6 +300,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi
                                                  false /**< reinterpret the input as 3D */,
                                                  false /**< Flag used to broadcast the bias addition */,
                                                  true  /**< wider accumm */,
+                                                 true  /**< has pad y */,
                                                  ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
                                                  1 /**< Multiplication factor for the width of the 1xW transposed block */,
                                                  1 /**< Multiplication factor for the height of the 4x4 interleaved block */,
@@ -400,6 +315,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi
                                                  false /**< reinterpret the input as 3D */,
                                                  false /**< Flag used to broadcast the bias addition */,
                                                  false /**< wider accumm */,
+                                                 false /**< has pad y */,
                                                  ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
                                                  1 /**< Multiplication factor for the width of the 1xW transposed block */,
                                                  1 /**< Multiplication factor for the height of the 4x4 interleaved block */,
@@ -573,6 +489,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi
                                                  false /**< reinterpret the input as 3D */,
                                                  true  /**< Flag used to broadcast the bias addition */,
                                                  false /**< wider accumm */,
+                                                 false /**< has pad y */,
                                                  ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
                                                  1 /**< Multiplication factor for the width of the 1xW transposed block */,
                                                  1 /**< Multiplication factor for the height of the 4x4 interleaved block */,
@@ -586,6 +503,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi
                                                  false /**< reinterpret the input as 3D */,
                                                  true  /**< Flag used to broadcast the bias addition */,
                                                  false /**< wider accumm */,
+                                                 false /**< has pad y */,
                                                  ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
                                                  1 /**< Multiplication factor for the width of the 1xW transposed block */,
                                                  1 /**< Multiplication factor for the height of the 4x4 interleaved block */,
@@ -599,6 +517,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi
                                                  false /**< reinterpret the input as 3D */,
                                                  true  /**< Flag used to broadcast the bias addition */,
                                                  false /**< wider accumm */,
+                                                 false /**< has pad y */,
                                                  ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
                                                  1 /**< Multiplication factor for the width of the 1xW transposed block */,
                                                  1 /**< Multiplication factor for the height of the 4x4 interleaved block */,
@@ -613,6 +532,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi
                                                  false /**< reinterpret the input as 3D */,
                                                  true  /**< Flag used to broadcast the bias addition */,
                                                  false /**< wider accumm */,
+                                                 false /**< has pad y */,
                                                  ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
                                                  1 /**< Multiplication factor for the width of the 1xW transposed block */,
                                                  1 /**< Multiplication factor for the height of the 4x4 interleaved block */,
@@ -626,6 +546,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi
                                                  false /**< reinterpret the input as 3D */,
                                                  true  /**< Flag used to broadcast the bias addition */,
                                                  false /**< wider accumm */,
+                                                 false /**< has pad y */,
                                                  ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
                                                  1 /**< Multiplication factor for the width of the 1xW transposed block */,
                                                  1 /**< Multiplication factor for the height of the 4x4 interleaved block */,
@@ -671,8 +592,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedFixture<float>, fra
                                                                    lhs_transpose_values),
                                                                    act_values))
 {
-    // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension
-
if(image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) + // Validate output only if validate() is successful + if(validate_result) { validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); } @@ -705,8 +626,8 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMMatrixMultiplyReshapedFixture, fra lhs_transpose_values), act_values)) { - // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension - if(image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) + // Validate output only if validate() is successful + if(validate_result) { validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); } @@ -738,8 +659,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshaped3DFixture, lhs_transpose_values), act_values)) { - // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension - if(image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) + // Validate output only if validate() is successful + if(validate_result) { validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); } @@ -771,8 +692,8 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture, lhs_transpose_values), act_values)) { - // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension - if(image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) + // Validate output only if validate() is successful + if(validate_result) { validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); } @@ -886,6 +807,274 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture, // Validate output validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); } + +TEST_SUITE(ExportToCLImage) +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip( + framework::dataset::make("Input0Info", { TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F16), // OK or incorrect if cl_khr_image2d_from_buffer not supported + TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F16), // OK or incorrect if cl_khr_image2d_from_buffer not supported + TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F16), // OK or incorrect if cl_khr_image2d_from_buffer not supported + TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F16), // Incorrect k0 + TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F16), // Incorrect n0 + + }), + framework::dataset::make("Input1Info",{ TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F16), + TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F16), + TensorInfo(TensorShape(512U, 8U, 2U), 1, DataType::F16), + TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F16), + TensorInfo(TensorShape(128U, 32U, 2U), 1, DataType::F16), + + })), + framework::dataset::make("Input2Info", { TensorInfo(TensorShape(64U), 1, DataType::F16), + TensorInfo(TensorShape(64U), 1, DataType::F16), + TensorInfo(TensorShape(64U), 1, DataType::F16), + TensorInfo(TensorShape(64U), 1, DataType::F16), + TensorInfo(TensorShape(64U), 1, DataType::F16), + + })), + framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(64U, 64U, 2U), 1, DataType::F16), + TensorInfo(TensorShape(64U, 64U, 2U), 1, DataType::F16), + TensorInfo(TensorShape(64U, 64U, 2U), 1, DataType::F16), + TensorInfo(TensorShape(64U, 64U, 2U), 1, DataType::F16), + TensorInfo(TensorShape(64U, 64U, 2U), 1, DataType::F16), + 
TensorInfo(TensorShape(64U, 64U, 2U), 1, DataType::F16), + + })), + framework::dataset::make("LHSMInfo",{ + GEMMLHSMatrixInfo(4, 4, 1, false, true), + GEMMLHSMatrixInfo(4, 8, 1, false, true), + GEMMLHSMatrixInfo(4, 4, 1, false, true), + GEMMLHSMatrixInfo(4, 2, 1, false, false), + GEMMLHSMatrixInfo(4, 4, 1, false, false), + + })), + framework::dataset::make("RHSMInfo",{ + GEMMRHSMatrixInfo(4, 4, 1, true, true, true), + GEMMRHSMatrixInfo(4, 8, 1, true, true, true), + GEMMRHSMatrixInfo(8, 4, 1, true, true, true), + GEMMRHSMatrixInfo(4, 2, 1, true, false, true), + GEMMRHSMatrixInfo(2, 4, 1, true, false, true), + })), + framework::dataset::make("GEMMInfo",{GEMMKernelInfo( 64 /**< M Number of LHS rows*/, + 64 /**< N Number of RHS columns*/, + 64 /**< K Number of LHS columns*/, + 0 /**< Depth of the output tensor in case is reinterpreted as 3D */, + false /**< reinterpret the input as 3D */, + true /**< Flag used to broadcast the bias addition */, + false /**< wider accumm */, + false /**< has pad y */, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + 1 /**< Multiplication factor for the width of the 1xW transposed block */, + 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, + GEMMLHSMatrixInfo(), + GEMMRHSMatrixInfo(), + 0 /**< Offset to be added to each element of the matrix A */, + 0 /**< Offset to be added to each element of the matrix B */), + GEMMKernelInfo( 64, 64, 64, 0, false, true, false, false, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1, 1, GEMMLHSMatrixInfo(), GEMMRHSMatrixInfo(), 0, 0), + GEMMKernelInfo( 64, 64, 64, 0, false, true, false, false, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1, 1, GEMMLHSMatrixInfo(), GEMMRHSMatrixInfo(), 0, 0), + GEMMKernelInfo( 64, 64, 64, 0, false, true, false, false, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1, 1, GEMMLHSMatrixInfo(), GEMMRHSMatrixInfo(), 0, 0), + GEMMKernelInfo( 64, 64, 64, 0, false, true, false, false, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1, 1, GEMMLHSMatrixInfo(), GEMMRHSMatrixInfo(), 0, 0) + })), + framework::dataset::make("Expected", { true, true, true, false, false })), + input0_info, input1_info, input2_info, output_info, lhs_info, rhs_info, gemm_info, expected) +{ + ARM_COMPUTE_EXPECT(bool(CLGEMMMatrixMultiplyReshapedKernel::validate(&input0_info.clone()->set_is_resizable(true), + &input1_info.clone()->set_is_resizable(true), + &input2_info.clone()->set_is_resizable(true), + &output_info.clone()->set_is_resizable(true),1.f,1.f, + lhs_info, + rhs_info, + gemm_info)) == (expected && image2d_from_buffer_supported(CLKernelLibrary::get().get_device())), framework::LogLevel::ERRORS); +} + +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedFixture<half>, framework::DatasetMode::ALL, + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + m_values, + n_values), + k_values), + b_values), + m0_values_precommit), + n0_values_precommit), + k0_values_precommit), + v0_values_precommit), + h0_values_precommit), + i_values_lhs), + i_values_rhs), + framework::dataset::make("export_to_cl_image_rhs", true)), + framework::dataset::make("DataType", DataType::F16)), + a_values_precommit), + beta_values_precommit), + broadcast_bias_values), + lhs_transpose_values), + act_values)) +{ + // Validate output only if validate() is successful + if(validate_result) + { + validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } + +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMMatrixMultiplyReshapedFixture<half>, framework::DatasetMode::NIGHTLY, + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + m_values, + n_values), + k_values), + b_values), + m0_values_nightly), + n0_export_to_cl_image_values_nightly), + k0_export_to_cl_image_values_nightly), + v0_values_nightly), + h0_values_nightly), + i_values_lhs), + i_values_rhs), + framework::dataset::make("export_to_cl_image_rhs", true)), + framework::dataset::make("DataType", DataType::F16)), + a_values_nightly), + beta_values_nightly), + broadcast_bias_values), + lhs_transpose_values), + act_values)) +{ + // Validate output only if validate() is successful + if(validate_result) + { + validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. 
TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } +} + +FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshaped3DFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + m_w_values, + m_h_values), + n_values), + k_values), + b_values), + m0_values_precommit), + n0_values_precommit), + k0_values_precommit), + v0_values_precommit), + h0_values_precommit), + i_values_lhs), + i_values_rhs), + framework::dataset::make("export_to_cl_image_rhs", true)), + framework::dataset::make("DataType", DataType::F16)), + a_values_precommit), + beta_values_precommit), + lhs_transpose_values), + act_values)) +{ + // Validate output only if validate() is successful + if(validate_result) + { + validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } +} + +FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture, framework::DatasetMode::NIGHTLY, + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + m_w_values, + m_h_values), + n_values), + k_values), + b_values), + m0_values_nightly), + n0_export_to_cl_image_values_nightly), + k0_export_to_cl_image_values_nightly), + v0_values_nightly), + h0_values_nightly), + i_values_lhs), + i_values_rhs), + framework::dataset::make("export_to_cl_image_rhs", true)), + framework::dataset::make("DataType", DataType::F16)), + a_values_nightly), + beta_values_nightly), + lhs_transpose_values), + act_values)) +{ + // Validate output only if validate() is successful + if(validate_result) + { + validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } +} +TEST_SUITE_END() // ExportToCLImage TEST_SUITE_END() // FP16 TEST_SUITE(MixedPrecision) diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp index bd0cd03ca7..7cde3d04ca 100644 --- a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp +++ b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp @@ -21,13 +21,13 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" #include "tests/CL/CLAccessor.h" #include "tests/CL/Helper.h" #include "tests/PaddingCalculator.h" @@ -74,7 +74,7 @@ constexpr float abs_tolerance_f16(0.01f); const auto a_values = framework::dataset::make("alpha", {-0.75f} ); /** Beta values to test */ -const auto beta_values = framework::dataset::make("beta", {-0.35f, 0.0f} ); +const auto beta_values = framework::dataset::make("beta", {-0.35f} ); /** M values to test */ const auto m_values = framework::dataset::make("M", 37); @@ -92,13 +92,12 @@ const auto n_values = framework::dataset::make("N", 51); const auto k_values = framework::dataset::make("K", 23); /** Batch size values to test */ -const auto b_values = framework::dataset::make("batch_size", 1, 3); +const auto b_values = framework::dataset::make("batch_size", 2); /** Activation values to test */ const auto act_values = framework::dataset::make("Activation", { - ActivationLayerInfo(), - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 8.f, 2.f), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, -0.8f, 10.f), }); /** M0 values to test - precommit */ @@ -211,70 +210,6 @@ bool validate_configuration(unsigned int m_value, unsigned int n_value, unsigned CLGEMMMatrixMultiplyReshapedOnlyRHS gemm; return bool(gemm.validate(&lhs, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info)); } - -/** Zero padding test */ -bool validate_zero_padding(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value, - unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, unsigned int h0_value, - bool i_value_rhs, bool t_value_rhs, bool export_to_cl_image, bool broadcast_bias, bool input_as_3d, unsigned int depth_output_gemm3d, const ActivationLayerInfo &act_info, - DataType dt_input0, DataType dt_input1, DataType dt_input2, DataType dt_output, float alpha, float beta) -{ - const unsigned int M = m_value; - const unsigned int N = n_value; - const unsigned int K = k_value; - - GEMMLHSMatrixInfo lhs_info; - lhs_info.m0 = m0_value; - lhs_info.k0 = k0_value; - - GEMMRHSMatrixInfo rhs_info; - rhs_info.n0 = n0_value; - rhs_info.k0 = k0_value; - rhs_info.h0 = h0_value; - rhs_info.interleave = i_value_rhs; - rhs_info.transpose = t_value_rhs; - rhs_info.export_to_cl_image = export_to_cl_image; - - GEMMKernelInfo kernel_info; - kernel_info.m = M; - kernel_info.n = N; - kernel_info.k = K; - kernel_info.depth_output_gemm3d = depth_output_gemm3d; - kernel_info.reinterpret_input_as_3d = input_as_3d; - kernel_info.broadcast_bias = broadcast_bias; - kernel_info.activation_info = act_info; - - const TensorShape lhs_shape(K, M, b_value); - const TensorShape rhs_shape(N, K, b_value); - const TensorShape rhs_shape_reshaped = compute_rhs_reshaped_shape(TensorInfo(rhs_shape, 1, dt_input1), - rhs_info); - - const TensorShape dst_shape = compute_mm_shape(TensorInfo(lhs_shape, 1, dt_input0), - TensorInfo(rhs_shape_reshaped, 1, dt_input1), - kernel_info); - - const TensorShape 
bias_shape(N, - M, // Correct calculation should be: broadcast_bias? 1 : M, it's wrong here on purpose just for validation test - broadcast_bias? 1 : b_value); - - // Create tensors - CLTensor lhs = create_tensor(lhs_shape, dt_input0); - CLTensor rhs_reshaped = create_tensor(rhs_shape_reshaped, dt_input1); - CLTensor bias = create_tensor(bias_shape, dt_input2); - CLTensor dst = create_tensor(dst_shape, dt_output); - - ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(rhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Validate zero-padding - CLGEMMMatrixMultiplyReshapedOnlyRHS gemm; - - gemm.configure(&lhs, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info); - - // Padding can be added along rhs and bias's X dimension - return dst.info()->padding().empty() && lhs.info()->padding().empty() && bias.info()->padding().bottom == 0 && bias.info()->padding().top == 0; -} } // namespace TEST_SUITE(CL) @@ -295,7 +230,7 @@ TEST_SUITE(GEMMMatrixMultiplyReshapedOnlyRHS) * - Incorrect input0 dimension when input is reinterpreted as 3D: input0->dimension(1) * input0->dimension(2) != m * - Correct support for creating an OpenCL image object from buffer * - Incorrect support for creating an OpenCL image object from buffer. N0 is 2 but it can only be 4,8 and 16 - * - Incorrect support for creating an OpenCL image object from buffer. Data type is F16 but it can only be F32 + * - Correct F16 support for creating an OpenCL image object from buffer. */ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip( framework::dataset::make("batch_size", { 1, 1, 1, 1, 1, 1, 2, 1, 1, 1 }), @@ -311,7 +246,7 @@ framework::dataset::make("data_type_input1", { DataType::F32, DataType::F32, framework::dataset::make("data_type_input2", { DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F16})), framework::dataset::make("data_type_output", { DataType::F16, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F16})), framework::dataset::make("Beta", { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 1.0f, 0.0f , 1.0f})), -framework::dataset::make("Expected", { false, false, false, false, false, false, false, true, false, false })), +framework::dataset::make("Expected", { false, false, false, false, false, false, false, true, false, true })), b_value, m0_value, n0_value, k0_value, broadcast_bias, input_as_3d, depth_output_gemm3d, export_to_cl_image, dt_input0, dt_intpu1, dt_input2, dt_output, beta, expected) { bool expected_value = expected; @@ -326,33 +261,6 @@ b_value, m0_value, n0_value, k0_value, broadcast_bias, input_as_3d, depth_output ARM_COMPUTE_EXPECT(status == expected_value, framework::LogLevel::ERRORS); } -/** Validate zero padding tests - * - * A series of validation tests to check that no padding is added as part of configuration for 4 different scenarios. 
- * - * Checks performed in order: - * - No partial blocks in both x and y dimensions - * - Partial blocks in x dimension - * - Partial blocks in y dimension - * - Partial blocks in both x and y dimensions - * - Special case: partial_n0 == 9 (vstore1 should be invoked instead of vstore_partial_1) - */ -DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(zip(zip(zip( -framework::dataset::make("M", { 24, 64, 101, 1, 100 }), -framework::dataset::make("N", { 48, 29, 16, 122, 41 })), -framework::dataset::make("M0", { 4, 8, 7, 2, 1 })), -framework::dataset::make("N0", { 4, 4, 16, 3, 16 })), -framework::dataset::make("export_to_cl_image", { false, true, true, false, false })), -m_value, n_value, m0_value, n0_value, export_to_cl_image) -{ - constexpr DataType dt = DataType::F32; - // Disable export_to_cl_image if the target platform does not support the OpenCL cl_khr_image2d_from_buffer extension - bool actual_export_to_cl_image = image2d_from_buffer_supported(CLKernelLibrary::get().get_device()) && export_to_cl_image; - - bool status = validate_zero_padding(m_value, n_value, 23, 1, m0_value, n0_value, 4, 1, false, false, actual_export_to_cl_image, false, 0, 0, ActivationLayerInfo(), dt, dt, dt, dt, 1.0f, 1.0f); - ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS); -} - TEST_SUITE(Float) TEST_SUITE(FP32) @@ -363,7 +271,15 @@ FIXTURE_DATA_TEST_CASE(RunPrecommitBoundaryHandlingPartialInXPartialInY, CLGEMMM boundary_handling_cases)) { // Validate output - validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); + if(validate_result) + { + validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } } FIXTURE_DATA_TEST_CASE(RunPrecommitBoundaryHandlingPartialInXFullInY, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture, framework::DatasetMode::PRECOMMIT, @@ -373,7 +289,15 @@ FIXTURE_DATA_TEST_CASE(RunPrecommitBoundaryHandlingPartialInXFullInY, CLGEMMMatr boundary_handling_cases)) { // Validate output - validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); + if(validate_result) + { + validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } } FIXTURE_DATA_TEST_CASE(RunPrecommitBoundaryHandlingFullInXFullInY, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture, framework::DatasetMode::PRECOMMIT, @@ -383,7 +307,15 @@ FIXTURE_DATA_TEST_CASE(RunPrecommitBoundaryHandlingFullInXFullInY, CLGEMMMatrixM boundary_handling_cases)) { // Validate output - validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); + if(validate_result) + { + validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. 
TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } } FIXTURE_DATA_TEST_CASE(RunPrecommitBoundaryHandlingFullInXPartialInY, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture, framework::DatasetMode::PRECOMMIT, @@ -393,102 +325,17 @@ FIXTURE_DATA_TEST_CASE(RunPrecommitBoundaryHandlingFullInXPartialInY, CLGEMMMatr boundary_handling_cases)) { // Validate output - validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); -} - -FIXTURE_DATA_TEST_CASE(RunPrecommit, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - m_values, - n_values), - k_values), - b_values), - m0_values_precommit), - n0_values_precommit), - k0_values_precommit), - h0_values), - i_values_rhs), - t_values_rhs), - framework::dataset::make("export_to_cl_image_rhs", false)), - framework::dataset::make("DataType", DataType::F32)), - a_values), - beta_values), - broadcast_bias_values), - act_values)) -{ - // Validate output - validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); -} - -FIXTURE_DATA_TEST_CASE(RunNightly, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - m_values, - n_values), - k_values), - b_values), - m0_values_nightly), - n0_values_nightly), - k0_values_nightly), - h0_values), - i_values_rhs), - t_values_rhs), - framework::dataset::make("export_to_cl_image_rhs", false)), - framework::dataset::make("DataType", DataType::F32)), - a_values), - beta_values), - broadcast_bias_values), - act_values)) -{ - // Validate output - validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); -} - -FIXTURE_DATA_TEST_CASE(RunPrecommit3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - m_w_values, - m_h_values), - n_values), - k_values), - b_values), - m0_values_precommit), - n0_values_precommit), - k0_values_precommit), - h0_values), - i_values_rhs), - t_values_rhs), - framework::dataset::make("export_to_cl_image_rhs", false)), - framework::dataset::make("DataType", DataType::F32)), - a_values), - beta_values), - act_values)) -{ - // Validate output - validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); -} - -FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - m_w_values, - m_h_values), - n_values), - k_values), - b_values), - m0_values_nightly), - n0_values_nightly), - k0_values_nightly), - h0_values), - i_values_rhs), - t_values_rhs), - framework::dataset::make("export_to_cl_image_rhs", false)), - framework::dataset::make("DataType", DataType::F32)), - a_values), - beta_values), - act_values)) -{ - // Validate output - validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); + if(validate_result) + { + validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. 
TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } } -TEST_SUITE(ExportToCLImage) FIXTURE_DATA_TEST_CASE(RunPrecommit, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_values, @@ -501,7 +348,7 @@ FIXTURE_DATA_TEST_CASE(RunPrecommit, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture< h0_values), i_values_rhs), t_values_rhs), - framework::dataset::make("export_to_cl_image_rhs", true)), + framework::dataset::make("export_to_cl_image_rhs", {false, true})), framework::dataset::make("DataType", DataType::F32)), a_values), beta_values), @@ -509,7 +356,7 @@ FIXTURE_DATA_TEST_CASE(RunPrecommit, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture< act_values)) { // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension - if(image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) + if(validate_result) { validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); } @@ -532,7 +379,7 @@ FIXTURE_DATA_TEST_CASE(RunNightly, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_w_values, m_h_values), n_values), @@ -564,18 +411,27 @@ FIXTURE_DATA_TEST_CASE(RunPrecommit3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixt h0_values), i_values_rhs), t_values_rhs), - framework::dataset::make("export_to_cl_image_rhs", true)), + framework::dataset::make("export_to_cl_image_rhs", {false, true})), + framework::dataset::make("has_pad_y", {false, true})), framework::dataset::make("DataType", DataType::F32)), a_values), beta_values), act_values)) { - // Validate output - validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); + // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension + if(validate_result) + { + validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. 
TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } } FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_w_values, m_h_values), n_values), @@ -587,16 +443,24 @@ FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixtur h0_values), i_values_rhs), t_values_rhs), - framework::dataset::make("export_to_cl_image_rhs", true)), + framework::dataset::make("export_to_cl_image_rhs", {false, true})), + framework::dataset::make("has_pad_y", {false, true})), framework::dataset::make("DataType", DataType::F32)), a_values), beta_values), act_values)) { - // Validate output - validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); + // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension + if(validate_result) + { + validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } } -TEST_SUITE_END() // ExportToCLImage TEST_SUITE_END() // FP32 TEST_SUITE(FP16) @@ -612,15 +476,23 @@ FIXTURE_DATA_TEST_CASE(RunPrecommit, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture< h0_values), i_values_rhs), t_values_rhs), - framework::dataset::make("export_to_cl_image_rhs", false)), + framework::dataset::make("export_to_cl_image_rhs", true)), framework::dataset::make("DataType", DataType::F16)), a_values), beta_values), broadcast_bias_values), act_values)) { - // Validate output - validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); + // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension + if(validate_result) + { + validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. 
TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } } FIXTURE_DATA_TEST_CASE(RunNightly, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture, framework::DatasetMode::NIGHTLY, @@ -635,19 +507,27 @@ FIXTURE_DATA_TEST_CASE(RunNightly, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_w_values, m_h_values), n_values), @@ -659,18 +539,27 @@ FIXTURE_DATA_TEST_CASE(RunPrecommit3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixt h0_values), i_values_rhs), t_values_rhs), - framework::dataset::make("export_to_cl_image_rhs", false)), + framework::dataset::make("export_to_cl_image_rhs", true)), + framework::dataset::make("has_pad_y", {false, true})), framework::dataset::make("DataType", DataType::F16)), a_values), beta_values), act_values)) { - // Validate output - validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); + // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension + if(validate_result) + { + validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } } FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_w_values, m_h_values), n_values), @@ -682,16 +571,24 @@ FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixtur h0_values), i_values_rhs), t_values_rhs), - framework::dataset::make("export_to_cl_image_rhs", false)), + framework::dataset::make("export_to_cl_image_rhs", true)), + framework::dataset::make("has_pad_y", {false, true})), framework::dataset::make("DataType", DataType::F16)), a_values), beta_values), act_values)) { - // Validate output - validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); + // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension + if(validate_result) + { + validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } } - TEST_SUITE_END() // FP16 TEST_SUITE_END() // Float diff --git a/tests/validation/CL/GEMMReshapeLHSMatrix.cpp b/tests/validation/CL/GEMMReshapeLHSMatrix.cpp index d9439f63f1..34c37dffde 100644 --- a/tests/validation/CL/GEMMReshapeLHSMatrix.cpp +++ b/tests/validation/CL/GEMMReshapeLHSMatrix.cpp @@ -21,11 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" +#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" #include "tests/CL/CLAccessor.h" #include "tests/CL/Helper.h" #include "tests/PaddingCalculator.h" @@ -82,68 +82,11 @@ const auto i_values = framework::dataset::make("interleave", { true, false }); /** Transpose values to test */ const auto t_values = framework::dataset::make("transpose", { true, false }); -/** Zero padding test */ -bool validate_zero_padding(unsigned int m_value, unsigned int k_value, unsigned int b_value, unsigned int m0_value, unsigned int k0_value, unsigned int v0_value, - bool i_value_lhs, bool t_value_lhs, bool input_as_3d, DataType dt) -{ - const unsigned int M = m_value; - const unsigned int K = k_value; - const unsigned int B = b_value; - - GEMMLHSMatrixInfo lhs_info; - lhs_info.m0 = m0_value; - lhs_info.k0 = k0_value; - lhs_info.v0 = v0_value; - lhs_info.interleave = i_value_lhs; - lhs_info.transpose = t_value_lhs; - - const TensorShape lhs_shape(K, M, B); - const TensorShape lhs_shape_reshaped = compute_lhs_reshaped_shape(TensorInfo(lhs_shape, 1, dt), lhs_info, input_as_3d); - - // Create tensors - CLTensor lhs = create_tensor(lhs_shape, dt); - CLTensor dst = create_tensor(lhs_shape_reshaped, dt); - - ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Validate zero-padding - CLGEMMReshapeLHSMatrixKernel lhs_reshape; - - lhs_reshape.configure(&lhs, &dst, lhs_info, input_as_3d); - - return lhs.info()->padding().empty(); -} } // namespace TEST_SUITE(CL) TEST_SUITE(GEMMReshapeLHSMatrix) -/** Validate zero padding tests for the LHS input tensor - * - * A series of validation tests to test the zero padding requirement - * - * Checks performed in order: - * - Case where M and K are smaller than M0 and K0 - * - Generic test case with batch size = 1 - * - Generic test case with batch size = 4 - * - Generic test case with input_as_3d_value = true - */ -DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( -framework::dataset::make("M", { 1, 23, 63, 101 }), -framework::dataset::make("K", { 1, 47, 29, 27 })), -framework::dataset::make("B", { 1, 1, 4, 7 })), -framework::dataset::make("M0", { 4, 2, 4, 8 })), -framework::dataset::make("K0", { 2, 2, 4, 8 })), -framework::dataset::make("input_as_3d", { false, false, false, true })), -m_value, k_value, b_value, m0_value, k0_value, input_as_3d_value) -{ - constexpr DataType dt = DataType::F32; - - bool status = validate_zero_padding(m_value, k_value, b_value, m0_value, k0_value, 2, false, false, input_as_3d_value, dt); - ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS); -} - FIXTURE_DATA_TEST_CASE(S32, CLGEMMReshapeLHSMatrixFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallGEMMReshape2DShapes(), b_values), diff --git a/tests/validation/CL/GEMMReshapeRHSMatrix.cpp b/tests/validation/CL/GEMMReshapeRHSMatrix.cpp index c7b0752cc8..14048e81ec 100644 --- a/tests/validation/CL/GEMMReshapeRHSMatrix.cpp +++ b/tests/validation/CL/GEMMReshapeRHSMatrix.cpp @@ -21,11 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
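(This patch removes several per-file validate_zero_padding() helpers, like the one deleted above. They all reduced to the same assertion after configuring the kernel; a condensed sketch of that shared check:)

    #include "arm_compute/core/ITensorInfo.h"

    // After configure(), none of the involved tensors should have grown padding.
    bool no_padding_added(const arm_compute::ITensorInfo &src, const arm_compute::ITensorInfo &dst)
    {
        return src.padding().empty() && dst.padding().empty();
    }
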
*/ -#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" #include "tests/CL/CLAccessor.h" #include "tests/CL/Helper.h" #include "tests/PaddingCalculator.h" @@ -46,9 +46,6 @@ namespace { // *INDENT-OFF* // clang-format off -/** Data types */ -const auto data_types = framework::dataset::make("DataType", { DataType::QASYMM8, DataType::F16, DataType::F32 }); - /** Batch size values to test */ const auto b_values = framework::dataset::make("batchsize", 1, 3); @@ -124,19 +121,20 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( ARM_COMPUTE_EXPECT(has_error == expected, framework::LogLevel::ERRORS); } -DATA_TEST_CASE(ValidatePadding, framework::DatasetMode::ALL, combine(combine(combine( +DATA_TEST_CASE(ValidatePadding, framework::DatasetMode::ALL, combine(combine(combine(combine( framework::dataset::make("InputShape", { TensorShape(32U, 16U, 1U), TensorShape(32U, 16U, 2U) }), framework::dataset::make("N0",{ 4 })), framework::dataset::make("K0",{ 4, 8, 16 })), framework::dataset::make("H0",{ 1, 2, 4 })), - input_shape, n0, k0, h0) + framework::dataset::make("DataType",{ DataType::F32, DataType::F16 })), + input_shape, n0, k0, h0, data_type) { CLTensor input; CLTensor output; - input.info()->init(input_shape, 1, DataType::F32); + input.info()->init(input_shape, 1, data_type); unsigned int padding = 0; diff --git a/tests/validation/CL/Gather.cpp b/tests/validation/CL/Gather.cpp index e7f860e35e..f0b87d7d9f 100644 --- a/tests/validation/CL/Gather.cpp +++ b/tests/validation/CL/Gather.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
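(The ValidatePadding case above now sweeps F32 and F16, presumably because the bytes occupied by an N0 x K0 block, and hence the padding the reshape kernel requests, scale with the element size; a hedged illustration of that dependency:)

    #include "arm_compute/core/Utils.h" // data_size_from_type

    // An F32 block occupies twice the bytes of an F16 block of the same geometry.
    size_t reshaped_block_bytes(unsigned int n0, unsigned int k0, arm_compute::DataType dt)
    {
        return static_cast<size_t>(n0) * k0 * arm_compute::data_size_from_type(dt);
    }
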
* * SPDX-License-Identifier: MIT * @@ -25,7 +25,6 @@ #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" #include "arm_compute/runtime/CL/functions/CLGather.h" - #include "tests/CL/CLAccessor.h" #include "tests/datasets/GatherDataset.h" #include "tests/framework/Asserts.h" @@ -98,26 +97,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( // clang-format on // *INDENT-ON* -DATA_TEST_CASE(Configuration, - framework::DatasetMode::ALL, - combine(arm_compute::test::datasets::SmallGatherDataset(), framework::dataset::make("DataType", { DataType::F16, DataType::F32 })), - input_shape, indices_shape, axis, data_type) -{ - const uint32_t actual_axis = wrap_around(axis, static_cast(input_shape.num_dimensions())); - CLTensor src = create_tensor(input_shape, data_type); - CLTensor indices = create_tensor(indices_shape, DataType::U32); - TensorShape dst_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input_shape, indices_shape, actual_axis); - CLTensor dst = create_tensor(dst_shape, data_type); - - // Create and Configure function - CLGather gather; - gather.configure(&src, &indices, &dst, axis); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(dst.info()->tensor_shape()); - validate(dst.info()->valid_region(), valid_region); -} - template using CLGatherFixture = GatherFixture; diff --git a/tests/validation/CL/Gaussian3x3.cpp b/tests/validation/CL/Gaussian3x3.cpp index 10b1a473c6..840be4b93d 100644 --- a/tests/validation/CL/Gaussian3x3.cpp +++ b/tests/validation/CL/Gaussian3x3.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -50,41 +50,6 @@ constexpr BorderSize border_size(filter_size / 2); /* Border size of the kerne TEST_SUITE(CL) TEST_SUITE(Gaussian3x3) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), - datasets::BorderModes()), - shape, data_type, border_mode) -{ - // Create tensors - CLTensor src = create_tensor(shape, data_type); - CLTensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - CLGaussian3x3 gaussian3x3; - gaussian3x3.configure(&src, &dst, border_mode); - - // Validate valid region - const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), border_size); - validate(dst.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 8); - calculator.set_border_size(1); - calculator.set_border_mode(border_mode); - - const PaddingSize dst_padding = calculator.required_padding(); - - calculator.set_accessed_elements(16); - calculator.set_access_offset(-1); - - const PaddingSize src_padding = calculator.required_padding(); - - validate(src.info()->padding(), src_padding); - validate(dst.info()->padding(), dst_padding); -} - template using CLGaussian3x3Fixture = Gaussian3x3ValidationFixture; diff --git a/tests/validation/CL/Gaussian5x5.cpp b/tests/validation/CL/Gaussian5x5.cpp index a33ac4d4d6..53165c7608 100644 --- a/tests/validation/CL/Gaussian5x5.cpp +++ b/tests/validation/CL/Gaussian5x5.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -50,41 +50,6 @@ constexpr BorderSize border_size(filter_size / 2); /* Border size of the kerne TEST_SUITE(CL) TEST_SUITE(Gaussian5x5) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), - datasets::BorderModes()), - shape, data_type, border_mode) -{ - // Create tensors - CLTensor src = create_tensor(shape, data_type); - CLTensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - CLGaussian5x5 gaussian5x5; - gaussian5x5.configure(&src, &dst, border_mode); - - // Validate valid region - const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), border_size); - validate(dst.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 8); - calculator.set_border_size(2); - calculator.set_border_mode(border_mode); - - const PaddingSize dst_padding = calculator.required_padding(); - - calculator.set_accessed_elements(16); - calculator.set_access_offset(-2); - - const PaddingSize src_padding = calculator.required_padding(); - - validate(src.info()->padding(), src_padding); - validate(dst.info()->padding(), dst_padding); -} - template using CLGaussian5x5Fixture = Gaussian5x5ValidationFixture; diff --git a/tests/validation/CL/GaussianPyramid.cpp b/tests/validation/CL/GaussianPyramid.cpp index 4c17cdc88c..4590b0defe 100644 --- a/tests/validation/CL/GaussianPyramid.cpp +++ b/tests/validation/CL/GaussianPyramid.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -20,7 +20,7 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. -*/ + */ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" @@ -69,28 +69,6 @@ inline void validate_gaussian_pyramid(const CLPyramid &target, const std::vector TEST_SUITE(CL) TEST_SUITE(GaussianPyramid) TEST_SUITE(Half) - -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, large_gaussian_pyramid_levels, - shape, border_mode, num_levels) -{ - CLTensor src = create_tensor(shape, DataType::U8); - - // Create pyramid - PyramidInfo pyramid_info(num_levels, SCALE_PYRAMID_HALF, shape, Format::U8); - CLPyramid dst; - dst.init(pyramid_info); - - CLGaussianPyramidHalf gaussian_pyramid_half; - gaussian_pyramid_half.configure(&src, &dst, border_mode, 0); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - - for(size_t level = 0; level < pyramid_info.num_levels(); ++level) - { - ARM_COMPUTE_EXPECT(dst.get_pyramid_level(level)->info()->is_resizable(), framework::LogLevel::ERRORS); - } -} - template using CLGaussianPyramidHalfFixture = GaussianPyramidHalfValidationFixture; diff --git a/tests/validation/CL/GlobalPoolingLayer.cpp b/tests/validation/CL/GlobalPoolingLayer.cpp index 5328fc8448..246368e66d 100644 --- a/tests/validation/CL/GlobalPoolingLayer.cpp +++ b/tests/validation/CL/GlobalPoolingLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * diff --git a/tests/validation/CL/HOGDescriptor.cpp b/tests/validation/CL/HOGDescriptor.cpp index 7c014b5d22..c6b2763dfd 100644 --- a/tests/validation/CL/HOGDescriptor.cpp +++ b/tests/validation/CL/HOGDescriptor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/tests/validation/CL/HOGDetector.cpp b/tests/validation/CL/HOGDetector.cpp index 78edf0fd27..9f74c728cf 100644 --- a/tests/validation/CL/HOGDetector.cpp +++ b/tests/validation/CL/HOGDetector.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/tests/validation/CL/HOGMultiDetection.cpp b/tests/validation/CL/HOGMultiDetection.cpp index 091ff9e9db..5557fde33c 100644 --- a/tests/validation/CL/HOGMultiDetection.cpp +++ b/tests/validation/CL/HOGMultiDetection.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/tests/validation/CL/HarrisCorners.cpp b/tests/validation/CL/HarrisCorners.cpp index 51591bbbde..20a13af53b 100644 --- a/tests/validation/CL/HarrisCorners.cpp +++ b/tests/validation/CL/HarrisCorners.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -53,47 +53,6 @@ const auto data_precommit = combine(framework::dataset::make("GradientSize", { 3 TEST_SUITE(CL) TEST_SUITE(HarrisCorners) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::Small2DShapes(), data_nightly), framework::dataset::make("Format", Format::U8)), shape, - gradient_size, block_size, border_mode, format) -{ - std::mt19937 gen(library->seed()); - std::uniform_real_distribution real_dist(0.f, 0.01f); - - const float threshold = real_dist(gen); - const float sensitivity = real_dist(gen); - - constexpr float max_euclidean_distance = 30.f; - real_dist = std::uniform_real_distribution(0.f, max_euclidean_distance); - const float min_dist = real_dist(gen); - - // Generate a random constant value - std::uniform_int_distribution int_dist(0, 255); - const uint8_t constant_border_value = int_dist(gen); - - // Create tensors - CLTensor src = create_tensor(shape, data_type_from_format(format)); - src.info()->set_format(format); - CLKeyPointArray corners; - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create harris corners configure function - CLHarrisCorners harris_corners; - harris_corners.configure(&src, threshold, min_dist, sensitivity, gradient_size, block_size, &corners, border_mode, constant_border_value); - - // Validate padding - PaddingCalculator calculator(shape.x(), 8); - - calculator.set_border_mode(border_mode); - calculator.set_border_size(gradient_size / 2); - calculator.set_access_offset(-gradient_size / 2); - calculator.set_accessed_elements(16); - - const PaddingSize padding = calculator.required_padding(); - - validate(src.info()->padding(), padding); -} - template using CLHarrisCornersFixture = HarrisCornersValidationFixture; diff --git a/tests/validation/CL/Histogram.cpp b/tests/validation/CL/Histogram.cpp index 643e4f55ab..9d25154898 100644 --- a/tests/validation/CL/Histogram.cpp +++ b/tests/validation/CL/Histogram.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -45,44 +45,6 @@ namespace validation TEST_SUITE(CL) TEST_SUITE(Histogram) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(concat(datasets::Small2DShapes(), datasets::Large2DShapes()), - framework::dataset::make("DataType", DataType::U8)), - shape, data_type) -{ - // Setup Distribution - std::mt19937 gen(library->seed()); - std::uniform_int_distribution distribution_size_t(1, 30); - const size_t num_bins = distribution_size_t(gen); - std::uniform_int_distribution distribution_int32_t(0, 125); - const size_t offset = distribution_int32_t(gen); - std::uniform_int_distribution distribution_uint32_t(1, 255 - offset); - const size_t range = distribution_uint32_t(gen); - CLDistribution1D distribution_dst(num_bins, offset, range); - - // Create tensors - CLTensor src = create_tensor(shape, data_type); - TensorShape dst_shape(num_bins); - CLTensor dst = create_tensor(dst_shape, DataType::U32); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - CLHistogram histogram; - histogram.configure(&src, &distribution_dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(src.info()->valid_region(), valid_region); - const ValidRegion valid_region_dst = shape_to_valid_region(dst_shape); - validate(dst.info()->valid_region(), valid_region_dst); - - // Validate padding - const PaddingSize padding; - validate(src.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - template using CLHistogramFixture = HistogramValidationFixture; diff --git a/tests/validation/CL/Im2Col.cpp b/tests/validation/CL/Im2Col.cpp index 12b082fe13..a31aec4d0c 100644 --- a/tests/validation/CL/Im2Col.cpp +++ b/tests/validation/CL/Im2Col.cpp @@ -21,9 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h" #include "arm_compute/core/Types.h" - +#include "src/core/CL/kernels/CLIm2ColKernel.h" #include "tests/CL/CLAccessor.h" #include "tests/CL/Helper.h" #include "tests/framework/Asserts.h" @@ -139,45 +138,6 @@ using CLIm2ColFixture = Im2ColValidationFixture(input_shape, data_type, 1, qinfo, data_layout); - CLTensor output = create_tensor(output_shape, data_type, 1, qinfo, data_layout); - - CLIm2ColKernel im2col; - im2col.configure(&input, &output, conv_size, pad_stride_info, has_bias); - - // Ensure there're no paddings added at all - const bool no_padding = input.info()->padding().empty() && output.info()->padding().empty(); - ARM_COMPUTE_EXPECT(no_padding, framework::LogLevel::ERRORS); -} /** Test special kernel used for NHWC for 3x3 kernels * * @note 2 elements processed per iteration diff --git a/tests/validation/CL/InstanceNormalizationLayer.cpp b/tests/validation/CL/InstanceNormalizationLayer.cpp index a30e3260c6..a52ebc5bfe 100644 --- a/tests/validation/CL/InstanceNormalizationLayer.cpp +++ b/tests/validation/CL/InstanceNormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/tests/validation/CL/IntegralImage.cpp b/tests/validation/CL/IntegralImage.cpp index 74c5a4aa95..700628cbfc 100644 --- a/tests/validation/CL/IntegralImage.cpp +++ b/tests/validation/CL/IntegralImage.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. 
+ * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -37,30 +37,6 @@ namespace validation { TEST_SUITE(CL) TEST_SUITE(IntegralImage) - -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), shape, data_type) -{ - // Create tensors - CLTensor src = create_tensor(shape, data_type); - CLTensor dst = create_tensor(shape, DataType::U32); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - CLIntegralImage integral_image; - integral_image.configure(&src, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - validate(src.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - template using CLIntegralImageFixture = IntegralImageValidationFixture; diff --git a/tests/validation/CL/L2NormalizeLayer.cpp b/tests/validation/CL/L2NormalizeLayer.cpp index 9502df5ade..bcf68a526c 100644 --- a/tests/validation/CL/L2NormalizeLayer.cpp +++ b/tests/validation/CL/L2NormalizeLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/tests/validation/CL/LSTMLayerQuantized.cpp b/tests/validation/CL/LSTMLayerQuantized.cpp index f975bfb196..fe533ee914 100644 --- a/tests/validation/CL/LSTMLayerQuantized.cpp +++ b/tests/validation/CL/LSTMLayerQuantized.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,7 +22,6 @@ * SOFTWARE. */ #include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h" - #include "tests/CL/CLAccessor.h" #include "tests/PaddingCalculator.h" #include "tests/Utils.h" diff --git a/tests/validation/CL/LaplacianPyramid.cpp b/tests/validation/CL/LaplacianPyramid.cpp index 1307f78526..78f3f2373b 100644 --- a/tests/validation/CL/LaplacianPyramid.cpp +++ b/tests/validation/CL/LaplacianPyramid.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -20,7 +20,7 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
-*/ + */ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLPyramid.h" #include "arm_compute/runtime/CL/CLTensor.h" @@ -43,8 +43,12 @@ namespace validation { namespace { -const auto small_laplacian_pyramid_levels = framework::dataset::make("NumLevels", 2, 3); -const auto large_laplacian_pyramid_levels = framework::dataset::make("NumLevels", 2, 5); +/* Absolute tolerance value for comparing reference's output against implementation's output for DataType::S16 + * Tolerance is needed for calculation uncertainties introduced from the layers + */ +AbsoluteTolerance tolerance_int16(1); +const auto small_laplacian_pyramid_levels = framework::dataset::make("NumLevels", 2, 3); +const auto large_laplacian_pyramid_levels = framework::dataset::make("NumLevels", 2, 5); const auto formats = combine(framework::dataset::make("FormatIn", Format::U8), framework::dataset::make("FormatOut", Format::S16)); @@ -68,7 +72,7 @@ inline void validate_laplacian_pyramid(const CLPyramid &target, const std::vecto border_mode == BorderMode::UNDEFINED); // Validate level - validate(CLAccessor(*level_image), reference[lev], valid_region); + validate(CLAccessor(*level_image), reference[lev], valid_region, tolerance_int16); } } } // namespace @@ -78,39 +82,6 @@ TEST_SUITE(LaplacianPyramid) // *INDENT-OFF* // clang-format off -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine( - concat(datasets::Medium2DShapes(), datasets::Large2DShapes()), - datasets::BorderModes()), - large_laplacian_pyramid_levels), - shape, border_mode, num_levels) -{ - // Create pyramid info - PyramidInfo pyramid_info(num_levels, SCALE_PYRAMID_HALF, shape, Format::S16); - CLPyramid dst_pyramid{}; - dst_pyramid.init(pyramid_info); - - // Create Tensors - CLTensor src = create_tensor(shape, Format::U8); - - // The first two dimensions of the output tensor must match the first two - // dimensions of the tensor in the last level of the pyramid - TensorShape dst_shape(shape); - dst_shape.set(0, dst_pyramid.get_pyramid_level(num_levels - 1)->info()->dimension(0)); - dst_shape.set(1, dst_pyramid.get_pyramid_level(num_levels - 1)->info()->dimension(1)); - CLTensor dst = create_tensor(dst_shape, Format::S16); - - // Create and configure function - CLLaplacianPyramid laplacian_pyramid; - laplacian_pyramid.configure(&src, &dst_pyramid, &dst, border_mode, 0); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - for(size_t level = 0; level < pyramid_info.num_levels(); ++level) - { - ARM_COMPUTE_EXPECT(dst_pyramid.get_pyramid_level(level)->info()->is_resizable(), framework::LogLevel::ERRORS); - } -} using CLLaplacianPyramidFixture = LaplacianPyramidValidationFixture; diff --git a/tests/validation/CL/LaplacianReconstruct.cpp b/tests/validation/CL/LaplacianReconstruct.cpp index c2e1fab5b1..5aba380a5a 100644 --- a/tests/validation/CL/LaplacianReconstruct.cpp +++ b/tests/validation/CL/LaplacianReconstruct.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -20,7 +20,7 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
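(The tolerance added above gives the S16 Laplacian comparison one LSB of slack; in effect the check it loosens is the following, shown here as a standalone sketch:)

    #include <cstdint>

    // An absolute tolerance accepts |target - reference| <= t; t = 1 matches
    // the AbsoluteTolerance<int16_t> tolerance_int16(1) introduced above.
    bool within_abs_tolerance(int16_t target, int16_t reference, int16_t t)
    {
        const int diff = target >= reference ? target - reference : reference - target;
        return diff <= t;
    }
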
-*/ + */ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLPyramid.h" #include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h" @@ -68,48 +68,6 @@ TEST_SUITE(LaplacianReconstruct) // *INDENT-OFF* // clang-format off -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine( - concat(datasets::Medium2DShapes(), datasets::Large2DShapes()), - datasets::BorderModes()), - large_laplacian_reconstruct_levels), - shape, border_mode, num_levels) -{ - // Create pyramid info - PyramidInfo pyramid_info(num_levels, SCALE_PYRAMID_HALF, shape, Format::S16); - CLPyramid dst_pyramid{}; - dst_pyramid.init(pyramid_info); - - // Create Tensors - CLTensor src = create_tensor(shape, DataType::U8); - - // The first two dimensions of the output tensor must match the first two - // dimensions of the tensor in the last level of the pyramid - TensorShape dst_shape(shape); - dst_shape.set(0, dst_pyramid.get_pyramid_level(num_levels - 1)->info()->dimension(0)); - dst_shape.set(1, dst_pyramid.get_pyramid_level(num_levels - 1)->info()->dimension(1)); - CLTensor dst = create_tensor(dst_shape, DataType::S16); - - // The dimensions of the reconstruct are the same as the src shape - CLTensor rec_dst = create_tensor(shape, DataType::U8); - - // Create and configure pyramid function - CLLaplacianPyramid laplacian_pyramid; - laplacian_pyramid.configure(&src, &dst_pyramid, &dst, border_mode, 0); - - // Create and configure reconstruct function - CLLaplacianReconstruct laplacian_reconstruct; - laplacian_reconstruct.configure(&dst_pyramid, &dst, &rec_dst, border_mode, 0); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - for(size_t level = 0; level < pyramid_info.num_levels(); ++level) - { - ARM_COMPUTE_EXPECT(dst_pyramid.get_pyramid_level(level)->info()->is_resizable(), framework::LogLevel::ERRORS); - } - - ARM_COMPUTE_EXPECT(rec_dst.info()->is_resizable(), framework::LogLevel::ERRORS); -} using CLLaplacianReconstructFixture = LaplacianReconstructValidationFixture; diff --git a/tests/validation/CL/LocallyConnected.cpp b/tests/validation/CL/LocallyConnected.cpp deleted file mode 100644 index d32487b99b..0000000000 --- a/tests/validation/CL/LocallyConnected.cpp +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Copyright (c) 2017-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/CL/CLTensorAllocator.h" -#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h" -#include "tests/CL/CLAccessor.h" -#include "tests/PaddingCalculator.h" -#include "tests/datasets/LocallyConnectedDataset.h" -#include "tests/framework/Asserts.h" -#include "tests/framework/Macros.h" -#include "tests/framework/datasets/Datasets.h" -#include "tests/validation/Validation.h" -#include "tests/validation/fixtures/LocallyConnectedFixture.h" - -namespace arm_compute -{ -namespace test -{ -namespace validation -{ -namespace -{ -constexpr AbsoluteTolerance atolerance_f32(0.00001f); /**< Absolute Tolerance value for comparing reference's output against implementation's output for DataType::F32 */ -RelativeTolerance rtolerance_f32(0.05f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */ -} // namespace - -TEST_SUITE(CL) -TEST_SUITE(LocallyConnected) - -// *INDENT-OFF* -// clang-format off -DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( - framework::dataset::make("InputInfo", { TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Mismatching data type input/weights - TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Mismatching data type input/bias - TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Mismatching data type input/output - TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Mismatching shape input/weights - TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Mismatching shape input/bias - TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Mismatching shape input/output - TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Asymmetric padding - TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Padding required - TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32) - }), - framework::dataset::make("WeightsInfo",{ TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F16), - TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32), - TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32), - TensorInfo(TensorShape(3U, 3U, 5U, 21U, 274U), 1, DataType::F32), - TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32), - TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32), - TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32), - TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32), - TensorInfo(TensorShape(1U, 3U, 5U, 21U, 575U), 1, DataType::F32) - })), - framework::dataset::make("BiasInfo", { TensorInfo(TensorShape(21U, 275U), 1, DataType::F32), - TensorInfo(TensorShape(21U, 275U), 1, DataType::F16), - TensorInfo(TensorShape(21U, 275U), 1, DataType::F32), - TensorInfo(TensorShape(21U, 275U), 1, DataType::F32), - TensorInfo(TensorShape(21U, 274U), 1, DataType::F32), - TensorInfo(TensorShape(21U, 275U), 1, DataType::F32), - TensorInfo(TensorShape(21U, 275U), 1, DataType::F32), - TensorInfo(TensorShape(21U, 275U), 1, DataType::F32), - TensorInfo(TensorShape(21U, 575U), 1, DataType::F32) - })), - framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32), - TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32), - TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F16), - TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32), - TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32), - 
TensorInfo(TensorShape(11U, 25U, 22U), 1, DataType::F32), - TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32), - TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32), - TensorInfo(TensorShape(23U, 25U, 21U), 1, DataType::F32) - })), - framework::dataset::make("PadStride", { PadStrideInfo(2, 1, 0, 0), - PadStrideInfo(2, 1, 0, 0), - PadStrideInfo(2, 1, 0, 0), - PadStrideInfo(2, 1, 0, 0), - PadStrideInfo(2, 1, 0, 0), - PadStrideInfo(2, 1, 0, 0), - PadStrideInfo(2, 1, 1, 0), - PadStrideInfo(2, 1, 0, 0), - PadStrideInfo(1, 1, 0, 0) - })), - framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, true })), - input_info, weights_info, bias_info, output_info, conv_info, expected) -{ - bool is_valid = bool(CLLocallyConnectedLayer::validate(&input_info.clone()->set_is_resizable(false), - &weights_info.clone()->set_is_resizable(false), - &bias_info.clone()->set_is_resizable(false), - &output_info.clone()->set_is_resizable(false), - conv_info)); - ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); -} -// clang-format on -// *INDENT-ON* - -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallLocallyConnectedDataset(), - framework::dataset::make("DataType", DataType::F32)), - src_shape, weights_shape, bias_shape, dst_shape, info, dilation, data_type) -{ - ARM_COMPUTE_UNUSED(dilation); - - // Create tensors - CLTensor src = create_tensor(src_shape, data_type); - CLTensor weights = create_tensor(weights_shape, data_type); - CLTensor bias = create_tensor(bias_shape, data_type); - CLTensor dst = create_tensor(dst_shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function. - CLLocallyConnectedLayer lc; - lc.configure(&src, &weights, &bias, &dst, info); - - // Validate valid region - const ValidRegion dst_valid_region = shape_to_valid_region(dst_shape); - validate(dst.info()->valid_region(), dst_valid_region); -} - -template -using CLLocallyConnectedFixture = LocallyConnectedValidationFixture; -FIXTURE_DATA_TEST_CASE(RunSmall, CLLocallyConnectedFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallLocallyConnectedDataset(), - framework::dataset::make("DataType", - DataType::F32))) -{ - // Validate output - validate(CLAccessor(_target), _reference, rtolerance_f32, 0.f, atolerance_f32); -} - -FIXTURE_DATA_TEST_CASE(RunLarge, CLLocallyConnectedFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeLocallyConnectedDataset(), - framework::dataset::make("DataType", - DataType::F32))) -{ - // Validate output - validate(CLAccessor(_target), _reference, rtolerance_f32, 0.f, atolerance_f32); -} -TEST_SUITE_END() -TEST_SUITE_END() -} // namespace validation -} // namespace test -} // namespace arm_compute diff --git a/tests/validation/CL/LogSoftmaxLayer.cpp b/tests/validation/CL/LogSoftmaxLayer.cpp index 15466affc4..b7f6a66e42 100644 --- a/tests/validation/CL/LogSoftmaxLayer.cpp +++ b/tests/validation/CL/LogSoftmaxLayer.cpp @@ -21,7 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" @@ -59,7 +58,7 @@ TEST_SUITE(FP16) FIXTURE_DATA_TEST_CASE(RunSmall, CLLogSoftmaxLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) + framework::dataset::make("Axis", { 0, -1 }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16); @@ -75,7 +74,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLLogSoftmaxLayerFixture, framework::Data FIXTURE_DATA_TEST_CASE(Run4D, CLLogSoftmaxLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayer4DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) + framework::dataset::make("Axis", { 0, -3, 2 }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16); @@ -86,7 +85,7 @@ TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunSmall, CLLogSoftmaxLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) + framework::dataset::make("Axis", { 0, 1 }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); @@ -99,10 +98,10 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLLogSoftmaxLayerFixture, framework::Dat // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); } -FIXTURE_DATA_TEST_CASE(Run4D, CLLogSoftmaxLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayer4DShapes(), - framework::dataset::make("DataType", DataType::F32)), - framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) +FIXTURE_DATA_TEST_CASE(Run4D, CLLogSoftmaxLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayer4DShapes(), + framework::dataset::make("DataType", DataType::F32)), + framework::dataset::make("Beta", { 1.0f, 2.0f })), + framework::dataset::make("Axis", { 0, -4, 3 }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); diff --git a/tests/validation/CL/Logical.cpp b/tests/validation/CL/Logical.cpp new file mode 100644 index 0000000000..ecdb7c8f53 --- /dev/null +++ b/tests/validation/CL/Logical.cpp @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/functions/CLLogicalAnd.h"
+#include "arm_compute/runtime/CL/functions/CLLogicalNot.h"
+#include "arm_compute/runtime/CL/functions/CLLogicalOr.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/LogicalFixture.h"
+
+namespace
+{
+using namespace arm_compute;
+
+const auto correct_shape = TensorShape(1, 2, 3, 4); // target shape to check against
+const auto wrong_shape   = TensorShape(1, 2, 2, 4); // wrong shape to check validate logic
+const auto correct_dt    = DataType::U8;            // correct data type to check against
+const auto wrong_dt      = DataType::F32;           // wrong data type to check validate logic
+}
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(LogicalOr)
+TEST_SUITE(Validate)
+TEST_CASE(NullPtr, framework::DatasetMode::ALL)
+{
+    Status s = CLLogicalOr::validate(nullptr, nullptr, nullptr);
+    ARM_COMPUTE_EXPECT((bool)s == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(WrongDataType, framework::DatasetMode::ALL)
+{
+    TensorInfo in1{ correct_shape, 1, correct_dt };
+    TensorInfo in2{ correct_shape, 1, wrong_dt };
+    TensorInfo out{ correct_shape, 1, correct_dt };
+
+    Status s = CLLogicalOr::validate(&in1, &in2, &out);
+    ARM_COMPUTE_EXPECT((bool)s == false, framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // Validate
+template <typename T>
+using CLLogicalOrFixture = LogicalOrValidationFixture<CLTensor, CLAccessor, CLLogicalOr, T>;
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLLogicalOrFixture<uint8_t>, framework::DatasetMode::ALL, zip(datasets::SmallShapes(), datasets::SmallShapes()))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CLLogicalOrFixture<uint8_t>, framework::DatasetMode::ALL, datasets::SmallShapesBroadcast())
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // LogicalOr
+
+TEST_SUITE(LogicalAnd)
+TEST_SUITE(Validate)
+TEST_CASE(NullPtr, framework::DatasetMode::ALL)
+{
+    Status s = CLLogicalAnd::validate(nullptr, nullptr, nullptr);
+    ARM_COMPUTE_EXPECT((bool)s == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(WrongDataType, framework::DatasetMode::ALL)
+{
+    TensorInfo in1{ correct_shape, 1, correct_dt };
+    TensorInfo in2{ correct_shape, 1, wrong_dt };
+    TensorInfo out{ correct_shape, 1, correct_dt };
+
+    Status s = CLLogicalAnd::validate(&in1, &in2, &out);
+    ARM_COMPUTE_EXPECT((bool)s == false, framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // Validate
+template <typename T>
+using CLLogicalAndFixture = LogicalAndValidationFixture<CLTensor, CLAccessor, CLLogicalAnd, T>;
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLLogicalAndFixture<uint8_t>, framework::DatasetMode::ALL, zip(datasets::SmallShapes(), datasets::SmallShapes()))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CLLogicalAndFixture<uint8_t>, framework::DatasetMode::ALL, datasets::SmallShapesBroadcast())
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // LogicalAnd
+TEST_SUITE(LogicalNot)
+
+TEST_SUITE(Validate)
+TEST_CASE(NullPtr, framework::DatasetMode::ALL)
+{
+    Status s = CLLogicalNot::validate(nullptr, nullptr);
+    ARM_COMPUTE_EXPECT((bool)s == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(WrongDataType, framework::DatasetMode::ALL)
+{
+    TensorInfo in{ correct_shape, 1, correct_dt };
+    TensorInfo out{ correct_shape, 1, wrong_dt };
+
+    Status s = CLLogicalNot::validate(&in, &out);
+    ARM_COMPUTE_EXPECT((bool)s == false, framework::LogLevel::ERRORS);
+
+    in  = TensorInfo{ correct_shape, 1, wrong_dt };
+    out = TensorInfo{ correct_shape, 1, correct_dt };
+
+    s = CLLogicalNot::validate(&in, &out);
+    ARM_COMPUTE_EXPECT((bool)s == false, framework::LogLevel::ERRORS);
+
+    in  = TensorInfo{ correct_shape, 1, wrong_dt };
+    out = TensorInfo{ correct_shape, 1, wrong_dt };
+
+    s = CLLogicalNot::validate(&in, &out);
+    ARM_COMPUTE_EXPECT((bool)s == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(WrongShape, framework::DatasetMode::ALL)
+{
+    TensorInfo in{ correct_shape, 1, correct_dt };
+    TensorInfo out{ wrong_shape, 1, correct_dt };
+
+    Status s = CLLogicalNot::validate(&in, &out);
+    ARM_COMPUTE_EXPECT((bool)s == false, framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // Validate
+
+template <typename T>
+using CLLogicalNotFixture = LogicalNotValidationFixture<CLTensor, CLAccessor, CLLogicalNot, T>;
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLLogicalNotFixture<uint8_t>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                    DataType::U8)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // LogicalNot
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/Magnitude.cpp b/tests/validation/CL/Magnitude.cpp
index 82bce34d84..f5133e8ce2 100644
--- a/tests/validation/CL/Magnitude.cpp
+++ b/tests/validation/CL/Magnitude.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
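// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: how the new CL logical
// functions exercised by the tests above are typically driven. Assumes a
// default-initialised CLScheduler; shapes and values are hypothetical.
// U8 tensors act as booleans here: zero is false, anything else is true.
// ---------------------------------------------------------------------------
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLLogicalAnd.h"

void logical_and_sketch()
{
    using namespace arm_compute;

    CLScheduler::get().default_init();

    CLTensor a, b, out;
    a.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::U8));
    b.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::U8));
    out.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::U8));

    // Validate first, then configure -- the same contract the NullPtr and
    // WrongDataType cases above probe through CLLogicalAnd::validate()
    CLLogicalAnd logical_and;
    ARM_COMPUTE_ERROR_THROW_ON(CLLogicalAnd::validate(a.info(), b.info(), out.info()));
    logical_and.configure(&a, &b, &out);

    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();

    // ... fill a and b, then execute on the CL device ...
    logical_and.run();
}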
* * SPDX-License-Identifier: MIT * @@ -49,39 +49,11 @@ AbsoluteTolerance tolerance(MagnitudeType magnitude_type) TEST_SUITE(CL) TEST_SUITE(Magnitude) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::S16, DataType::S32 })), - shape, data_type) -{ - // Create tensors - CLTensor src1 = create_tensor(shape, data_type); - CLTensor src2 = create_tensor(shape, data_type); - CLTensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function (default MagnitudeType::L2NORM) - CLMagnitude magnitude; - magnitude.configure(&src1, &src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - - validate(src1.info()->padding(), padding); - validate(src2.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - template using CLMagnitudeFixture = MagnitudeValidationFixture; TEST_SUITE(S16) -FIXTURE_DATA_TEST_CASE(RunSmall, CLMagnitudeFixture, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::Small2DShapes(), framework::dataset::make("Format", Format::S16)), +FIXTURE_DATA_TEST_CASE(RunSmall, CLMagnitudeFixture, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), framework::dataset::make("Format", Format::S16)), framework::dataset::make("MagnitudeType", { MagnitudeType::L1NORM, MagnitudeType::L2NORM }))) { // Validate output @@ -97,7 +69,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLMagnitudeFixture, framework::Dataset TEST_SUITE_END() // S16 TEST_SUITE(S32) -FIXTURE_DATA_TEST_CASE(RunSmall, CLMagnitudeFixture, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::Small2DShapes(), framework::dataset::make("Format", Format::S32)), +FIXTURE_DATA_TEST_CASE(RunSmall, CLMagnitudeFixture, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), framework::dataset::make("Format", Format::S32)), framework::dataset::make("MagnitudeType", { MagnitudeType::L1NORM, MagnitudeType::L2NORM }))) { // Validate output diff --git a/tests/validation/CL/MeanStdDev.cpp b/tests/validation/CL/MeanStdDev.cpp index d69d8c2801..dd59193707 100644 --- a/tests/validation/CL/MeanStdDev.cpp +++ b/tests/validation/CL/MeanStdDev.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -63,31 +63,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip( // clang-format on // *INDENT-ON* -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), framework::dataset::make("DataType", { DataType::U8 })), shape, - data_type) -{ - // Create tensors - CLTensor src = create_tensor(shape, data_type); - - // Create output variables - float mean = 0.f; - float std_dev = 0.f; - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create configure function - CLMeanStdDev mean_std_dev_image; - mean_std_dev_image.configure(&src, &mean, &std_dev); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(src.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 8).required_padding(); - validate(src.info()->padding(), padding); -} - template using CLMeanStdDevFixture = MeanStdDevValidationFixture; diff --git a/tests/validation/CL/MeanStdDevNormalizationLayer.cpp b/tests/validation/CL/MeanStdDevNormalizationLayer.cpp index a355f9eb1c..e77a21ed7f 100644 --- a/tests/validation/CL/MeanStdDevNormalizationLayer.cpp +++ b/tests/validation/CL/MeanStdDevNormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/tests/validation/CL/Median3x3.cpp b/tests/validation/CL/Median3x3.cpp index b61e7c0f3f..0a8936f3c0 100644 --- a/tests/validation/CL/Median3x3.cpp +++ b/tests/validation/CL/Median3x3.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -49,42 +49,6 @@ constexpr BorderSize border_size(filter_size / 2); /* Border size of the kerne TEST_SUITE(CL) TEST_SUITE(Median3x3) - -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), - datasets::BorderModes()), - shape, data_type, border_mode) -{ - // Create tensors - CLTensor src = create_tensor(shape, data_type); - CLTensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - CLMedian3x3 median3x3; - median3x3.configure(&src, &dst, border_mode); - - // Validate valid region - const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), border_size); - validate(dst.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 8); - calculator.set_border_size(1); - calculator.set_border_mode(border_mode); - - const PaddingSize dst_padding = calculator.required_padding(); - - calculator.set_accessed_elements(16); - calculator.set_access_offset(-1); - - const PaddingSize src_padding = calculator.required_padding(); - - validate(src.info()->padding(), src_padding); - validate(dst.info()->padding(), dst_padding); -} - template using CLMedian3x3Fixture = Median3x3ValidationFixture; diff --git a/tests/validation/CL/MinMaxLocation.cpp b/tests/validation/CL/MinMaxLocation.cpp index e4a17187c4..0131cd8cf1 100644 --- a/tests/validation/CL/MinMaxLocation.cpp +++ b/tests/validation/CL/MinMaxLocation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. 
+ * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -42,35 +42,7 @@ TEST_SUITE(MinMaxLocation) template using CLMinMaxLocationFixture = MinMaxLocationValidationFixture, CLArrayAccessor, CLMinMaxLocation, T>; -void validate_configuration(const CLTensor &src, TensorShape shape) -{ - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create output storage - int32_t min = 0; - int32_t max = 0; - CLCoordinates2DArray min_loc(shape.total_size()); - CLCoordinates2DArray max_loc(shape.total_size()); - - // Create and configure function - CLMinMaxLocation min_max_loc; - min_max_loc.configure(&src, &min, &max, &min_loc, &max_loc); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - validate(src.info()->padding(), padding); -} - TEST_SUITE(U8) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), framework::dataset::make("DataType", DataType::U8)), shape, data_type) -{ - // Create tensors - CLTensor src = create_tensor(shape, data_type); - src.info()->set_format(Format::U8); - - validate_configuration(src, shape); -} - FIXTURE_DATA_TEST_CASE(RunSmall, CLMinMaxLocationFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("DataType", DataType::U8))) { @@ -86,15 +58,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLMinMaxLocationFixture, framework::Da TEST_SUITE_END() // U8 TEST_SUITE(S16) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), framework::dataset::make("DataType", DataType::S16)), shape, data_type) -{ - // Create tensors - CLTensor src = create_tensor(shape, data_type); - src.info()->set_format(Format::S16); - - validate_configuration(src, shape); -} - FIXTURE_DATA_TEST_CASE(RunSmall, CLMinMaxLocationFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("DataType", DataType::S16))) { @@ -110,15 +73,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLMinMaxLocationFixture, framework::Da TEST_SUITE_END() // S16 TEST_SUITE(Float) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), framework::dataset::make("DataType", DataType::F32)), shape, data_type) -{ - // Create tensors - CLTensor src = create_tensor(shape, data_type); - src.info()->set_format(Format::F32); - - validate_configuration(src, shape); -} - FIXTURE_DATA_TEST_CASE(RunSmall, CLMinMaxLocationFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("DataType", DataType::F32))) { diff --git a/tests/validation/CL/NonLinearFilter.cpp b/tests/validation/CL/NonLinearFilter.cpp index 325849b45e..08b6e21497 100644 --- a/tests/validation/CL/NonLinearFilter.cpp +++ b/tests/validation/CL/NonLinearFilter.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -41,52 +41,6 @@ namespace validation TEST_SUITE(CL) TEST_SUITE(NonLinearFilter) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), datasets::NonLinearFilterFunctions()), - framework::dataset::make("MaskSize", { 3U, 5U })), - datasets::MatrixPatterns()), - datasets::BorderModes()), - shape, function, mask_size, pattern, border_mode) -{ - std::mt19937 generator(library->seed()); - std::uniform_int_distribution distribution_u8(0, 255); - const uint8_t constant_border_value = distribution_u8(generator); - - // Create the mask - std::vector mask(mask_size * mask_size); - fill_mask_from_pattern(mask.data(), mask_size, mask_size, pattern); - const auto half_mask_size = static_cast(mask_size / 2); - - // Create tensors - CLTensor src = create_tensor(shape, DataType::U8); - CLTensor dst = create_tensor(shape, DataType::U8); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - CLNonLinearFilter filter; - filter.configure(&src, &dst, function, mask_size, pattern, mask.data(), border_mode, constant_border_value); - - // Validate valid region - const ValidRegion dst_valid_region = shape_to_valid_region(shape, border_mode == BorderMode::UNDEFINED, BorderSize(half_mask_size)); - validate(dst.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), ((MatrixPattern::OTHER == pattern) ? 1 : 8)); - calculator.set_border_mode(border_mode); - calculator.set_border_size(half_mask_size); - - const PaddingSize write_padding = calculator.required_padding(PaddingCalculator::Option::EXCLUDE_BORDER); - - calculator.set_accessed_elements(16); - calculator.set_access_offset(-half_mask_size); - - const PaddingSize read_padding = calculator.required_padding(PaddingCalculator::Option::INCLUDE_BORDER); - - validate(src.info()->padding(), read_padding); - validate(dst.info()->padding(), write_padding); -} - template using CLNonLinearFilterFixture = NonLinearFilterValidationFixture; diff --git a/tests/validation/CL/NormalizationLayer.cpp b/tests/validation/CL/NormalizationLayer.cpp index 88949806d5..1aed2786ff 100644 --- a/tests/validation/CL/NormalizationLayer.cpp +++ b/tests/validation/CL/NormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * diff --git a/tests/validation/CL/NormalizePlanarYUVLayer.cpp b/tests/validation/CL/NormalizePlanarYUVLayer.cpp index 58c3b82cd0..1e410a9c6f 100644 --- a/tests/validation/CL/NormalizePlanarYUVLayer.cpp +++ b/tests/validation/CL/NormalizePlanarYUVLayer.cpp @@ -54,31 +54,6 @@ TEST_SUITE(NormalizePlanarYUVLayer) template using CLNormalizePlanarYUVLayerFixture = NormalizePlanarYUVLayerValidationFixture; -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::RandomNormalizePlanarYUVLayerDataset(), framework::dataset::make("DataType", { DataType::F16 })), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - shape0, shape1, dt, data_layout) -{ - TensorShape src_dst_shapes = shape0; - if(data_layout == DataLayout::NHWC) - { - permute(src_dst_shapes, PermutationVector(2U, 0U, 1U)); - } - - // Create tensors - CLTensor src = create_tensor(src_dst_shapes, dt, 1, QuantizationInfo(), data_layout); - CLTensor dst = create_tensor(src_dst_shapes, dt, 1, QuantizationInfo(), data_layout); - CLTensor mean = create_tensor(shape1, dt, 1); - CLTensor sd = create_tensor(shape1, dt, 1); - - // Create and Configure function - CLNormalizePlanarYUVLayer norm; - norm.configure(&src, &dst, &mean, &sd); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(src_dst_shapes); - validate(dst.info()->valid_region(), valid_region); -} - // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( diff --git a/tests/validation/CL/OpticalFlow.cpp b/tests/validation/CL/OpticalFlow.cpp index cf60038d4b..7c1ff5ed57 100644 --- a/tests/validation/CL/OpticalFlow.cpp +++ b/tests/validation/CL/OpticalFlow.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * diff --git a/tests/validation/CL/PReluLayer.cpp b/tests/validation/CL/PReluLayer.cpp index 832bac21e2..82436a9671 100644 --- a/tests/validation/CL/PReluLayer.cpp +++ b/tests/validation/CL/PReluLayer.cpp @@ -46,7 +46,6 @@ namespace RelativeTolerance tolerance_fp32(0.000001f); RelativeTolerance tolerance_fp16(0.001f); -constexpr unsigned int num_elems_processed_per_iteration = 16; /** Input data sets **/ const auto PReluLayerU8Dataset = combine(combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U8)), framework::dataset::make("DataType", @@ -73,23 +72,20 @@ TEST_SUITE(PReluLayer) DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), - TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8), // Window shrink TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), // Invalid data type combination TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching shapes }), framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), - TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16), TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32), })), framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), - TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32), })), - framework::dataset::make("Expected", { true, true, false, false, false})), + framework::dataset::make("Expected", { true, true, false, false})), input1_info, input2_info, output_info, expected) { ARM_COMPUTE_EXPECT(bool(CLPReluLayer::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false))) == expected, framework::LogLevel::ERRORS); @@ -101,29 +97,6 @@ template using CLPReluLayerFixture = PReluLayerValidationFixture; TEST_SUITE(U8) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(), - shape) -{ - // Create tensors - CLTensor ref_src1 = create_tensor(shape, DataType::U8); - CLTensor ref_src2 = create_tensor(shape, DataType::U8); - CLTensor dst = create_tensor(shape, DataType::U8); - - // Create and Configure function - CLPReluLayer prelu; - prelu.configure(&ref_src1, &ref_src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding(); - validate(ref_src1.info()->padding(), padding); - validate(ref_src2.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - FIXTURE_DATA_TEST_CASE(RunSmall, CLPReluLayerFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), PReluLayerU8Dataset)) { // Validate output @@ -136,29 +109,6 @@ using CLPReluLayerQuantizedFixture = PReluLayerValidationQuantizedFixture(shape, DataType::QASYMM8); - CLTensor ref_src2 = create_tensor(shape, DataType::QASYMM8); - CLTensor dst = create_tensor(shape, 
DataType::QASYMM8); - - // Create and Configure function - CLPReluLayer prelu; - prelu.configure(&ref_src1, &ref_src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding(); - validate(ref_src1.info()->padding(), padding); - validate(ref_src2.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - FIXTURE_DATA_TEST_CASE(RunSmall, CLPReluLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(), PReluLayerQASYMM8Dataset), framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })), @@ -188,29 +138,6 @@ TEST_SUITE_END() TEST_SUITE_END() TEST_SUITE(S16) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::U8, DataType::S16 })), - shape, data_type) -{ - // Create tensors - CLTensor ref_src1 = create_tensor(shape, data_type); - CLTensor ref_src2 = create_tensor(shape, DataType::S16); - CLTensor dst = create_tensor(shape, DataType::S16); - - // Create and Configure function - CLPReluLayer prelu; - prelu.configure(&ref_src1, &ref_src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding(); - validate(ref_src1.info()->padding(), padding); - validate(ref_src2.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - FIXTURE_DATA_TEST_CASE(RunSmall, CLPReluLayerFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), PReluLayerS16Dataset)) { // Validate output @@ -228,28 +155,6 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLPReluLayerFixture, framework::DatasetMo TEST_SUITE_END() TEST_SUITE(FP32) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(), shape) -{ - // Create tensors - CLTensor ref_src1 = create_tensor(shape, DataType::F32); - CLTensor ref_src2 = create_tensor(shape, DataType::F32); - CLTensor dst = create_tensor(shape, DataType::F32); - - // Create and Configure function - CLPReluLayer prelu; - prelu.configure(&ref_src1, &ref_src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding(); - validate(ref_src1.info()->padding(), padding); - validate(ref_src2.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - FIXTURE_DATA_TEST_CASE(RunSmall, CLPReluLayerFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), PReluLayerFP32Dataset)) { // Validate output diff --git a/tests/validation/CL/Permute.cpp b/tests/validation/CL/Permute.cpp index ed5d18b8de..9caae81089 100644 --- a/tests/validation/CL/Permute.cpp +++ b/tests/validation/CL/Permute.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
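// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: the CLPermute call that the
// Configuration case removed below used to set up. With a PermutationVector
// perm, input dimension i moves to position perm[i], so (2U, 0U, 1U) is the
// usual NCHW <-> NHWC shuffle in these tests. Shapes are hypothetical.
// ---------------------------------------------------------------------------
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPermute.h"

void permute_sketch()
{
    using namespace arm_compute;

    const PermutationVector perm(2U, 0U, 1U);

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(3U, 7U, 5U), 1, DataType::F32));
    // (3, 7, 5) rearranged by perm: dim0 -> pos2, dim1 -> pos0, dim2 -> pos1
    dst.allocator()->init(TensorInfo(TensorShape(7U, 5U, 3U), 1, DataType::F32));

    CLPermute permute;
    permute.configure(&src, &dst, perm);
}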
* * SPDX-License-Identifier: MIT * @@ -120,29 +120,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( // clang-format on // *INDENT-ON* -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::Small4DShapes(), framework::dataset::make("DataType", { DataType::S8, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32 })), - shape, data_type) -{ - // Define permutation vector - const PermutationVector perm(2U, 0U, 1U); - - // Permute shapes - TensorShape output_shape = shape; - permute(output_shape, perm); - - // Create tensors - CLTensor ref_src = create_tensor(shape, data_type); - CLTensor dst = create_tensor(output_shape, data_type); - - // Create and Configure function - CLPermute perm_func; - perm_func.configure(&ref_src, &dst, perm); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(output_shape); - validate(dst.info()->valid_region(), valid_region); -} - #ifndef DOXYGEN_SKIP_THIS template diff --git a/tests/validation/CL/Phase.cpp b/tests/validation/CL/Phase.cpp index 71ac66951d..137815040a 100644 --- a/tests/validation/CL/Phase.cpp +++ b/tests/validation/CL/Phase.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -45,38 +45,11 @@ constexpr AbsoluteTolerance tolerance_value(1); TEST_SUITE(CL) TEST_SUITE(Phase) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::S16, DataType::S32 })), - shape, data_type) -{ - // Create tensors - CLTensor src1 = create_tensor(shape, data_type); - CLTensor src2 = create_tensor(shape, data_type); - CLTensor dst = create_tensor(shape, DataType::U8); - - ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - CLPhase phase; - phase.configure(&src1, &src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - validate(src1.info()->padding(), padding); - validate(src2.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - template using CLPhaseFixture = PhaseValidationFixture; TEST_SUITE(S16) -FIXTURE_DATA_TEST_CASE(RunSmall, CLPhaseFixture, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::Small2DShapes(), framework::dataset::make("Format", Format::S16)), +FIXTURE_DATA_TEST_CASE(RunSmall, CLPhaseFixture, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), framework::dataset::make("Format", Format::S16)), framework::dataset::make("PhaseType", { PhaseType::SIGNED, PhaseType::UNSIGNED }))) { // Validate output @@ -92,7 +65,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLPhaseFixture, framework::DatasetMode TEST_SUITE_END() // S16 TEST_SUITE(S32) -FIXTURE_DATA_TEST_CASE(RunSmall, CLPhaseFixture, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::Small2DShapes(), framework::dataset::make("Format", Format::S32)), +FIXTURE_DATA_TEST_CASE(RunSmall, CLPhaseFixture, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), framework::dataset::make("Format", 
Format::S32)), framework::dataset::make("PhaseType", { PhaseType::SIGNED, PhaseType::UNSIGNED }))) { // Validate output diff --git a/tests/validation/CL/PoolingLayer.cpp b/tests/validation/CL/PoolingLayer.cpp index eefad4ab2c..c79775e1e2 100644 --- a/tests/validation/CL/PoolingLayer.cpp +++ b/tests/validation/CL/PoolingLayer.cpp @@ -94,17 +94,15 @@ TEST_SUITE(PoolingLayer) // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Mismatching data type - TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Window shrink TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid pad/size combination TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid pad/size combination TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::QASYMM8), // Invalid parameters TensorInfo(TensorShape(15U, 13U, 5U), 1, DataType::F32), // Non-rectangular Global Pooling TensorInfo(TensorShape(13U, 13U, 5U), 1, DataType::F32), // Invalid output Global Pooling - TensorInfo(TensorShape(13U, 13U, 5U), 1, DataType::QASYMM8), // Invalid exclude_padding = false with quantized type, no actual padding and NHWC + TensorInfo(TensorShape(13U, 13U, 5U), 1, DataType::QASYMM8), TensorInfo(TensorShape(13U, 13U, 5U), 1, DataType::F32), }), framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F16), - TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32), TensorInfo(TensorShape(30U, 11U, 2U), 1, DataType::F32), TensorInfo(TensorShape(25U, 16U, 2U), 1, DataType::F32), TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::QASYMM8), @@ -114,7 +112,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( TensorInfo(TensorShape(1U, 1U, 5U), 1, DataType::F32), })), framework::dataset::make("PoolInfo", { PoolingLayerInfo(PoolingType::AVG, 3, DataLayout::NCHW, PadStrideInfo(1, 1, 0, 0)), - PoolingLayerInfo(PoolingType::AVG, 3, DataLayout::NCHW, PadStrideInfo(1, 1, 0, 0)), PoolingLayerInfo(PoolingType::AVG, 2, DataLayout::NCHW, PadStrideInfo(1, 1, 2, 0)), PoolingLayerInfo(PoolingType::AVG, 2, DataLayout::NCHW, PadStrideInfo(1, 1, 0, 2)), PoolingLayerInfo(PoolingType::L2, 3, DataLayout::NCHW, PadStrideInfo(1, 1, 0, 0)), @@ -123,11 +120,12 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( PoolingLayerInfo(PoolingType::AVG, 2, DataLayout::NHWC, PadStrideInfo(), false), PoolingLayerInfo(PoolingType::AVG, DataLayout::NCHW), })), - framework::dataset::make("Expected", { false, false, false, false, false, true, false, false, true })), + framework::dataset::make("Expected", { false, false, false, false, true, false, true, true })), input_info, output_info, pool_info, expected) { ARM_COMPUTE_EXPECT(bool(CLPoolingLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), pool_info)) == expected, framework::LogLevel::ERRORS); } + // clang-format on // *INDENT-ON* diff --git a/tests/validation/CL/PriorBoxLayer.cpp b/tests/validation/CL/PriorBoxLayer.cpp index c63b093844..780f4796fa 100644 --- a/tests/validation/CL/PriorBoxLayer.cpp +++ b/tests/validation/CL/PriorBoxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * diff --git a/tests/validation/CL/QLSTMLayerNormalization.cpp b/tests/validation/CL/QLSTMLayerNormalization.cpp index a927be17bb..1c7dee4612 100644 --- a/tests/validation/CL/QLSTMLayerNormalization.cpp +++ b/tests/validation/CL/QLSTMLayerNormalization.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" +#include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h" #include "tests/CL/CLAccessor.h" #include "tests/CL/Helper.h" #include "tests/PaddingCalculator.h" diff --git a/tests/validation/CL/QuantizationLayer.cpp b/tests/validation/CL/QuantizationLayer.cpp index 0953688554..335d8df293 100644 --- a/tests/validation/CL/QuantizationLayer.cpp +++ b/tests/validation/CL/QuantizationLayer.cpp @@ -74,29 +74,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip( // clang-format on // *INDENT-ON* -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(QuantizationSmallShapes, framework::dataset::make("DataType", DataType::F32)), shape, data_type) -{ - // Create tensors - CLTensor src = create_tensor(shape, data_type); - CLTensor dst = create_tensor(shape, DataType::QASYMM8); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - CLQuantizationLayer quant_layer; - quant_layer.configure(&src, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(src.info()->valid_region(), valid_region); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - validate(src.info()->padding(), PaddingSize()); - validate(dst.info()->padding(), PaddingSize()); -} - template using CLQuantizationLayerQASYMM8Fixture = QuantizationValidationFixture; template diff --git a/tests/validation/CL/RNNLayer.cpp b/tests/validation/CL/RNNLayer.cpp index 4e67868943..23219bd7b0 100644 --- a/tests/validation/CL/RNNLayer.cpp +++ b/tests/validation/CL/RNNLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/tests/validation/CL/Range.cpp b/tests/validation/CL/Range.cpp index bf81f55e41..c4e0e17aa0 100644 --- a/tests/validation/CL/Range.cpp +++ b/tests/validation/CL/Range.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * diff --git a/tests/validation/CL/ReduceMean.cpp b/tests/validation/CL/ReduceMean.cpp index cb1e38e3ac..947f84af49 100644 --- a/tests/validation/CL/ReduceMean.cpp +++ b/tests/validation/CL/ReduceMean.cpp @@ -25,7 +25,6 @@ #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" #include "arm_compute/runtime/CL/functions/CLReduceMean.h" - #include "tests/CL/CLAccessor.h" #include "tests/datasets/ShapeDatasets.h" #include "tests/datasets/SplitDataset.h" @@ -133,16 +132,33 @@ TEST_SUITE(QASYMM8) FIXTURE_DATA_TEST_CASE(RunSmall, CLReduceMeanQuantizedFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), concat(axis_keep, axis_drop)), framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255, 5) }))) + combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), concat(axis_keep, axis_drop)), + framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.f / 255, 5) })), + framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.f / 255, 5) }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} + +TEST_SUITE(Requant) +FIXTURE_DATA_TEST_CASE(RunSmall, + CLReduceMeanQuantizedFixture, + framework::DatasetMode::PRECOMMIT, + combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), axis_drop), + framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.f / 255, 5) })), + framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.f / 200, 16) }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); } +TEST_SUITE_END() // Requant FIXTURE_DATA_TEST_CASE(RunLarge, CLReduceMeanQuantizedFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), concat(axis_keep, axis_drop)), framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255, 5) }))) + combine(combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), concat(axis_keep, axis_drop)), + framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.f / 255, 5) })), + framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.f / 255, 5) }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); @@ -153,16 +169,33 @@ TEST_SUITE(QASYMM8_SIGNED) FIXTURE_DATA_TEST_CASE(RunSmall, CLReduceMeanQuantizedFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), concat(axis_keep, axis_drop)), framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 102, 2) }))) + combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), concat(axis_keep, axis_drop)), + framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.f / 102, 2) })), + framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.f / 102, 2) }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} + +TEST_SUITE(Requant) +FIXTURE_DATA_TEST_CASE(RunSmall, + CLReduceMeanQuantizedFixture, + framework::DatasetMode::PRECOMMIT, + 
combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), axis_drop), + framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.f / 102, 2) })), + framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.f / 113, 10) }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); } +TEST_SUITE_END() // Requant FIXTURE_DATA_TEST_CASE(RunLarge, CLReduceMeanQuantizedFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), concat(axis_keep, axis_drop)), framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 102, 2) }))) + combine(combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), concat(axis_keep, axis_drop)), + framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.f / 102, 2) })), + framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.f / 102, 2) }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); diff --git a/tests/validation/CL/Remap.cpp b/tests/validation/CL/Remap.cpp index d849d6c1a3..802e611eba 100644 --- a/tests/validation/CL/Remap.cpp +++ b/tests/validation/CL/Remap.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -49,41 +49,6 @@ constexpr float tolerance_number = 0.2f; TEST_SUITE(CL) TEST_SUITE(Remap) - -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })), - framework::dataset::make("DataType", DataType::U8)), - framework::dataset::make("BorderModes", { BorderMode::UNDEFINED, BorderMode::CONSTANT })), - shape, policy, data_type, border_mode) -{ - CLTensor src = create_tensor(shape, data_type); - CLTensor map_x = create_tensor(shape, DataType::F32); - CLTensor map_y = create_tensor(shape, DataType::F32); - CLTensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(map_x.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(map_y.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - CLRemap remap; - remap.configure(&src, &map_x, &map_y, &dst, policy, border_mode); - - // Validate valid region - const ValidRegion dst_valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), dst_valid_region); - - // Validate padding - const int total_right = ceil_to_multiple(shape[0], 4); - const int access_right = total_right + (((total_right - shape[0]) == 0) ? 1 : 0); - - const PaddingSize read_padding(1, access_right - shape[0], 1, 1); - validate(src.info()->padding(), read_padding); - - PaddingCalculator calculator(shape.x(), 4); - validate(dst.info()->padding(), calculator.required_padding()); -} - template using CLRemapFixture = RemapValidationFixture; diff --git a/tests/validation/CL/ReorgLayer.cpp b/tests/validation/CL/ReorgLayer.cpp index 339b368487..d09744ca21 100644 --- a/tests/validation/CL/ReorgLayer.cpp +++ b/tests/validation/CL/ReorgLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. 
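// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: the new Requant cases above
// give CLReduceMean an output QuantizationInfo that differs from the input's,
// so the mean is rescaled on the way out. Shapes and quantization parameters
// here are hypothetical.
// ---------------------------------------------------------------------------
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLReduceMean.h"

void reduce_mean_requant_sketch()
{
    using namespace arm_compute;

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U, 2U), 1, DataType::QASYMM8, QuantizationInfo(1.f / 255, 5)));
    // Reduce over x and y with keep_dims == false, as in the axis_drop dataset;
    // the output carries a different scale/offset than the input
    dst.allocator()->init(TensorInfo(TensorShape(3U, 2U), 1, DataType::QASYMM8, QuantizationInfo(1.f / 200, 16)));

    CLReduceMean reduce_mean;
    reduce_mean.configure(&src, Coordinates(0, 1), /* keep_dims */ false, &dst);
}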
+ * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -67,44 +67,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( // clang-format on // *INDENT-ON* -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallReorgLayerDataset(), - framework::dataset::make("DataType", { DataType::F32, DataType::F16, DataType::QASYMM8 })), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - shape, stride, data_type, data_layout) -{ - // Permute the tensor shape in case of NHWC data layout - TensorShape shape_to_use = shape; - if(data_layout == DataLayout::NHWC) - { - permute(shape_to_use, PermutationVector(2U, 0U, 1U)); - } - - // Create tensors - CLTensor src = create_tensor(shape_to_use, data_type, 1, QuantizationInfo(), data_layout); - CLTensor dst; - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - CLReorgLayer reorg_layer; - - // Auto-initialize the output within the function - reorg_layer.configure(&src, &dst, stride); - - // Validate valid region - const ValidRegion src_valid_region = shape_to_valid_region(shape_to_use); - const ValidRegion dst_valid_region = shape_to_valid_region(dst.info()->tensor_shape()); - validate(src.info()->valid_region(), src_valid_region); - validate(dst.info()->valid_region(), dst_valid_region); - - // Validate padding - const int step = 1; - const PaddingSize src_padding = PaddingCalculator(shape_to_use.x(), step).required_padding(); - const PaddingSize dst_padding = PaddingCalculator(dst.info()->tensor_shape().x(), step).required_padding(); - validate(src.info()->padding(), src_padding); - validate(dst.info()->padding(), dst_padding); -} - template using CLReorgLayerFixture = ReorgLayerValidationFixture; diff --git a/tests/validation/CL/Reverse.cpp b/tests/validation/CL/Reverse.cpp index ed2c6e337a..11df0e7803 100644 --- a/tests/validation/CL/Reverse.cpp +++ b/tests/validation/CL/Reverse.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/tests/validation/CL/Scharr.cpp b/tests/validation/CL/Scharr.cpp index fa6b48f68d..541490b01f 100644 --- a/tests/validation/CL/Scharr.cpp +++ b/tests/validation/CL/Scharr.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -44,57 +44,6 @@ TEST_SUITE(Scharr) TEST_SUITE(W3x3) using CLScharr3x3Fixture = ScharrValidationFixture; -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format", - Format::U8)), - shape, border_mode, format) -{ - // Generate a random constant value - std::mt19937 gen(library->seed()); - std::uniform_int_distribution int_dist(0, 255); - const uint8_t constant_border_value = int_dist(gen); - - // Create tensors - CLTensor src = create_tensor(shape, data_type_from_format(format)); - CLTensor dst_x = create_tensor(shape, DataType::S16); - CLTensor dst_y = create_tensor(shape, DataType::S16); - - src.info()->set_format(format); - dst_x.info()->set_format(Format::S16); - dst_y.info()->set_format(Format::S16); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst_x.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst_y.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create scharr 3x3 configure function - CLScharr3x3 scharr; - scharr.configure(&src, &dst_x, &dst_y, border_mode, constant_border_value); - - // Validate valid region - constexpr BorderSize border_size{ 1 }; - const ValidRegion dst_valid_region = shape_to_valid_region(shape, border_mode == BorderMode::UNDEFINED, border_size); - - validate(dst_x.info()->valid_region(), dst_valid_region); - validate(dst_y.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 8); - - calculator.set_border_mode(border_mode); - calculator.set_border_size(1); - - const PaddingSize dst_padding = calculator.required_padding(); - - calculator.set_accessed_elements(16); - calculator.set_access_offset(-1); - - const PaddingSize src_padding = calculator.required_padding(); - - validate(src.info()->padding(), src_padding); - validate(dst_x.info()->padding(), dst_padding); - validate(dst_y.info()->padding(), dst_padding); -} - FIXTURE_DATA_TEST_CASE(RunSmall, CLScharr3x3Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format", Format::U8)), datasets::GradientDimensions())) diff --git a/tests/validation/CL/Select.cpp b/tests/validation/CL/Select.cpp index 13a8cf4930..3d7c61aab5 100644 --- a/tests/validation/CL/Select.cpp +++ b/tests/validation/CL/Select.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
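// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: the CLSelect API whose
// Configuration cases are removed below. The condition tensor is U8 and, in
// the simplest case, matches the shape of the value tensors. Shapes are
// hypothetical.
// ---------------------------------------------------------------------------
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSelect.h"

void select_sketch()
{
    using namespace arm_compute;

    CLTensor c, x, y, dst;
    c.allocator()->init(TensorInfo(TensorShape(27U, 13U), 1, DataType::U8));
    x.allocator()->init(TensorInfo(TensorShape(27U, 13U), 1, DataType::F32));
    y.allocator()->init(TensorInfo(TensorShape(27U, 13U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(27U, 13U), 1, DataType::F32));

    // Element-wise: dst[i] = c[i] ? x[i] : y[i]
    CLSelect select;
    select.configure(&c, &x, &y, &dst);
}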
* * SPDX-License-Identifier: MIT * @@ -98,37 +98,6 @@ using CLSelectFixture = SelectValidationFixture(detail::select_condition_shape(shape, same_rank), DataType::U8); - CLTensor ref_x = create_tensor(shape, dt); - CLTensor ref_y = create_tensor(shape, dt); - CLTensor dst = create_tensor(shape, dt); - - // Create and Configure function - CLSelect select; - select.configure(&ref_c, &ref_x, &ref_y, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const int step = 16 / arm_compute::data_size_from_type(dt); - const PaddingSize padding = PaddingCalculator(shape.x(), step).required_padding(); - if(same_rank) - { - validate(ref_c.info()->padding(), padding); - } - validate(ref_x.info()->padding(), padding); - validate(ref_y.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - FIXTURE_DATA_TEST_CASE(RunSmall, CLSelectFixture, framework::DatasetMode::PRECOMMIT, @@ -149,37 +118,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge, TEST_SUITE_END() // F16 TEST_SUITE(FP32) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, run_small_dataset, - shape, same_rank) -{ - const DataType dt = DataType::F32; - - // Create tensors - CLTensor ref_c = create_tensor(detail::select_condition_shape(shape, same_rank), DataType::U8); - CLTensor ref_x = create_tensor(shape, dt); - CLTensor ref_y = create_tensor(shape, dt); - CLTensor dst = create_tensor(shape, dt); - - // Create and Configure function - CLSelect select; - select.configure(&ref_c, &ref_x, &ref_y, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const int step = 16 / arm_compute::data_size_from_type(dt); - const PaddingSize padding = PaddingCalculator(shape.x(), step).required_padding(); - if(same_rank) - { - validate(ref_c.info()->padding(), padding); - } - validate(ref_x.info()->padding(), padding); - validate(ref_y.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - FIXTURE_DATA_TEST_CASE(RunSmall, CLSelectFixture, framework::DatasetMode::PRECOMMIT, @@ -202,37 +140,6 @@ TEST_SUITE_END() // Float TEST_SUITE(Quantized) TEST_SUITE(QASYMM8) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, run_small_dataset, - shape, same_rank) -{ - const DataType dt = DataType::QASYMM8; - - // Create tensors - CLTensor ref_c = create_tensor(detail::select_condition_shape(shape, same_rank), DataType::U8); - CLTensor ref_x = create_tensor(shape, dt); - CLTensor ref_y = create_tensor(shape, dt); - CLTensor dst = create_tensor(shape, dt); - - // Create and Configure function - CLSelect select; - select.configure(&ref_c, &ref_x, &ref_y, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const int step = 16 / arm_compute::data_size_from_type(dt); - const PaddingSize padding = PaddingCalculator(shape.x(), step).required_padding(); - if(same_rank) - { - validate(ref_c.info()->padding(), padding); - } - validate(ref_x.info()->padding(), padding); - validate(ref_y.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - FIXTURE_DATA_TEST_CASE(RunSmall, CLSelectFixture, framework::DatasetMode::PRECOMMIT, diff --git a/tests/validation/CL/Slice.cpp b/tests/validation/CL/Slice.cpp index 50b880e656..919ab9d367 100644 --- 
a/tests/validation/CL/Slice.cpp +++ b/tests/validation/CL/Slice.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -63,24 +63,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( // clang-format on // *INDENT-ON* -DATA_TEST_CASE(Configuration, - framework::DatasetMode::ALL, - combine(arm_compute::test::datasets::SmallSliceDataset(), framework::dataset::make("DataType", { DataType::F16, DataType::F32 })), - shape, starts, ends, data_type) -{ - // Create tensors - CLTensor src = create_tensor(shape, data_type); - CLTensor dst; - - // Create and Configure function - CLSlice slice; - slice.configure(&src, &dst, starts, ends); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(dst.info()->tensor_shape()); - validate(dst.info()->valid_region(), valid_region); -} - template using CLSliceFixture = SliceFixture; diff --git a/tests/validation/CL/Sobel.cpp b/tests/validation/CL/Sobel.cpp index 3670003898..725c879c47 100644 --- a/tests/validation/CL/Sobel.cpp +++ b/tests/validation/CL/Sobel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -49,57 +49,6 @@ TEST_SUITE(Sobel) TEST_SUITE(W3x3) using CLSobel3x3Fixture = SobelValidationFixture; -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format", - Format::U8)), - shape, border_mode, format) -{ - // Generate a random constant value - std::mt19937 gen(library->seed()); - std::uniform_int_distribution int_dist(0, 255); - const uint8_t constant_border_value = int_dist(gen); - - // Create tensors - CLTensor src = create_tensor(shape, data_type_from_format(format)); - CLTensor dst_x = create_tensor(shape, DataType::S16); - CLTensor dst_y = create_tensor(shape, DataType::S16); - - src.info()->set_format(format); - dst_x.info()->set_format(Format::S16); - dst_y.info()->set_format(Format::S16); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst_x.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst_y.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create sobel 3x3 configure function - CLSobel3x3 sobel; - sobel.configure(&src, &dst_x, &dst_y, border_mode, constant_border_value); - - // Validate valid region - constexpr BorderSize border_size{ 1 }; - const ValidRegion dst_valid_region = shape_to_valid_region(shape, border_mode == BorderMode::UNDEFINED, border_size); - - validate(dst_x.info()->valid_region(), dst_valid_region); - validate(dst_y.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 8); - - calculator.set_border_mode(border_mode); - calculator.set_border_size(1); - - const PaddingSize dst_padding = calculator.required_padding(); - - calculator.set_accessed_elements(16); - calculator.set_access_offset(-1); - - const PaddingSize src_padding = calculator.required_padding(); - - validate(src.info()->padding(), src_padding); - validate(dst_x.info()->padding(), dst_padding); - validate(dst_y.info()->padding(), dst_padding); -} - TEST_SUITE(X) FIXTURE_DATA_TEST_CASE(RunSmall, CLSobel3x3Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format", Format::U8)), @@ -170,56 +119,6 @@ 
TEST_SUITE_END() TEST_SUITE(W5x5) using CLSobel5x5Fixture = SobelValidationFixture; -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format", - Format::U8)), - shape, border_mode, format) -{ - // Generate a random constant value - std::mt19937 gen(library->seed()); - std::uniform_int_distribution int_dist(0, 255); - const uint8_t constant_border_value = int_dist(gen); - - // Create tensors - CLTensor src = create_tensor(shape, data_type_from_format(format)); - CLTensor dst_x = create_tensor(shape, DataType::S16); - CLTensor dst_y = create_tensor(shape, DataType::S16); - - src.info()->set_format(format); - dst_x.info()->set_format(Format::S16); - dst_y.info()->set_format(Format::S16); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst_x.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst_y.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create sobel 5x5 configure function - CLSobel5x5 sobel; - sobel.configure(&src, &dst_x, &dst_y, border_mode, constant_border_value); - - // Validate valid region - constexpr BorderSize border_size{ 2 }; - const ValidRegion dst_valid_region = shape_to_valid_region(shape, border_mode == BorderMode::UNDEFINED, border_size); - - validate(dst_x.info()->valid_region(), dst_valid_region); - validate(dst_y.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 8); - calculator.set_border_mode(border_mode); - calculator.set_border_size(2); - - const PaddingSize dst_padding = calculator.required_padding(); - - calculator.set_accessed_elements(16); - calculator.set_access_offset(-2); - - const PaddingSize src_padding = calculator.required_padding(); - - validate(src.info()->padding(), src_padding); - validate(dst_x.info()->padding(), dst_padding); - validate(dst_y.info()->padding(), dst_padding); -} - TEST_SUITE(X) FIXTURE_DATA_TEST_CASE(RunSmall, CLSobel5x5Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format", Format::U8)), @@ -288,56 +187,6 @@ TEST_SUITE_END() TEST_SUITE(W7x7) using CLSobel7x7Fixture = SobelValidationFixture; -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format", - Format::U8)), - shape, border_mode, format) -{ - // Generate a random constant value - std::mt19937 gen(library->seed()); - std::uniform_int_distribution int_dist(0, 255); - const uint8_t constant_border_value = int_dist(gen); - - // Create tensors - CLTensor src = create_tensor(shape, data_type_from_format(format)); - CLTensor dst_x = create_tensor(shape, DataType::S32); - CLTensor dst_y = create_tensor(shape, DataType::S32); - - src.info()->set_format(format); - dst_x.info()->set_format(Format::S32); - dst_y.info()->set_format(Format::S32); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst_x.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst_y.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create sobel 7x7 configure function - CLSobel7x7 sobel; - sobel.configure(&src, &dst_x, &dst_y, border_mode, constant_border_value); - - // Validate valid region - constexpr BorderSize border_size{ 3 }; - const ValidRegion dst_valid_region = 
shape_to_valid_region(shape, border_mode == BorderMode::UNDEFINED, border_size); - - validate(dst_x.info()->valid_region(), dst_valid_region); - validate(dst_y.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 8); - - calculator.set_border_mode(border_mode); - calculator.set_border_size(3); - - const PaddingSize dst_padding = calculator.required_padding(); - - calculator.set_accessed_elements(16); - calculator.set_access_offset(-3); - - const PaddingSize src_padding = calculator.required_padding(); - - validate(src.info()->padding(), src_padding); - validate(dst_x.info()->padding(), dst_padding); - validate(dst_y.info()->padding(), dst_padding); -} TEST_SUITE(X) FIXTURE_DATA_TEST_CASE(RunSmall, CLSobel7x7Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format", Format::U8)), diff --git a/tests/validation/CL/SoftmaxLayer.cpp b/tests/validation/CL/SoftmaxLayer.cpp index 90c3058c5d..396e274e0b 100644 --- a/tests/validation/CL/SoftmaxLayer.cpp +++ b/tests/validation/CL/SoftmaxLayer.cpp @@ -21,7 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" @@ -63,42 +62,6 @@ const auto CNNDataTypes = framework::dataset::make("DataType", TEST_SUITE(CL) TEST_SUITE(SoftmaxLayer) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SoftmaxLayerSmallShapes(), CNNDataTypes), shape, data_type) -{ - const QuantizationInfo quantization_info = is_data_type_quantized_asymmetric(data_type) ? 
QuantizationInfo(1.f / 255.f, 0) : QuantizationInfo(); - - // Create tensors - CLTensor src = create_tensor(shape, data_type, 1, quantization_info); - CLTensor dst = create_tensor(shape, data_type, 1, QuantizationInfo(1.f / 256.f, 0)); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - CLSoftmaxLayer smx_layer; - smx_layer.configure(&src, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(src.info()->valid_region(), valid_region); - validate(dst.info()->valid_region(), valid_region); - - // CLLogits1DMaxShiftExpSumKernel configures the paddings only in the 2D case - if(shape.num_dimensions() <= 2) - { - // Get reduction kernel info - CLLogits1DMaxShiftExpSumKernel::ParallelReductionInfo reduction_info = CLLogits1DMaxShiftExpSumKernel::is_parallel_reduction(shape.x()); - - // Validate src padding for 2D softmax - const PaddingSize padding_src = PaddingCalculator(shape.x(), std::get<1>(reduction_info)).required_padding(); - validate(src.info()->padding(), padding_src); - - // Validate dst padding for 2D softmax - const PaddingSize padding_dst = PaddingCalculator(shape.x(), 16).required_padding(); - validate(dst.info()->padding(), padding_dst); - } -} - // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( @@ -106,8 +69,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching shapes TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info QuantizationInfo(1.f/256, 12)), - TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Window shrink - TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),// Invalid input dimensionality TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 12)), @@ -122,16 +83,14 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( TensorInfo(TensorShape(27U, 11U), 1, DataType::F32), TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 12)), - TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), - TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 0)), TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(1.f/256, -128)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8_SIGNED, // Invalid axis high + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(1.f/256, -128)), - TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8_SIGNED, // Invalid axis low + TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(1.f/256, -128)), })), framework::dataset::make("beta", { 1.0, @@ -142,22 +101,18 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( 2.0, 1.0, 2.0, - 1.0, - 2.0, })), framework::dataset::make("axis", { 0, 0, 0, + 1, 0, - 0, - 0, - 0, - 0, - 2, -1, + 2, + -3, })), - framework::dataset::make("Expected", { false, false, false, false, false, true, true, true, false, false })), + framework::dataset::make("Expected", { false, false, false, true, true, true, false, false })), input_info, output_info, beta, axis, 
expected) { ARM_COMPUTE_EXPECT(bool(CLSoftmaxLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), beta, axis)) == expected, framework::LogLevel::ERRORS); @@ -173,7 +128,7 @@ TEST_SUITE(FP16) FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) + framework::dataset::make("Axis", { 0, -1 }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16); @@ -189,7 +144,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixture, framework::Dataset FIXTURE_DATA_TEST_CASE(Run4D, CLSoftmaxLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayer4DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) + framework::dataset::make("Axis", { 0, -1, 2 }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16); @@ -200,7 +155,7 @@ TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) + framework::dataset::make("Axis", { 0, 1 }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); @@ -216,7 +171,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixture, framework::Datase FIXTURE_DATA_TEST_CASE(Run4D, CLSoftmaxLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayer4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) + framework::dataset::make("Axis", { 0, -2, 3 }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); @@ -233,7 +188,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerQuantizedFixture, framew framework::dataset::make("DataType", DataType::QASYMM8)), combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0 }))) + framework::dataset::make("Axis", { 0, 1 }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); @@ -251,7 +206,7 @@ FIXTURE_DATA_TEST_CASE(Run4D, CLSoftmaxLayerQuantizedFixture, framework framework::dataset::make("DataType", DataType::QASYMM8)), combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), framework::dataset::make("Beta", { 1.0f, 2.0f }))), - framework::dataset::make("Axis", { 0 }))) + framework::dataset::make("Axis", { 0, -4, 1 }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_qasymm8); @@ -265,7 +220,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerQuantizedFixture, framewo framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0 }))) + framework::dataset::make("Axis", { 0, 1 }))) { // Validate output validate(CLAccessor(_target), _reference, 
tolerance_qasymm8_signed); diff --git a/tests/validation/CL/SpaceToBatchLayer.cpp b/tests/validation/CL/SpaceToBatchLayer.cpp index b2339399a3..971312e379 100644 --- a/tests/validation/CL/SpaceToBatchLayer.cpp +++ b/tests/validation/CL/SpaceToBatchLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/tests/validation/CL/SpaceToDepthLayer.cpp b/tests/validation/CL/SpaceToDepthLayer.cpp index 25b4bcd70c..b9e767fb65 100644 --- a/tests/validation/CL/SpaceToDepthLayer.cpp +++ b/tests/validation/CL/SpaceToDepthLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/tests/validation/CL/Split.cpp b/tests/validation/CL/Split.cpp index 99110ffa79..7e6c6f9e92 100644 --- a/tests/validation/CL/Split.cpp +++ b/tests/validation/CL/Split.cpp @@ -91,66 +91,6 @@ DATA_TEST_CASE(ValidateSplitShapes, framework::DatasetMode::ALL, zip(zip(zip( // clang-format on // *INDENT-ON* -DATA_TEST_CASE(Configuration, - framework::DatasetMode::ALL, - combine(datasets::SmallSplitDataset(), framework::dataset::make("DataType", { DataType::F16, DataType::F32 })), - shape, axis, splits, data_type) -{ - // Create tensors - CLTensor src = create_tensor(shape, data_type); - std::vector dsts(splits); - std::vector dsts_ptrs; - dsts_ptrs.reserve(splits); - for(auto &dst : dsts) - { - dsts_ptrs.emplace_back(&dst); - } - - // Create and Configure function - CLSplit split; - split.configure(&src, dsts_ptrs, axis); - - // Validate valid regions - for(auto &dst : dsts) - { - const ValidRegion valid_region = shape_to_valid_region(dst.info()->tensor_shape()); - validate(dst.info()->valid_region(), valid_region); - } -} - -DATA_TEST_CASE(ConfigurationSplitShapes, - framework::DatasetMode::ALL, - combine(datasets::SmallSplitShapesDataset(), framework::dataset::make("DataType", { DataType::F16, DataType::F32 })), - shape, axis, split_shapes, data_type) -{ - // Create tensors - CLTensor src = create_tensor(shape, data_type); - std::vector dsts; - - for(const auto &split_shape : split_shapes) - { - CLTensor dst = create_tensor(split_shape, data_type); - dsts.push_back(std::move(dst)); - } - - std::vector dsts_ptrs; - for(auto &dst : dsts) - { - dsts_ptrs.emplace_back(&dst); - } - - // Create and Configure function - CLSplit split; - split.configure(&src, dsts_ptrs, axis); - - // Validate valid regions - for(auto &dst : dsts) - { - const ValidRegion valid_region = shape_to_valid_region(dst.info()->tensor_shape()); - validate(dst.info()->valid_region(), valid_region); - } -} - template using CLSplitFixture = SplitFixture; @@ -182,6 +122,18 @@ FIXTURE_DATA_TEST_CASE(RunLarge, validate(CLAccessor(_target[i]), _reference[i]); } } + +FIXTURE_DATA_TEST_CASE(RunSmallSplitShapes, + CLSplitShapesFixture, + framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallSplitShapesDataset(), framework::dataset::make("DataType", DataType::F16))) +{ + // Validate outputs + for(unsigned int i = 0; i < _target.size(); ++i) + { + validate(CLAccessor(_target[i]), _reference[i]); + } +} TEST_SUITE_END() // FP16 TEST_SUITE(FP32) diff --git a/tests/validation/CL/StridedSlice.cpp b/tests/validation/CL/StridedSlice.cpp index 9bfad55404..1fcdf3ed29 100644 --- a/tests/validation/CL/StridedSlice.cpp +++ b/tests/validation/CL/StridedSlice.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
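The new RunSmallSplitShapes fixture above exercises the same configure() overload that the deleted ConfigurationSplitShapes case covered: CLSplit writing into outputs of unequal, explicitly created shapes. A condensed sketch of that call (shapes and axis are hypothetical; the vector element type is reconstructed as ICLTensor*):

    // Sketch only: split 8 channels into 5 + 3 along axis 2.
    CLTensor src  = create_tensor<CLTensor>(TensorShape(16U, 16U, 8U), DataType::F32);
    CLTensor out0 = create_tensor<CLTensor>(TensorShape(16U, 16U, 5U), DataType::F32);
    CLTensor out1 = create_tensor<CLTensor>(TensorShape(16U, 16U, 3U), DataType::F32);

    std::vector<ICLTensor *> outputs{ &out0, &out1 };
    CLSplit split;
    split.configure(&src, outputs, 2U);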
* * SPDX-License-Identifier: MIT * @@ -65,24 +65,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( // clang-format on // *INDENT-ON* -DATA_TEST_CASE(Configuration, - framework::DatasetMode::ALL, - combine(arm_compute::test::datasets::SmallStridedSliceDataset(), framework::dataset::make("DataType", { DataType::F16, DataType::F32 })), - shape, starts, ends, strides, begin_mask, end_mask, shrink_mask, data_type) -{ - // Create tensors - CLTensor src = create_tensor(shape, data_type); - CLTensor dst; - - // Create and Configure function - CLStridedSlice strided_slice; - strided_slice.configure(&src, &dst, starts, ends, strides, begin_mask, end_mask, shrink_mask); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(dst.info()->tensor_shape()); - validate(dst.info()->valid_region(), valid_region); -} - template using CLStridedSliceFixture = StridedSliceFixture; diff --git a/tests/validation/CL/TableLookup.cpp b/tests/validation/CL/TableLookup.cpp index b611ef6bd9..f435c60c13 100644 --- a/tests/validation/CL/TableLookup.cpp +++ b/tests/validation/CL/TableLookup.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,7 +26,6 @@ #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" #include "arm_compute/runtime/CL/functions/CLTableLookup.h" - #include "tests/CL/CLAccessor.h" #include "tests/CL/CLLutAccessor.h" #include "tests/PaddingCalculator.h" @@ -34,7 +33,6 @@ #include "tests/framework/Asserts.h" #include "tests/framework/Macros.h" #include "tests/framework/datasets/Datasets.h" - #include "tests/validation/Helpers.h" #include "tests/validation/Validation.h" #include "tests/validation/fixtures/TableLookupFixture.h" @@ -51,42 +49,7 @@ TEST_SUITE(TableLookup) template using CLTableLookupFixture = TableLookupValidationFixture, CLLut, T>; TEST_SUITE(U8) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::U8, DataType::S16 })), - shape, data_type) -{ - // Create Lut - const int num_elem = (data_type == DataType::U8) ? 
std::numeric_limits::max() + 1 : std::numeric_limits::max() - std::numeric_limits::lowest() + 1; - CLLut cllut(num_elem, data_type); - - switch(data_type) - { - case DataType::U8: - fill_lookuptable(CLLutAccessor(cllut)); - break; - case DataType::S16: - fill_lookuptable(CLLutAccessor(cllut)); - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - } - // Create tensors - CLTensor src = create_tensor(shape, data_type); - CLTensor dst = create_tensor(shape, data_type); - - // Create and Configure function - CLTableLookup table_lookup; - table_lookup.configure(&src, &cllut, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 8).required_padding(); - validate(src.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} FIXTURE_DATA_TEST_CASE(RunSmallU8, CLTableLookupFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8))) { // Validate output diff --git a/tests/validation/CL/Threshold.cpp b/tests/validation/CL/Threshold.cpp index 215565ea6c..d5346d3d1f 100644 --- a/tests/validation/CL/Threshold.cpp +++ b/tests/validation/CL/Threshold.cpp @@ -39,31 +39,6 @@ namespace validation TEST_SUITE(CL) TEST_SUITE(Threshold) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), datasets::MixedThresholdDataset()), - framework::dataset::make("DataType", DataType::U8)), - shape, threshold, false_value, true_value, type, upper, data_type) -{ - // Create tensors - CLTensor src = create_tensor(shape, data_type); - CLTensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - CLThreshold thrsh; - thrsh.configure(&src, &dst, ThresholdKernelInfo(threshold, false_value, true_value, type, upper)); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - validate(src.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - template using CLThresholdFixture = ThresholdValidationFixture; diff --git a/tests/validation/CL/Tile.cpp b/tests/validation/CL/Tile.cpp index 73f4aa82a2..a06c05744f 100644 --- a/tests/validation/CL/Tile.cpp +++ b/tests/validation/CL/Tile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/tests/validation/CL/Transpose.cpp b/tests/validation/CL/Transpose.cpp index 3a1a27d8e4..876bf29dd5 100644 --- a/tests/validation/CL/Transpose.cpp +++ b/tests/validation/CL/Transpose.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
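The deleted TableLookup Configuration case was the one place the LUT setup was spelled out, so it is worth recording: the LUT holds one entry per representable input value. A condensed sketch of the U8 path (template arguments reconstructed, shape hypothetical):

    // Sketch only: 256 entries cover every U8 input value.
    const int num_elem = std::numeric_limits<uint8_t>::max() + 1;
    CLLut     cllut(num_elem, DataType::U8);
    fill_lookuptable(CLLutAccessor<uint8_t>(cllut));

    CLTensor src = create_tensor<CLTensor>(TensorShape(64U, 64U), DataType::U8);
    CLTensor dst = create_tensor<CLTensor>(TensorShape(64U, 64U), DataType::U8);

    CLTableLookup table_lookup;
    table_lookup.configure(&src, &cllut, &dst);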
* * SPDX-License-Identifier: MIT * @@ -68,27 +68,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip( // clang-format on // *INDENT-ON* -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), framework::dataset::make("DataType", { DataType::S8, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32 })), - shape, data_type) -{ - // Make rows the columns of the original shape - TensorShape output_shape{ shape[1], shape[0] }; - - // Create tensors - CLTensor ref_src = create_tensor(shape, data_type); - CLTensor dst = create_tensor(output_shape, data_type); - - // Create and Configure function - CLTranspose trans; - trans.configure(&ref_src, &dst); - - // Validate dst region - const ValidRegion valid_region = shape_to_valid_region(output_shape); - validate(dst.info()->valid_region(), valid_region); - - // TODO(bsgcomp): Add padding validation (COMPMID-659) -} - template using CLTransposeFixture = TransposeValidationFixture; diff --git a/tests/validation/CL/UNIT/DynamicTensor.cpp b/tests/validation/CL/UNIT/DynamicTensor.cpp index b6302846a7..833256039e 100644 --- a/tests/validation/CL/UNIT/DynamicTensor.cpp +++ b/tests/validation/CL/UNIT/DynamicTensor.cpp @@ -28,6 +28,12 @@ #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/MemoryManagerOnDemand.h" #include "arm_compute/runtime/PoolManager.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" +#include "src/core/CL/kernels/CLIm2ColKernel.h" +#include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h" +#include "src/core/CL/kernels/CLReductionOperationKernel.h" +#include "src/core/CL/kernels/CLWeightsReshapeKernel.h" #include "tests/AssetsLibrary.h" #include "tests/CL/CLAccessor.h" #include "tests/Globals.h" diff --git a/tests/validation/CL/UNIT/Tuner.cpp b/tests/validation/CL/UNIT/Tuner.cpp index ee5c76ce5f..cf2513bf2c 100644 --- a/tests/validation/CL/UNIT/Tuner.cpp +++ b/tests/validation/CL/UNIT/Tuner.cpp @@ -21,10 +21,10 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/tuners/BifrostTuner.h" +#include "src/core/CL/kernels/CLDirectConvolutionLayerKernel.h" #include "tests/Utils.h" #include "tests/framework/Asserts.h" #include "tests/framework/Macros.h" diff --git a/tests/validation/CL/UNIT/WeightsRetention.cpp b/tests/validation/CL/UNIT/WeightsRetention.cpp index 7234e47642..acf795e48b 100644 --- a/tests/validation/CL/UNIT/WeightsRetention.cpp +++ b/tests/validation/CL/UNIT/WeightsRetention.cpp @@ -22,6 +22,18 @@ * SOFTWARE. 
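The DynamicTensor and Tuner hunks above show the include migration that recurs through the rest of this patch: kernel headers have left the public arm_compute/core interface, so tests that still reference kernels pull them from the internal src/core tree. The header name is unchanged; only the root moves:

    // Before: kernel header exposed through the public API
    // #include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
    // After: same header, now internal to the library
    #include "src/core/CL/kernels/CLDirectConvolutionLayerKernel.h"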
*/ #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" +#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h" +#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" +#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" +#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h" #include "tests/AssetsLibrary.h" #include "tests/CL/CLAccessor.h" #include "tests/Globals.h" diff --git a/tests/validation/CL/WarpAffine.cpp b/tests/validation/CL/WarpAffine.cpp index 7779761f1f..2dacb9fe98 100644 --- a/tests/validation/CL/WarpAffine.cpp +++ b/tests/validation/CL/WarpAffine.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -54,47 +54,6 @@ constexpr AbsoluteTolerance tolerance(1); TEST_SUITE(CL) TEST_SUITE(WarpAffine) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), framework::dataset::make("DataType", DataType::U8)), - framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })), - datasets::BorderModes()), - shape, data_type, policy, border_mode) -{ - // Generate a random constant value if border_mode is constant - std::mt19937 gen(library->seed()); - std::uniform_int_distribution distribution_u8(0, 255); - uint8_t constant_border_value = distribution_u8(gen); - - // Create the matrix - std::array matrix{ {} }; - fill_warp_matrix<9>(matrix); - - // Create tensors - CLTensor src = create_tensor(shape, data_type); - CLTensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - CLWarpAffine warp_affine; - warp_affine.configure(&src, &dst, matrix, policy, border_mode, constant_border_value); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - - validate(src.info()->valid_region(), valid_region); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - int total_right = ceil_to_multiple(shape[0], 4); - const int access_right = total_right + (((total_right - shape[0]) == 0) ? 1 : 0); - const PaddingSize read_padding(1, access_right - shape[0], 1, 1); - validate(src.info()->padding(), read_padding); - - PaddingCalculator calculator(shape.x(), 4); - validate(dst.info()->padding(), calculator.required_padding()); -} - template using CLWarpAffineFixture = WarpAffineValidationFixture; diff --git a/tests/validation/CL/WarpPerspective.cpp b/tests/validation/CL/WarpPerspective.cpp index 4b975057fc..b934b70184 100644 --- a/tests/validation/CL/WarpPerspective.cpp +++ b/tests/validation/CL/WarpPerspective.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -50,53 +50,6 @@ constexpr float tolerance_number = 0.2f; TEST_SUITE(CL) TEST_SUITE(WarpPerspective) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), framework::dataset::make("DataType", DataType::U8)), - framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })), - datasets::BorderModes()), - shape, data_type, policy, border_mode) -{ - uint8_t constant_border_value = 0; - - // Generate a random constant value if border_mode is constant - if(border_mode == BorderMode::CONSTANT) - { - std::mt19937 gen(library->seed()); - std::uniform_int_distribution distribution_u8(0, 255); - constant_border_value = distribution_u8(gen); - } - - // Create the matrix - std::array matrix = { { 0 } }; - fill_warp_matrix<9>(matrix); - - // Create tensors - CLTensor src = create_tensor(shape, data_type); - CLTensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - CLWarpPerspective warp_perspective; - warp_perspective.configure(&src, &dst, matrix, policy, border_mode, constant_border_value); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - - validate(src.info()->valid_region(), valid_region); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 4); - calculator.set_border_mode(border_mode); - - const PaddingSize read_padding(1); - const PaddingSize write_padding = calculator.required_padding(PaddingCalculator::Option::EXCLUDE_BORDER); - - validate(src.info()->padding(), read_padding); - validate(dst.info()->padding(), write_padding); -} - template using CLWarpPerspectiveFixture = WarpPerspectiveValidationFixture; diff --git a/tests/validation/CL/WeightsReshape.cpp b/tests/validation/CL/WeightsReshape.cpp index 3e7ecc3408..d04c10cee2 100644 --- a/tests/validation/CL/WeightsReshape.cpp +++ b/tests/validation/CL/WeightsReshape.cpp @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/CL/kernels/CLWeightsReshapeKernel.h" #include "tests/CL/CLAccessor.h" #include "tests/CL/Helper.h" #include "tests/datasets/ShapeDatasets.h" diff --git a/tests/validation/CL/WidthConcatenateLayer.cpp b/tests/validation/CL/WidthConcatenateLayer.cpp index 408fe148d7..ded1f29ce3 100644 --- a/tests/validation/CL/WidthConcatenateLayer.cpp +++ b/tests/validation/CL/WidthConcatenateLayer.cpp @@ -84,28 +84,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( // clang-format on // *INDENT-ON* -TEST_CASE(Configuration, framework::DatasetMode::ALL) -{ - // Create tensors - CLTensor src1 = create_tensor(TensorShape(128U, 32U, 32U), DataType::F32, 1); - CLTensor src2 = create_tensor(TensorShape(32U, 32U, 32U), DataType::F32, 1); - CLTensor src3 = create_tensor(TensorShape(15U, 32U, 32U), DataType::F32, 1); - CLTensor dst; - - ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(src3.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - CLConcatenateLayer concat_layer; - std::vector inputs; - inputs.emplace_back(&src1); - inputs.emplace_back(&src2); - inputs.emplace_back(&src3); - concat_layer.configure(inputs, &dst, 0); -} - template using CLWidthConcatenateLayerFixture = ConcatenateLayerValidationFixture; diff --git a/tests/validation/CL/Winograd.cpp b/tests/validation/CL/Winograd.cpp index 771acf9461..750799ace2 100644 --- a/tests/validation/CL/Winograd.cpp +++ b/tests/validation/CL/Winograd.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,6 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
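The deleted WidthConcatenateLayer Configuration case documented the vector-of-inputs configure() overload; a condensed sketch with the same shapes (the vector element type is reconstructed as const ICLTensor*):

    // Sketch only: concatenate three tensors along axis 0 (width): 128 + 32 + 15 = 175.
    CLTensor src1 = create_tensor<CLTensor>(TensorShape(128U, 32U, 32U), DataType::F32, 1);
    CLTensor src2 = create_tensor<CLTensor>(TensorShape(32U, 32U, 32U), DataType::F32, 1);
    CLTensor src3 = create_tensor<CLTensor>(TensorShape(15U, 32U, 32U), DataType::F32, 1);
    CLTensor dst;

    std::vector<const ICLTensor *> inputs{ &src1, &src2, &src3 };
    CLConcatenateLayer concat_layer;
    concat_layer.configure(inputs, &dst, 0);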
*/ -#include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h" -#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" @@ -30,6 +28,8 @@ #include "arm_compute/runtime/CL/CLTensorAllocator.h" #include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h" #include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h" +#include "src/core/CL/kernels/CLWinogradFilterTransformKernel.h" +#include "src/core/CL/kernels/CLWinogradOutputTransformKernel.h" #include "tests/CL/CLAccessor.h" #include "tests/CL/Helper.h" #include "tests/PaddingCalculator.h" @@ -182,6 +182,7 @@ const auto ActivationFunctionsSmallDataset = framework::dataset::make("Activatio ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::SOFT_RELU) }); + } // namespace using namespace arm_compute::misc::shape_calculator; @@ -190,6 +191,7 @@ TEST_SUITE(CL) TEST_SUITE(Winograd) TEST_SUITE(InputTransform) + DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( framework::dataset::make("InputInfo",{ TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::F16), // F16 not supported diff --git a/tests/validation/CL/YOLOLayer.cpp b/tests/validation/CL/YOLOLayer.cpp index f28082b74b..95c18d3d95 100644 --- a/tests/validation/CL/YOLOLayer.cpp +++ b/tests/validation/CL/YOLOLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/tests/validation/CPP/Permute.cpp b/tests/validation/CPP/Permute.cpp index f0f5346aa7..9495fa738e 100644 --- a/tests/validation/CPP/Permute.cpp +++ b/tests/validation/CPP/Permute.cpp @@ -58,29 +58,6 @@ const auto PermuteParametersLarge = datasets::Large4DShapes() * PermuteVectors; TEST_SUITE(CPP) TEST_SUITE(Permute) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::Small4DShapes(), framework::dataset::make("DataType", { DataType::S8, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32, DataType::QASYMM8_SIGNED })), - shape, data_type) -{ - // Define permutation vector - const PermutationVector perm(2U, 0U, 1U); - - // Permute shapes - TensorShape output_shape = shape; - permute(output_shape, perm); - - // Create tensors - Tensor ref_src = create_tensor(shape, data_type); - Tensor dst = create_tensor(output_shape, data_type); - - // Create and Configure function - CPPPermute perm_func; - perm_func.configure(&ref_src, &dst, perm); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(output_shape); - validate(dst.info()->valid_region(), valid_region); -} - template using CPPPermuteFixture = PermuteValidationFixture; diff --git a/tests/validation/GLES_COMPUTE/ActivationLayer.cpp b/tests/validation/GLES_COMPUTE/ActivationLayer.cpp index 21384c4721..63c0423e7a 100644 --- a/tests/validation/GLES_COMPUTE/ActivationLayer.cpp +++ b/tests/validation/GLES_COMPUTE/ActivationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
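The deleted CPP Permute Configuration case is a compact illustration of PermutationVector semantics as the tests use them: output dimension i takes input dimension perm[i]. A sketch on a hypothetical (4, 8, 2) shape (create_tensor template argument reconstructed; the dimension mapping is my reading of the deleted code, not a statement of the library's internals):

    // Sketch only: perm = (2, 0, 1), so (W=4, H=8, C=2) becomes (2, 4, 8).
    const PermutationVector perm(2U, 0U, 1U);
    TensorShape output_shape{ 4U, 8U, 2U };
    permute(output_shape, perm);

    Tensor src = create_tensor<Tensor>(TensorShape(4U, 8U, 2U), DataType::F32);
    Tensor dst = create_tensor<Tensor>(output_shape, DataType::F32);

    CPPPermute perm_func;
    perm_func.configure(&src, &dst, perm);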
* * SPDX-License-Identifier: MIT * @@ -88,49 +88,6 @@ const auto ActivationDataset = combine(combine(framework::dataset::make("InPlace TEST_SUITE(GC) TEST_SUITE(ActivationLayer) - -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), CNNDataTypes), framework::dataset::make("InPlace", { false, true })), - shape, data_type, in_place) -{ - // Create tensors - GCTensor src = create_tensor(shape, data_type, 1); - GCTensor dst = create_tensor(shape, data_type, 1); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - GCActivationLayer act_layer; - - if(in_place) - { - act_layer.configure(&src, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ABS)); - } - else - { - act_layer.configure(&src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ABS)); - } - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(src.info()->valid_region(), valid_region); - - if(!in_place) - { - validate(dst.info()->valid_region(), valid_region); - } - - // Validate padding - const int step = (arm_compute::data_size_from_type(data_type) == 4 ? 1 : 2); - const PaddingSize padding = PaddingCalculator(shape.x(), step).required_padding(); - validate(src.info()->padding(), padding); - - if(!in_place) - { - validate(dst.info()->padding(), padding); - } -} - template using GCActivationLayerFixture = ActivationValidationFixture; diff --git a/tests/validation/GLES_COMPUTE/BatchNormalizationLayer.cpp b/tests/validation/GLES_COMPUTE/BatchNormalizationLayer.cpp index 474dd02b3d..0866dcd0aa 100644 --- a/tests/validation/GLES_COMPUTE/BatchNormalizationLayer.cpp +++ b/tests/validation/GLES_COMPUTE/BatchNormalizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -67,37 +67,6 @@ TEST_SUITE(BatchNormalizationLayer) template using GCBatchNormalizationLayerFixture = BatchNormalizationLayerValidationFixture; -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallRandomBatchNormalizationLayerDataset(), - data_GB), - framework::dataset::make("DataType", { DataType::F32 })), - framework::dataset::make("DataLayout", { DataLayout::NCHW })), - shape0, shape1, epsilon, use_beta, use_gamma, dt, data_layout) -{ - TensorShape src_dst_shapes = shape0; - if(data_layout == DataLayout::NHWC) - { - permute(src_dst_shapes, PermutationVector(2U, 0U, 1U)); - } - - // Create tensors - GCTensor src = create_tensor(src_dst_shapes, dt, 1, QuantizationInfo(), data_layout); - GCTensor dst = create_tensor(src_dst_shapes, dt, 1, QuantizationInfo(), data_layout); - GCTensor mean = create_tensor(shape1, dt, 1); - GCTensor var = create_tensor(shape1, dt, 1); - GCTensor beta = create_tensor(shape1, dt, 1); - GCTensor gamma = create_tensor(shape1, dt, 1); - - // Create and Configure function - GCBatchNormalizationLayer norm; - GCTensor *beta_ptr = use_beta ? &beta : nullptr; - GCTensor *gamma_ptr = use_gamma ? 
&gamma : nullptr; - norm.configure(&src, &dst, &mean, &var, beta_ptr, gamma_ptr, epsilon); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(src_dst_shapes); - validate(dst.info()->valid_region(), valid_region); -} - TEST_SUITE(Float) TEST_SUITE(FP16) FIXTURE_DATA_TEST_CASE(Random, GCBatchNormalizationLayerFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallRandomBatchNormalizationLayerDataset(), data_f16)) diff --git a/tests/validation/GLES_COMPUTE/DirectConvolutionLayer.cpp b/tests/validation/GLES_COMPUTE/DirectConvolutionLayer.cpp index ee95a0145e..92d926c1a0 100644 --- a/tests/validation/GLES_COMPUTE/DirectConvolutionLayer.cpp +++ b/tests/validation/GLES_COMPUTE/DirectConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -80,8 +80,6 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo TEST_SUITE(GC) TEST_SUITE(DirectConvolutionLayer) -//TODO(COMPMID-415): Configuration tests? - template using GCDirectConvolutionLayerFixture = DirectConvolutionValidationFixture; diff --git a/tests/validation/GLES_COMPUTE/FullyConnectedLayer.cpp b/tests/validation/GLES_COMPUTE/FullyConnectedLayer.cpp index 53f63ce923..55c214f338 100644 --- a/tests/validation/GLES_COMPUTE/FullyConnectedLayer.cpp +++ b/tests/validation/GLES_COMPUTE/FullyConnectedLayer.cpp @@ -63,46 +63,6 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo TEST_SUITE(GC) TEST_SUITE(FullyConnectedLayer) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(framework::dataset::concat(datasets::SmallFullyConnectedLayerDataset(), datasets::LargeFullyConnectedLayerDataset()), - FullyConnectedParameters), - CNNDataTypes), - src_shape, weights_shape, bias_shape, dst_shape, transpose_weights, reshape_weights, data_type) -{ - TensorShape ws(weights_shape); - - // Transpose weights if not done in the function - if(!reshape_weights || !transpose_weights) - { - const size_t shape_x = ws.x(); - ws.set(0, ws.y()); - ws.set(1, shape_x); - } - - // Create tensors - GCTensor src = create_tensor(src_shape, data_type, 1); - GCTensor weights = create_tensor(ws, data_type, 1); - GCTensor bias = create_tensor(bias_shape, data_type, 1); - GCTensor dst = create_tensor(dst_shape, data_type, 1); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create Fully Connected layer info - FullyConnectedLayerInfo fc_info; - fc_info.transpose_weights = transpose_weights; - fc_info.are_weights_reshaped = !reshape_weights; - - // Create and configure function. - GCFullyConnectedLayer fc; - fc.configure(&src, &weights, &bias, &dst, fc_info); - - // Validate valid region - const ValidRegion dst_valid_region = shape_to_valid_region(dst_shape); - validate(dst.info()->valid_region(), dst_valid_region); -} - template using GCFullyConnectedLayerFixture = FullyConnectedLayerValidationFixture; diff --git a/tests/validation/GLES_COMPUTE/GEMM.cpp b/tests/validation/GLES_COMPUTE/GEMM.cpp index 13af521fda..7bcb2a9d1a 100644 --- a/tests/validation/GLES_COMPUTE/GEMM.cpp +++ b/tests/validation/GLES_COMPUTE/GEMM.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. 
+ * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -56,27 +56,6 @@ const auto CNNDataTypes = framework::dataset::make("DataType", TEST_SUITE(GC) TEST_SUITE(GEMM) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallGEMMDataset(), datasets::LargeGEMMDataset()), CNNDataTypes), - shape_a, shape_b, shape_c, output_shape, alpha, beta, data_type) -{ - // Create tensors - GCTensor a = create_tensor(shape_a, data_type, 1); - GCTensor b = create_tensor(shape_b, data_type, 1); - GCTensor c = create_tensor(shape_c, data_type, 1); - GCTensor dst = create_tensor(output_shape, data_type, 1); - - ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - GCGEMM gemm; - gemm.configure(&a, &b, &c, &dst, alpha, beta); - - //TODO(COMPMID-415): Validate valid region -} - template using GCGEMMFixture = GEMMValidationFixture; diff --git a/tests/validation/GLES_COMPUTE/NormalizePlanarYUVLayer.cpp b/tests/validation/GLES_COMPUTE/NormalizePlanarYUVLayer.cpp index ed6b5f0ecf..98b0ee5ade 100644 --- a/tests/validation/GLES_COMPUTE/NormalizePlanarYUVLayer.cpp +++ b/tests/validation/GLES_COMPUTE/NormalizePlanarYUVLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -52,24 +52,6 @@ TEST_SUITE(NormalizePlanarYUVLayer) template using GCNormalizePlanarYUVLayerFixture = NormalizePlanarYUVLayerValidationFixture; -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::RandomNormalizePlanarYUVLayerDataset(), framework::dataset::make("DataType", { DataType::F16 })), - shape0, shape1, dt) -{ - // Create tensors - GCTensor src = create_tensor(shape0, dt, 1); - GCTensor dst = create_tensor(shape0, dt, 1); - GCTensor mean = create_tensor(shape1, dt, 1); - GCTensor sd = create_tensor(shape1, dt, 1); - - // Create and Configure function - GCNormalizePlanarYUVLayer norm; - norm.configure(&src, &dst, &mean, &sd); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape0); - validate(dst.info()->valid_region(), valid_region); -} - // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( diff --git a/tests/validation/GLES_COMPUTE/Scale.cpp b/tests/validation/GLES_COMPUTE/Scale.cpp index 5e7f39f284..a8859d6f00 100644 --- a/tests/validation/GLES_COMPUTE/Scale.cpp +++ b/tests/validation/GLES_COMPUTE/Scale.cpp @@ -65,50 +65,6 @@ RelativeTolerance tolerance_f16(half(0.1)); TEST_SUITE(GC) TEST_SUITE(Scale) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(combine(concat(datasets::MediumShapes(), datasets::LargeShapes()), ScaleDataTypes), - framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR })), - datasets::BorderModes()), - datasets::SamplingPolicies()), - shape, data_type, policy, border_mode, sampling_policy) -{ - std::mt19937 generator(library->seed()); - std::uniform_real_distribution distribution_float(0.25, 2); - const float scale_x = distribution_float(generator); - const float scale_y = distribution_float(generator); - std::uniform_int_distribution distribution_u8(0, 255); - uint8_t constant_border_value 
= distribution_u8(generator); - - // Create tensors - GCTensor src = create_tensor(shape, data_type); - TensorShape shape_scaled(shape); - shape_scaled.set(0, shape[0] * scale_x); - shape_scaled.set(1, shape[1] * scale_y); - GCTensor dst = create_tensor(shape_scaled, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - GCScale gcscale; - gcscale.configure(&src, &dst, ScaleKernelInfo{ policy, border_mode, constant_border_value, sampling_policy }); - - // Get border size depending on border mode - const BorderSize border_size(border_mode == BorderMode::UNDEFINED ? 0 : 1); - - // Validate valid region - const ValidRegion dst_valid_region = calculate_valid_region_scale(*(src.info()), shape_scaled, policy, sampling_policy, (border_mode == BorderMode::UNDEFINED)); - validate(dst.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape_scaled.x(), 4); - calculator.set_border_mode(border_mode); - - //const PaddingSize read_padding(border_size); - const PaddingSize write_padding = calculator.required_padding(PaddingCalculator::Option::EXCLUDE_BORDER); - //validate(src.info()->padding(), read_padding); - validate(dst.info()->padding(), write_padding); -} - template using GCScaleFixture = ScaleValidationFixture; diff --git a/tests/validation/GLES_COMPUTE/SoftmaxLayer.cpp b/tests/validation/GLES_COMPUTE/SoftmaxLayer.cpp index af92cff813..863ce7b999 100644 --- a/tests/validation/GLES_COMPUTE/SoftmaxLayer.cpp +++ b/tests/validation/GLES_COMPUTE/SoftmaxLayer.cpp @@ -57,30 +57,6 @@ const auto CNNDataTypes = framework::dataset::make("DataType", TEST_SUITE(GC) TEST_SUITE(SoftmaxLayer) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(concat(datasets::SoftmaxLayerSmallShapes(), datasets::SoftmaxLayerLargeShapes()), CNNDataTypes), shape, data_type) -{ - // Create tensors - GCTensor src = create_tensor(shape, data_type, 1); - GCTensor dst = create_tensor(shape, data_type, 1); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - GCSoftmaxLayer smx_layer; - smx_layer.configure(&src, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(src.info()->valid_region(), valid_region); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 8).required_padding(); - validate(src.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - template using GCSoftmaxLayerFixture = SoftmaxValidationFixture; @@ -89,7 +65,7 @@ TEST_SUITE(FP16) FIXTURE_DATA_TEST_CASE(RunSmall, GCSoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Beta", 1.0f)), - framework::dataset::make("ReduceEndAxis", 0))) + framework::dataset::make("Axis", 0))) { // Validate output validate(GCAccessor(_target), _reference, tolerance_f16); @@ -97,18 +73,18 @@ FIXTURE_DATA_TEST_CASE(RunSmall, GCSoftmaxLayerFixture, framew FIXTURE_DATA_TEST_CASE(RunLarge, GCSoftmaxLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(), 
framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Beta", 1.0f)), - framework::dataset::make("ReduceEndAxis", 0))) + framework::dataset::make("Axis", 0))) { // Validate output validate(GCAccessor(_target), _reference, tolerance_f16); } -TEST_SUITE_END() +TEST_SUITE_END() // FP16 TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunSmall, GCSoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Beta", 1.0f)), - framework::dataset::make("ReduceEndAxis", 0))) + framework::dataset::make("Axis", 0))) { // Validate output validate(GCAccessor(_target), _reference, tolerance_f32); @@ -116,16 +92,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, GCSoftmaxLayerFixture, framework::Datase FIXTURE_DATA_TEST_CASE(RunLarge, GCSoftmaxLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Beta", 1.0f)), - framework::dataset::make("ReduceEndAxis", 0))) + framework::dataset::make("Axis", 0))) { // Validate output validate(GCAccessor(_target), _reference, tolerance_f32); } -TEST_SUITE_END() -TEST_SUITE_END() +TEST_SUITE_END() // FP32 +TEST_SUITE_END() // Float -TEST_SUITE_END() -TEST_SUITE_END() +TEST_SUITE_END() // SoftmaxLayer +TEST_SUITE_END() // GC } // namespace validation } // namespace test } // namespace arm_compute diff --git a/tests/validation/GLES_COMPUTE/Transpose.cpp b/tests/validation/GLES_COMPUTE/Transpose.cpp index d1c640dc44..90bd53a909 100644 --- a/tests/validation/GLES_COMPUTE/Transpose.cpp +++ b/tests/validation/GLES_COMPUTE/Transpose.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -45,27 +45,6 @@ namespace validation TEST_SUITE(GC) TEST_SUITE(Transpose) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(concat(datasets::Small2DShapes(), datasets::Large2DShapes()), framework::dataset::make("DataType", { DataType::F16, DataType::F32 })), - shape, data_type) -{ - // Make rows the columns of the original shape - TensorShape output_shape{ shape[1], shape[0] }; - - // Create tensors - GCTensor ref_src = create_tensor(shape, data_type); - GCTensor dst = create_tensor(output_shape, data_type); - - // Create and Configure function - GCTranspose trans; - trans.configure(&ref_src, &dst); - - // Validate dst region - const ValidRegion valid_region = shape_to_valid_region(output_shape); - validate(dst.info()->valid_region(), valid_region); - - // TODO(bsgcomp): Add padding validation (COMPMID-659) -} - template using GCTransposeFixture = TransposeValidationFixture; diff --git a/tests/validation/NEON/AbsoluteDifference.cpp b/tests/validation/NEON/AbsoluteDifference.cpp index 9e9a7db738..36499a4552 100644 --- a/tests/validation/NEON/AbsoluteDifference.cpp +++ b/tests/validation/NEON/AbsoluteDifference.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
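The GLES dataset rename above ("ReduceEndAxis" back to "Axis") matches the CL softmax hunks earlier in this patch, which begin exercising negative axis values such as -1 and -4. Presumably these are wrapped against the tensor rank before use; a sketch of that common convention (a hypothetical helper, not the library's code):

    // Sketch only: -1 addresses the innermost dimension, -rank the outermost.
    int wrap_axis(int axis, int rank)
    {
        return (axis < 0) ? axis + rank : axis;
    }
    // For a 4D tensor: wrap_axis(-1, 4) == 3, wrap_axis(-4, 4) == 0.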
* * SPDX-License-Identifier: MIT * @@ -57,28 +57,6 @@ template using NEAbsoluteDifferenceFixture = AbsoluteDifferenceValidationFixture; TEST_SUITE(U8) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), AbsoluteDifferenceU8Dataset), - shape, data_type0, data_type1, output_data_type) -{ - // Create tensors - Tensor ref_src1 = create_tensor(shape, data_type0); - Tensor ref_src2 = create_tensor(shape, data_type1); - Tensor dst = create_tensor(shape, output_data_type); - - // Create and Configure function - NEAbsoluteDifference abs_diff; - abs_diff.configure(&ref_src1, &ref_src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - validate(ref_src1.info()->padding(), padding); - validate(ref_src2.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} FIXTURE_DATA_TEST_CASE(RunSmall, NEAbsoluteDifferenceFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), AbsoluteDifferenceU8Dataset)) { @@ -93,28 +71,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEAbsoluteDifferenceFixture, framework TEST_SUITE_END() // U8 TEST_SUITE(S16) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), AbsoluteDifferenceS16Dataset), - shape, data_type0, data_type1, output_data_type) -{ - // Create tensors - Tensor ref_src1 = create_tensor(shape, data_type0); - Tensor ref_src2 = create_tensor(shape, data_type1); - Tensor dst = create_tensor(shape, output_data_type); - - // Create and Configure function - NEAbsoluteDifference abs_diff; - abs_diff.configure(&ref_src1, &ref_src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - validate(ref_src1.info()->padding(), padding); - validate(ref_src2.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} FIXTURE_DATA_TEST_CASE(RunSmall, NEAbsoluteDifferenceFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), AbsoluteDifferenceS16Dataset)) { diff --git a/tests/validation/NEON/Accumulate.cpp b/tests/validation/NEON/Accumulate.cpp index e49069876f..963d697222 100644 --- a/tests/validation/NEON/Accumulate.cpp +++ b/tests/validation/NEON/Accumulate.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -53,26 +53,6 @@ TEST_SUITE(NEON) TEST_SUITE(Accumulate) TEST_SUITE(U8) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), AccumulateS16Dataset), - shape, data_type, output_data_type) -{ - // Create tensors - Tensor ref_src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, output_data_type); - - // Create and Configure function - NEAccumulate accum; - accum.configure(&ref_src, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - validate(ref_src.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} template using NEAccumulateFixture = AccumulateValidationFixture; @@ -94,31 +74,6 @@ TEST_SUITE_END() // Accumulate TEST_SUITE(AccumulateWeighted) TEST_SUITE(U8) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), AccumulateU8Dataset), - shape, data_type, output_data_type) -{ - // Generate a random alpha value - std::mt19937 gen(library->seed()); - std::uniform_real_distribution<> float_dist(0, 1); - const float alpha = float_dist(gen); - - // Create tensors - Tensor ref_src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, output_data_type); - - // Create and Configure function - NEAccumulateWeighted accum_weight; - accum_weight.configure(&ref_src, alpha, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - validate(ref_src.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} template using NEAccumulateWeightedFixture = AccumulateWeightedValidationFixture; @@ -140,31 +95,6 @@ TEST_SUITE_END() // AccumulateWeighted TEST_SUITE(AccumulateSquared) TEST_SUITE(U8) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), AccumulateS16Dataset), - shape, data_type, output_data_type) -{ - // Generate a random shift value - std::mt19937 gen(library->seed()); - std::uniform_int_distribution int_dist(0, 15); - const uint32_t shift = int_dist(gen); - - // Create tensors - Tensor ref_src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, output_data_type); - - // Create and Configure function - NEAccumulateSquared accum_square; - accum_square.configure(&ref_src, shift, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - validate(ref_src.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} template using NEAccumulateSquaredFixture = AccumulateSquaredValidationFixture; diff --git a/tests/validation/NEON/ActivationLayer.cpp b/tests/validation/NEON/ActivationLayer.cpp index 33e2850cb1..0ef4590d7e 100644 --- a/tests/validation/NEON/ActivationLayer.cpp +++ b/tests/validation/NEON/ActivationLayer.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/RuntimeContext.h" #include "arm_compute/runtime/Tensor.h" @@ -36,6 +37,8 @@ #include "tests/validation/Validation.h" #include "tests/validation/fixtures/ActivationLayerFixture.h" +#include "support/Requires.h" + namespace arm_compute { namespace test @@ -44,6 +47,9 @@ namespace validation { namespace { +RelativeTolerance tolerance_float_sqrt(0.0001f); + + /** Define relative tolerance of the activation layer. * * @param[in] data_type The data type used. @@ -123,52 +129,47 @@ const auto NeonActivationFunctionsDataset = concat(datasets::ActivationFunctions /** Input data sets. */ const auto ActivationDataset = combine(combine(framework::dataset::make("InPlace", { false, true }), NeonActivationFunctionsDataset), framework::dataset::make("AlphaBeta", { 0.5f, 1.f })); -} // namespace -TEST_SUITE(NEON) -TEST_SUITE(ActivationLayer) - -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), CNNDataTypes), framework::dataset::make("InPlace", { false, true })), - shape, data_type, in_place) +template ::value)> +void test_float_sqrt_boundary_value() { - // Create tensors - Tensor src = create_tensor(shape, data_type, 1); - Tensor dst = create_tensor(shape, data_type, 1); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create context - RuntimeContext ctx; + constexpr auto vector_size = uint32_t{ 16 }; - // Create and configure function - NEActivationLayer act_layer(&ctx); + auto data_type = DataType::F32; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + data_type = std::is_same::value ? DataType::F16 : data_type; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - if(in_place) - { - act_layer.configure(&src, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ABS)); - } - else + const auto boundary_value_vector = std::vector { - act_layer.configure(&src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ABS)); - } + std::numeric_limits::min(), + T(0), + std::numeric_limits::epsilon(), + std::numeric_limits::max(), + }; - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(src.info()->valid_region(), valid_region); + // the following size ensures that the whole logic (vector + left-over) to be tested + // using all boundary values iff boundary_value_vecotr.size() is smaller than vector_size. 
+ auto shape = TensorShape{ vector_size + boundary_value_vector.size() }; + auto info = ActivationLayerInfo{ ActivationLayerInfo::ActivationFunction::SQRT }; + auto src = create_tensor<Tensor>(shape, data_type); - if(!in_place) - { - validate(dst.info()->valid_region(), valid_region); - } + auto act = NEActivationLayer{}; + act.configure(&src, nullptr, info); + src.allocator()->allocate(); + library->fill_static_values(Accessor(src), boundary_value_vector); + act.run(); - // Validate padding - validate(src.info()->padding(), PaddingSize()); - if(!in_place) - { - validate(dst.info()->padding(), PaddingSize()); - } + auto reference_src = SimpleTensor<T> { shape, data_type }; + library->fill_static_values(reference_src, boundary_value_vector); + auto reference_dst = reference::activation_layer<T>(reference_src, info); + + validate(Accessor(src), reference_dst, tolerance_float_sqrt); } +} // namespace + +TEST_SUITE(NEON) +TEST_SUITE(ActivationLayer) // *INDENT-OFF* // clang-format off @@ -200,6 +201,10 @@ using NEActivationLayerFixture = ActivationValidationFixture<Tensor, Accessor, NEActivationLayer, T>; TEST_SUITE(Float) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_SUITE(FP16) +TEST_CASE(SqrtBoundaryValue, framework::DatasetMode::ALL) +{ + test_float_sqrt_boundary_value<half>(); +} FIXTURE_DATA_TEST_CASE(RunSmall, NEActivationLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ActivationDataset), framework::dataset::make("DataType", DataType::F16))) @@ -207,10 +212,14 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEActivationLayerFixture<half>, framework::Data // Validate output validate(Accessor(_target), _reference, relative_tolerance(_data_type, _function), 0.f, absolute_tolerance(_data_type, _function)); } -TEST_SUITE_END() -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +TEST_SUITE_END() // FP16 +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ TEST_SUITE(FP32) +TEST_CASE(SqrtBoundaryValue, framework::DatasetMode::ALL) +{ + test_float_sqrt_boundary_value<float>(); +} FIXTURE_DATA_TEST_CASE(RunSmall, NEActivationLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ActivationDataset), framework::dataset::make("DataType", DataType::F32))) diff --git a/tests/validation/NEON/ArithmeticSubtraction.cpp b/tests/validation/NEON/ArithmeticSubtraction.cpp index f468f6d5c6..12fe64c396 100644 --- a/tests/validation/NEON/ArithmeticSubtraction.cpp +++ b/tests/validation/NEON/ArithmeticSubtraction.cpp @@ -70,6 +70,10 @@ const auto ArithmeticSubtractionU8Dataset = combine(combine(framework::dataset:: const auto ArithmeticSubtractionS16Dataset = combine(combine(framework::dataset::make("DataType", { DataType::U8, DataType::S16 }), framework::dataset::make("DataType", DataType::S16)), framework::dataset::make("DataType", DataType::S16)); + +const auto ArithmeticSubtractionS32Dataset = combine(combine(framework::dataset::make("DataType", DataType::S32), + framework::dataset::make("DataType", DataType::S32)), + framework::dataset::make("DataType", DataType::S32)); #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC const auto ArithmeticSubtractionFP16Dataset = combine(combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F16)), @@ -120,12 +124,14 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8), + TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8), })), framework::dataset::make("ConvertPolicy",{ ConvertPolicy::WRAP, ConvertPolicy::SATURATE, ConvertPolicy::SATURATE, ConvertPolicy::WRAP, ConvertPolicy::WRAP, + ConvertPolicy::WRAP, })),
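The SqrtBoundaryValue cases added above size the tensor as vector_size + boundary_value_vector.size() so that both the vectorized loop and the scalar left-over path see boundary inputs. A standalone sketch of the same validation structure, with std::sqrt standing in for the kernel under test and a double-precision sqrt as the reference (the tolerance mirrors tolerance_float_sqrt; everything else is illustrative):

```cpp
#include <cmath>
#include <cstdio>
#include <limits>
#include <vector>

int main()
{
    const float tolerance = 0.0001f; // mirrors tolerance_float_sqrt above
    const std::vector<float> boundary_values{
        std::numeric_limits<float>::min(), // smallest positive normal value
        0.0f,
        std::numeric_limits<float>::epsilon(),
        std::numeric_limits<float>::max(),
    };

    for(const float v : boundary_values)
    {
        const float  target    = std::sqrt(v);                       // "kernel" under test
        const double reference = std::sqrt(static_cast<double>(v));  // higher-precision reference
        const double err       = reference != 0.0 ? std::abs(target - reference) / reference
                                                  : std::abs(target - reference);
        std::printf("sqrt(%g) = %g, relative error %g (%s)\n",
                    v, target, err, err <= tolerance ? "ok" : "FAIL");
    }
    return 0;
}
```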
framework::dataset::make("Expected", { true, true, false, false, false, false})), input1_info, input2_info, output_info, policy, expected) @@ -270,6 +276,24 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture, framew } TEST_SUITE_END() // S16 +TEST_SUITE(S32) +FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionS32Dataset), + framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })), + OutOfPlaceDataSet)) +{ + // Validate output + validate(Accessor(_target), _reference); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), ArithmeticSubtractionS32Dataset), + framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })), + OutOfPlaceDataSet)) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // S32 + TEST_SUITE(Float) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_SUITE(F16) diff --git a/tests/validation/NEON/BitwiseAnd.cpp b/tests/validation/NEON/BitwiseAnd.cpp index f10be8d174..8796cc1147 100644 --- a/tests/validation/NEON/BitwiseAnd.cpp +++ b/tests/validation/NEON/BitwiseAnd.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -43,34 +43,6 @@ namespace validation TEST_SUITE(NEON) TEST_SUITE(BitwiseAnd) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), shape, data_type) -{ - // Create tensors - Tensor src1 = create_tensor(shape, data_type); - Tensor src2 = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEBitwiseAnd bitwise_and; - bitwise_and.configure(&src1, &src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(src1.info()->valid_region(), valid_region); - validate(src2.info()->valid_region(), valid_region); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - validate(src1.info()->padding(), padding); - validate(src2.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - template using NEBitwiseAndFixture = BitwiseAndValidationFixture; diff --git a/tests/validation/NEON/BitwiseNot.cpp b/tests/validation/NEON/BitwiseNot.cpp index a53e77d472..13b3e6d7f3 100644 --- a/tests/validation/NEON/BitwiseNot.cpp +++ b/tests/validation/NEON/BitwiseNot.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -43,30 +43,6 @@ namespace validation TEST_SUITE(NEON) TEST_SUITE(BitwiseNot) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), shape, data_type) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEBitwiseNot bitwise_not; - bitwise_not.configure(&src, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(src.info()->valid_region(), valid_region); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - validate(src.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - template using NEBitwiseNotFixture = BitwiseNotValidationFixture; diff --git a/tests/validation/NEON/BitwiseOr.cpp b/tests/validation/NEON/BitwiseOr.cpp index f74594a293..cef712f626 100644 --- a/tests/validation/NEON/BitwiseOr.cpp +++ b/tests/validation/NEON/BitwiseOr.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -43,34 +43,6 @@ namespace validation TEST_SUITE(NEON) TEST_SUITE(BitwiseOr) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), shape, data_type) -{ - // Create tensors - Tensor src1 = create_tensor(shape, data_type); - Tensor src2 = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEBitwiseOr bitwise_or; - bitwise_or.configure(&src1, &src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(src1.info()->valid_region(), valid_region); - validate(src2.info()->valid_region(), valid_region); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - validate(src1.info()->padding(), padding); - validate(src2.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - template using NEBitwiseOrFixture = BitwiseOrValidationFixture; diff --git a/tests/validation/NEON/BitwiseXor.cpp b/tests/validation/NEON/BitwiseXor.cpp index 094a69b599..3f973935df 100644 --- a/tests/validation/NEON/BitwiseXor.cpp +++ b/tests/validation/NEON/BitwiseXor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -43,34 +43,6 @@ namespace validation TEST_SUITE(NEON) TEST_SUITE(BitwiseXor) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), shape, data_type) -{ - // Create tensors - Tensor src1 = create_tensor(shape, data_type); - Tensor src2 = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEBitwiseXor bitwise_xor; - bitwise_xor.configure(&src1, &src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(src1.info()->valid_region(), valid_region); - validate(src2.info()->valid_region(), valid_region); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - validate(src1.info()->padding(), padding); - validate(src2.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - template using NEBitwiseXorFixture = BitwiseXorValidationFixture; diff --git a/tests/validation/NEON/Box3x3.cpp b/tests/validation/NEON/Box3x3.cpp index b4fd06a549..ef964bf856 100644 --- a/tests/validation/NEON/Box3x3.cpp +++ b/tests/validation/NEON/Box3x3.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -50,41 +50,6 @@ constexpr BorderSize border_size(filter_size / 2); /* Border size of the kerne TEST_SUITE(NEON) TEST_SUITE(Box3x3) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), - datasets::BorderModes()), - shape, data_type, border_mode) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEBox3x3 box3x3; - box3x3.configure(&src, &dst, border_mode); - - // Validate valid region - const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), border_size); - validate(dst.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 8); - calculator.set_border_size(1); - calculator.set_border_mode(border_mode); - - const PaddingSize dst_padding = calculator.required_padding(); - - calculator.set_accessed_elements(16); - calculator.set_access_offset(-1); - - const PaddingSize src_padding = calculator.required_padding(); - - validate(src.info()->padding(), src_padding); - validate(dst.info()->padding(), dst_padding); -} - template using NEBox3x3Fixture = Box3x3ValidationFixture; diff --git a/tests/validation/NEON/CannyEdge.cpp b/tests/validation/NEON/CannyEdge.cpp index 42222c0450..da33ff9eeb 100644 --- a/tests/validation/NEON/CannyEdge.cpp +++ b/tests/validation/NEON/CannyEdge.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
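A pattern shared by the Configuration tests removed in these suites is PaddingCalculator(shape.x(), N).required_padding(). A simplified sketch of the assumed arithmetic: the row width is rounded up to the next multiple of the N elements processed per SIMD iteration, and the shortfall becomes right-hand padding.

```cpp
#include <cstdio>

// Assumed, simplified model of PaddingCalculator::required_padding():
// pad the row so that a kernel consuming N elements per iteration never
// reads past the end of the image.
int main()
{
    const unsigned int width     = 27; // example row width
    const unsigned int processed = 16; // elements consumed per iteration
    const unsigned int remainder = width % processed;
    const unsigned int right_pad = remainder == 0 ? 0 : processed - remainder;
    std::printf("width %u, %u per iteration -> %u element(s) of right padding\n",
                width, processed, right_pad);
    return 0;
}
```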
* * SPDX-License-Identifier: MIT * @@ -55,41 +55,6 @@ const auto data = combine(framework::dataset::make("GradientSize", { 3, 5, 7 }), TEST_SUITE(NEON) TEST_SUITE(CannyEdge) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(concat(datasets::Small2DShapes(), datasets::Large2DShapes()), data), framework::dataset::make("Format", Format::U8)), - shape, gradient_size, normalization, border_mode, format) -{ - CannyEdgeParameters params = canny_edge_parameters(); - // Convert normalisation type to integer - const auto norm_type = static_cast(normalization) + 1; - - // Create tensors - Tensor src = create_tensor(shape, data_type_from_format(format)); - Tensor dst = create_tensor(shape, data_type_from_format(format)); - src.info()->set_format(format); - dst.info()->set_format(format); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create Canny edge configure function - NECannyEdge canny_edge; - canny_edge.configure(&src, &dst, params.upper_thresh, params.lower_thresh, gradient_size, norm_type, border_mode, params.constant_border_value); - - // Validate valid region - validate(src.info()->valid_region(), shape_to_valid_region(shape, (BorderMode::UNDEFINED == border_mode))); - validate(dst.info()->valid_region(), shape_to_valid_region(shape, (BorderMode::UNDEFINED == border_mode))); - - // Validate padding - PaddingCalculator calculator(shape.x(), 8); - calculator.set_border_mode(border_mode); - calculator.set_border_size(gradient_size / 2); - calculator.set_access_offset(-gradient_size / 2); - calculator.set_accessed_elements(16); - - validate(src.info()->padding(), calculator.required_padding()); - validate(dst.info()->padding(), PaddingSize{ 1 }); -} - template using NECannyEdgeFixture = CannyEdgeValidationFixture; diff --git a/tests/validation/NEON/ChannelCombine.cpp b/tests/validation/NEON/ChannelCombine.cpp index 1a400f2a7d..8ca9828bed 100644 --- a/tests/validation/NEON/ChannelCombine.cpp +++ b/tests/validation/NEON/ChannelCombine.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -42,56 +42,9 @@ namespace test { namespace validation { -namespace -{ -inline void validate_configuration(const TensorShape &shape, Format format) -{ - const int num_planes = num_planes_from_format(format); - - // Create tensors - MultiImage dst = create_multi_image(shape, format); - std::vector ref_src = create_tensor_planes(shape, format); - - // Create and configure function - NEChannelCombine channel_combine; - - if(num_planes == 1) - { - const Tensor *tensor_extra = Format::RGBA8888 == format ? 
&ref_src[3] : nullptr; - - channel_combine.configure(&ref_src[0], &ref_src[1], &ref_src[2], tensor_extra, dst.plane(0)); - } - else - { - channel_combine.configure(&ref_src[0], &ref_src[1], &ref_src[2], &dst); - } - - // TODO(bsgcomp): Add validation for padding and shape (COMPMID-659) -} -} // namespace - TEST_SUITE(NEON) TEST_SUITE(ChannelCombine) -TEST_SUITE(Configuration) -DATA_TEST_CASE(RGBA, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), framework::dataset::make("FormatType", { Format::RGB888, Format::RGBA8888 })), - shape, format) -{ - validate_configuration(shape, format); -} -DATA_TEST_CASE(YUV, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), framework::dataset::make("FormatType", { Format::YUYV422, Format::UYVY422 })), - shape, format) -{ - validate_configuration(shape, format); -} - -DATA_TEST_CASE(YUVPlanar, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), framework::dataset::make("FormatType", { Format::IYUV, Format::YUV444, Format::NV12, Format::NV21 })), - shape, format) -{ - validate_configuration(shape, format); -} -TEST_SUITE_END() // Configuration - template using NEChannelCombineFixture = ChannelCombineValidationFixture; diff --git a/tests/validation/NEON/ChannelExtract.cpp b/tests/validation/NEON/ChannelExtract.cpp index db7a9cf65f..d8b1921767 100644 --- a/tests/validation/NEON/ChannelExtract.cpp +++ b/tests/validation/NEON/ChannelExtract.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -51,34 +51,6 @@ const auto ChannelExtractYUVDataset = combine(framework::dataset::make("FormatTy framework::dataset::make("ChannelType", { Channel::Y, Channel::U, Channel::V })); const auto ChannelExtractYUVPlanarDataset = combine(framework::dataset::make("FormatType", { Format::IYUV, Format::YUV444, Format::NV12, Format::NV21 }), framework::dataset::make("ChannelType", { Channel::Y, Channel::U, Channel::V })); - -inline void validate_configuration(const TensorShape &shape, Format format, Channel channel) -{ - const unsigned int num_planes = num_planes_from_format(format); - - TensorShape dst_shape = adjust_odd_shape(shape, format); - dst_shape = calculate_subsampled_shape(dst_shape, format, channel); - - // Create tensors - MultiImage ref_src = create_multi_image(shape, format); - Tensor dst = create_tensor(dst_shape, Format::U8); - - // Create and Configure function - NEChannelExtract channel_extract; - - if(1U == num_planes) - { - const Tensor *plane_src = ref_src.plane(0); - - channel_extract.configure(plane_src, channel, &dst); - } - else - { - channel_extract.configure(&ref_src, channel, &dst); - } - - // TODO(bsgcomp): Add validation for padding and shape (COMPMID-659) -} } // namespace TEST_SUITE(NEON) @@ -87,25 +59,6 @@ TEST_SUITE(ChannelExtract) template using NEChannelExtractFixture = ChannelExtractValidationFixture; -TEST_SUITE(Configuration) -DATA_TEST_CASE(RGBA, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ChannelExtractRGBADataset), - shape, format, channel) -{ - validate_configuration(shape, format, channel); -} -DATA_TEST_CASE(YUV, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ChannelExtractYUVDataset), - shape, format, channel) -{ - validate_configuration(shape, format, channel); -} - -DATA_TEST_CASE(YUVPlanar, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ChannelExtractYUVPlanarDataset), - shape, format, channel) -{ - validate_configuration(shape, format, channel); -} 
-TEST_SUITE_END() // Configuration - TEST_SUITE(RGBA) FIXTURE_DATA_TEST_CASE(RunSmall, NEChannelExtractFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), ChannelExtractRGBADataset)) { diff --git a/tests/validation/NEON/ChannelShuffle.cpp b/tests/validation/NEON/ChannelShuffle.cpp index d7b98d942a..d0fa82fc53 100644 --- a/tests/validation/NEON/ChannelShuffle.cpp +++ b/tests/validation/NEON/ChannelShuffle.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -70,25 +70,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( // clang-format on // *INDENT-ON* -DATA_TEST_CASE(Configuration, - framework::DatasetMode::ALL, - combine(datasets::SmallRandomChannelShuffleLayerDataset(), - framework::dataset::make("DataType", { DataType::S8, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F32 })), - shape, num_groups, data_type) -{ - // Create tensors - Tensor ref_src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, data_type); - - // Create and Configure function - NEChannelShuffleLayer channel_shuffle_func; - channel_shuffle_func.configure(&ref_src, &dst, num_groups); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); -} - template using NEChannelShuffleLayerFixture = ChannelShuffleLayerValidationFixture; diff --git a/tests/validation/NEON/ColorConvert.cpp b/tests/validation/NEON/ColorConvert.cpp index be64ee0095..e079bd12f3 100644 --- a/tests/validation/NEON/ColorConvert.cpp +++ b/tests/validation/NEON/ColorConvert.cpp @@ -70,62 +70,6 @@ const auto ColorConvert_YUYVDataset_to_NVDataset = combine(YUYVDataset, const auto ColorConvert_NVDataset_to_YUVDataset = combine(framework::dataset::make("FormatType", { Format::NV12, Format::NV21 }), framework::dataset::make("FormatType", { Format::IYUV, Format::YUV444 })); - -inline void validate_configuration(const TensorShape &shape, Format src_format, Format dst_format) -{ - const unsigned int src_num_planes = num_planes_from_format(src_format); - const unsigned int dst_num_planes = num_planes_from_format(dst_format); - - TensorShape input = adjust_odd_shape(shape, src_format); - input = adjust_odd_shape(input, dst_format); - - // Create tensors - MultiImage ref_src = create_multi_image(input, src_format); - MultiImage ref_dst = create_multi_image(input, dst_format); - - // Create and Configure function - NEColorConvert color_convert; - - if(1U == src_num_planes) - { - const Tensor *src_plane = ref_src.plane(0); - - if(1U == dst_num_planes) - { - Tensor *dst_plane = ref_dst.plane(0); - color_convert.configure(src_plane, dst_plane); - } - else - { - color_convert.configure(src_plane, &ref_dst); - } - } - else - { - if(1U == dst_num_planes) - { - Tensor *dst_plane = ref_dst.plane(0); - color_convert.configure(&ref_src, dst_plane); - } - else - { - color_convert.configure(&ref_src, &ref_dst); - } - } - - for(unsigned int plane_idx = 0; plane_idx < src_num_planes; ++plane_idx) - { - const Tensor *src_plane = ref_src.plane(plane_idx); - - ARM_COMPUTE_EXPECT(src_plane->info()->is_resizable(), framework::LogLevel::ERRORS); - } - for(unsigned int plane_idx = 0; plane_idx < dst_num_planes; ++plane_idx) - { - const Tensor *dst_plane = ref_dst.plane(plane_idx); - - ARM_COMPUTE_EXPECT(dst_plane->info()->is_resizable(), framework::LogLevel::ERRORS); - } -} } // namespace TEST_SUITE(NEON) 
@@ -134,56 +78,6 @@ TEST_SUITE(ColorConvert) template using NEColorConvertFixture = ColorConvertValidationFixture; -TEST_SUITE(Configuration) -DATA_TEST_CASE(RGBA, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ColorConvert_RGBA_to_RGB), - shape, src_format, dst_format) -{ - validate_configuration(shape, src_format, dst_format); -} - -DATA_TEST_CASE(RGB, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ColorConvert_RGB_to_RGBA), - shape, src_format, dst_format) -{ - validate_configuration(shape, src_format, dst_format); -} - -DATA_TEST_CASE(RGBtoU8, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ColorConvert_RGB_to_U8), - shape, src_format, dst_format) -{ - validate_configuration(shape, src_format, dst_format); -} - -DATA_TEST_CASE(YUV, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ColorConvert_YUYVDataset_to_RGBDataset), - shape, src_format, dst_format) -{ - validate_configuration(shape, src_format, dst_format); -} - -DATA_TEST_CASE(YUVPlanar, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ColorConvert_YUVPlanar_to_RGBDataset), - shape, src_format, dst_format) -{ - validate_configuration(shape, src_format, dst_format); -} - -DATA_TEST_CASE(NV, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ColorConvert_RGBDataset_to_NVDataset), - shape, src_format, dst_format) -{ - validate_configuration(shape, src_format, dst_format); -} - -DATA_TEST_CASE(YUYVtoNV, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ColorConvert_YUYVDataset_to_NVDataset), - shape, src_format, dst_format) -{ - validate_configuration(shape, src_format, dst_format); -} - -DATA_TEST_CASE(NVtoYUV, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), ColorConvert_NVDataset_to_YUVDataset), - shape, src_format, dst_format) -{ - validate_configuration(shape, src_format, dst_format); -} -TEST_SUITE_END() // Configuration - TEST_SUITE(RGBA) FIXTURE_DATA_TEST_CASE(RunSmall, NEColorConvertFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), ColorConvert_RGBA_to_RGB)) { diff --git a/tests/validation/NEON/Comparisons.cpp b/tests/validation/NEON/Comparisons.cpp index 8dc78d870c..b77bcdd4f0 100644 --- a/tests/validation/NEON/Comparisons.cpp +++ b/tests/validation/NEON/Comparisons.cpp @@ -43,15 +43,6 @@ namespace validation { namespace { -const auto configure_dataset = combine(datasets::SmallShapes(), - framework::dataset::make("DataType", { DataType::QASYMM8, - DataType::QASYMM8_SIGNED, -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - DataType::F16, -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - DataType::F32 - })); - const auto run_small_dataset = combine(datasets::ComparisonOperations(), datasets::SmallShapes()); const auto run_small_broadcast_dataset = combine(datasets::ComparisonOperations(), datasets::SmallShapesBroadcast()); const auto run_large_dataset = combine(datasets::ComparisonOperations(), datasets::LargeShapes()); @@ -94,6 +85,17 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( template using NEComparisonFixture = ComparisonValidationFixture; +TEST_SUITE(Bool) +FIXTURE_DATA_TEST_CASE(RunSmall, + NEComparisonFixture, + framework::DatasetMode::PRECOMMIT, + combine(run_small_dataset, framework::dataset::make("DataType", DataType::U8))) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() + TEST_SUITE(Float) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_SUITE(FP16) diff --git a/tests/validation/NEON/Convolution.cpp 
b/tests/validation/NEON/Convolution.cpp index 96e07dd698..13bc34c995 100644 --- a/tests/validation/NEON/Convolution.cpp +++ b/tests/validation/NEON/Convolution.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -55,46 +55,6 @@ constexpr AbsoluteTolerance tolerance_s16(1); TEST_SUITE(NEON) TEST_SUITE(CustomConvolution) TEST_SUITE(Square3x3) - -DATA_TEST_CASE(Configuration, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), framework::dataset::make("DataType", { DataType::U8, DataType::S16 })), - datasets::BorderModes()), - framework::dataset::make("filter_size", { 3 })), - shape, output_data_type, border_mode, filter_size) -{ - // Create tensors - Tensor src = create_tensor(shape, DataType::U8); - Tensor dst = create_tensor(shape, output_data_type); - - // Create conv matrix - std::array conv = { 0 }; - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEConvolution3x3 convolution; - convolution.configure(&src, &dst, conv.data(), 0, border_mode); - - // Validate valid region - const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), BorderSize(filter_size / 2)); - validate(dst.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 8); - calculator.set_border_size(1); - calculator.set_border_mode(border_mode); - - const PaddingSize dst_padding = calculator.required_padding(); - - calculator.set_accessed_elements(16); - calculator.set_access_offset(-1); - - const PaddingSize src_padding = calculator.required_padding(); - - validate(src.info()->padding(), src_padding); - validate(dst.info()->padding(), dst_padding); -} - template using NEConvolutionFixture = ConvolutionSquareValidationFixture; @@ -124,45 +84,6 @@ TEST_SUITE_END() // S16 TEST_SUITE_END() // Square3x3 TEST_SUITE(Square5x5) -DATA_TEST_CASE(Configuration, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), framework::dataset::make("DataType", { DataType::U8, DataType::S16 })), - datasets::BorderModes()), - framework::dataset::make("filter_size", { 5 })), - shape, output_data_type, border_mode, filter_size) -{ - // Create tensors - Tensor src = create_tensor(shape, DataType::U8); - Tensor dst = create_tensor(shape, output_data_type); - - // Create conv matrix - std::array conv = { 0 }; - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEConvolution5x5 convolution; - convolution.configure(&src, &dst, conv.data(), 0, border_mode); - - // Validate valid region - const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), BorderSize(filter_size / 2)); - validate(dst.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 8); - calculator.set_border_size(2); - calculator.set_border_mode(border_mode); - - const PaddingSize dst_padding = calculator.required_padding(); - - calculator.set_accessed_elements(16); - calculator.set_access_offset(-2); - - const PaddingSize src_padding = calculator.required_padding(); - - 
validate(src.info()->padding(), src_padding); - validate(dst.info()->padding(), dst_padding); -} - template using NEConvolutionFixture = ConvolutionSquareValidationFixture; @@ -192,45 +113,6 @@ TEST_SUITE_END() // S16 TEST_SUITE_END() // Square5x5 TEST_SUITE(Square7x7) -DATA_TEST_CASE(Configuration, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), framework::dataset::make("DataType", { DataType::U8, DataType::S16 })), - datasets::BorderModes()), - framework::dataset::make("filter_size", { 7 })), - shape, output_data_type, border_mode, filter_size) -{ - // Create tensors - Tensor src = create_tensor(shape, DataType::U8); - Tensor dst = create_tensor(shape, output_data_type); - - // Create conv matrix - std::array conv = { 0 }; - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEConvolution7x7 convolution; - convolution.configure(&src, &dst, conv.data(), 0, border_mode); - - // Validate valid region - const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), BorderSize(filter_size / 2)); - validate(dst.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 8); - calculator.set_border_size(3); - calculator.set_border_mode(border_mode); - - const PaddingSize dst_padding = calculator.required_padding(); - - calculator.set_accessed_elements(16); - calculator.set_access_offset(-3); - - const PaddingSize src_padding = calculator.required_padding(); - - validate(src.info()->padding(), src_padding); - validate(dst.info()->padding(), dst_padding); -} - template using NEConvolutionFixture = ConvolutionSquareValidationFixture; @@ -260,45 +142,6 @@ TEST_SUITE_END() // S16 TEST_SUITE_END() // Square7x7 TEST_SUITE(Square9x9) -DATA_TEST_CASE(Configuration, framework::DatasetMode::NIGHTLY, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), framework::dataset::make("DataType", { DataType::U8, DataType::S16 })), - datasets::BorderModes()), - framework::dataset::make("filter_size", { 9 })), - shape, output_data_type, border_mode, filter_size) -{ - // Create tensors - Tensor src = create_tensor(shape, DataType::U8); - Tensor dst = create_tensor(shape, output_data_type); - - // Create conv matrix - std::array conv = { 0 }; - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEConvolution9x9 convolution; - convolution.configure(&src, &dst, conv.data(), 0, border_mode); - - // Validate valid region - const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), BorderSize(filter_size / 2)); - validate(dst.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 8); - calculator.set_border_size(4); - calculator.set_border_mode(border_mode); - - const PaddingSize dst_padding = calculator.required_padding(); - - calculator.set_accessed_elements(16); - calculator.set_access_offset(-4); - - const PaddingSize src_padding = calculator.required_padding(); - - validate(src.info()->padding(), src_padding); - validate(dst.info()->padding(), dst_padding); -} - template using NEConvolutionFixture = ConvolutionSquareValidationFixture; @@ -328,51 
+171,6 @@ TEST_SUITE_END() // S16 TEST_SUITE_END() // Square9x9 TEST_SUITE(Rectangle) -DATA_TEST_CASE(Configuration, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), framework::dataset::make("DataType", -{ DataType::U8, DataType::S16 })), -datasets::BorderModes()), -framework::dataset::make("filter_width", { 3, 5, 7, 9 })), -framework::dataset::make("filter_height", { 3, 5, 7, 9 })), -shape, output_data_type, border_mode, filter_width, filter_height) -{ - // Create tensors - Tensor src = create_tensor(shape, DataType::U8); - Tensor dst = create_tensor(shape, output_data_type); - - // Create conv matrix - std::vector conv(filter_height * filter_width); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEConvolutionRectangle convolution; - convolution.configure(&src, &dst, conv.data(), filter_width, filter_height, 1, border_mode); - - // Validate valid region - const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), BorderSize(filter_height / 2, filter_width / 2)); - validate(dst.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 8); - calculator.set_border_size(filter_width / 2); - calculator.set_border_mode(border_mode); - - const PaddingSize dst_padding = calculator.required_padding(); - - calculator.set_accessed_elements(16); - calculator.set_access_offset(-(filter_width / 2)); - - const PaddingSize width_padding = calculator.required_padding(); - - calculator.set_border_size(filter_height / 2); - calculator.set_access_offset(-(filter_height / 2)); - const PaddingSize height_padding = calculator.required_padding(); - - validate(src.info()->padding(), width_padding, height_padding); - validate(dst.info()->padding(), dst_padding); -} - template using NEConvolutionFixture = ConvolutionRectangleValidationFixture; diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp index 80615c5d57..9dff36b139 100644 --- a/tests/validation/NEON/ConvolutionLayer.cpp +++ b/tests/validation/NEON/ConvolutionLayer.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h" #include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h" #include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h" #include "arm_compute/runtime/Tensor.h" @@ -45,6 +46,20 @@ namespace test { namespace validation { +namespace detail +{ +template <> +void configure_conv_function(NEGEMMConv2d &func, + Tensor *src, const Tensor *weights, const Tensor *bias, Tensor *dst, + const PadStrideInfo &info, const WeightsInfo &weights_info, + const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +{ + ARM_COMPUTE_UNUSED(weights_info); + + Conv2dInfo conv_info(info, dilation, act_info, false, num_groups); + func.configure(src, weights, bias, dst, conv_info); +} +} // namespace detail namespace { const RelativeTolerance rel_tolerance_f32(0.01f); /**< Relative tolerance for FP32 types */ @@ -368,7 +383,7 @@ TEST_SUITE_END() // WinogradLayer TEST_SUITE(GEMMConvolutionLayer) template -using NEGEMMConvolutionLayerFixture = ConvolutionValidationFixture; +using NEGEMMConvolutionLayerFixture = 
ConvolutionValidationFixture<Tensor, Accessor, NEConvolutionLayer, T>; TEST_SUITE(Float) #if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) @@ -413,10 +428,10 @@ TEST_SUITE_END() // FP32 TEST_SUITE_END() // Float template <typename T> -using NEGEMMConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<Tensor, Accessor, NEGEMMConvolutionLayer, T>; +using NEGEMMConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<Tensor, Accessor, NEConvolutionLayer, T>; template <typename T> -using NEGEMMConvolutionLayerQuantizedPerChannelFixture = ConvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEGEMMConvolutionLayer, T, int8_t>; +using NEGEMMConvolutionLayerQuantizedPerChannelFixture = ConvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEConvolutionLayer, T, int8_t>; const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationInfo", { @@ -480,6 +495,84 @@ TEST_SUITE_END() // QSYMM8_PER_CHANNEL TEST_SUITE_END() // Quantized TEST_SUITE_END() // GEMMConvolutionLayer + +TEST_SUITE(DirectGEMMConv2d) +template <typename T> +using NEDirectGEMMConv2dLayerFixture = ConvolutionValidationFixture<Tensor, Accessor, NEGEMMConv2d, T>; + +TEST_SUITE(Float) +TEST_SUITE(FP32) +FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(), + framework::dataset::make("ReshapeWeights", { true })), + framework::dataset::make("DataType", DataType::F32)), + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + ActivationFunctionsDataset)) +{ + // Validate output + validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32)); +} +TEST_SUITE_END() // FP32 +TEST_SUITE_END() // Float + +#ifdef __aarch64__ +template <typename T> +using NEDirectGEMMConv2dLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<Tensor, Accessor, NEGEMMConv2d, T>; + +template <typename T> +using NEDirectGEMMConv2dLayerQuantizedPerChannelFixture = ConvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEGEMMConv2d, T, int8_t>; + +const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationInfo", +{ + ActivationLayerInfo(), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f) +}); +TEST_SUITE(Quantized) +TEST_SUITE(QASYMM8) +FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(), + framework::dataset::make("ReshapeWeights", { true })), + framework::dataset::make("DataType", DataType::QASYMM8)), + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })), + QuantizedActivationFunctionsDataset)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE_END() // QASYMM8 + +TEST_SUITE(QASYMM8_SIGNED) +FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(), + framework::dataset::make("ReshapeWeights", { true })), + framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.01f, -10) })), + QuantizedActivationFunctionsDataset)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE_END() // QASYMM8_SIGNED + +TEST_SUITE(QSYMM8_PER_CHANNEL) +FIXTURE_DATA_TEST_CASE(RunSmallSigned, NEDirectGEMMConv2dLayerQuantizedPerChannelFixture<int8_t>, framework::DatasetMode::ALL,
combine(combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(), + framework::dataset::make("ReshapeWeights", { true })), + framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED })), + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + QuantizationData), + QuantizedActivationFunctionsDataset), + framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE_END() // QSYMM8_PER_CHANNEL +TEST_SUITE_END() // Quantized +#endif // __aarch64__ + +TEST_SUITE_END() // DirectGEMMConv2d + TEST_SUITE_END() // NEON } // namespace validation } // namespace test diff --git a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp index 407ebe362a..e255fc7b4d 100644 --- a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp +++ b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp @@ -177,8 +177,6 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Invalid output size TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32), // Patch size bigger than input width TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32), // Dilation < 1 - TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32), // Window shrinking - TensorInfo(TensorShape(32U, 13U, 8U), 1, DataType::QASYMM8), // Window shrinking }), framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F16), TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32), @@ -188,8 +186,6 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32), TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32), TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32), - TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32), - TensorInfo(TensorShape(3U, 3U, 24U), 1, DataType::QASYMM8), })), framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32), TensorInfo(TensorShape(2U), 1, DataType::F32), @@ -199,8 +195,6 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip TensorInfo(TensorShape(2U), 1, DataType::F32), TensorInfo(TensorShape(16U), 1, DataType::F32), TensorInfo(TensorShape(16U), 1, DataType::F32), - TensorInfo(TensorShape(16U), 1, DataType::F32), - TensorInfo(TensorShape(24U), 1, DataType::S32), })), framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32), TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32), @@ -210,8 +204,6 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32), TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32), - TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 11U, 24U), 1, DataType::QASYMM8), })), framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(1, 1, 0, 0), @@ -221,8 +213,6 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip PadStrideInfo(1, 1, 0, 0), PadStrideInfo(1, 1, 0, 0), PadStrideInfo(1, 1, 0, 0), - PadStrideInfo(1, 1, 0, 0), - PadStrideInfo(1, 1, 1, 0), })), framework::dataset::make("DepthMultiplier", { 1, 1, @@ -232,8 +222,6 @@ DATA_TEST_CASE(ValidateGeneric, 
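For orientation, the DirectGEMMConv2d suites above drive NEGEMMConv2d through the configure_conv_function specialisation shown earlier, which bundles pad/stride, dilation, activation, the fast-math flag and the group count into a Conv2dInfo. A hedged usage sketch (NHWC only, as in the tests; shapes and values are illustrative, and Conv2dInfo is assumed to come from arm_compute/runtime/FunctionDescriptors.h):

```cpp
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // NHWC tensors: for this layout TensorShape is ordered (C, W, H, N).
    TensorInfo src_info(TensorShape(16U, 32U, 32U, 1U), 1, DataType::F32);
    TensorInfo wei_info(TensorShape(16U, 3U, 3U, 8U), 1, DataType::F32);
    TensorInfo bia_info(TensorShape(8U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(8U, 30U, 30U, 1U), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NHWC);
    wei_info.set_data_layout(DataLayout::NHWC);
    dst_info.set_data_layout(DataLayout::NHWC);

    Tensor src, weights, bias, dst;
    src.allocator()->init(src_info);
    weights.allocator()->init(wei_info);
    bias.allocator()->init(bia_info);
    dst.allocator()->init(dst_info);

    // Stride 1, no padding, 1x1 dilation, no fused activation, fast-math off,
    // a single group - the same argument order as the specialisation above.
    const Conv2dInfo conv_info(PadStrideInfo(1, 1, 0, 0), Size2D(1U, 1U), ActivationLayerInfo(), false, 1);

    NEGEMMConv2d conv;
    conv.configure(&src, &weights, &bias, &dst, conv_info);

    src.allocator()->allocate();
    weights.allocator()->allocate();
    bias.allocator()->allocate();
    dst.allocator()->allocate();

    conv.run(); // data left uninitialised; this sketch shows the plumbing only
    return 0;
}
```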
framework::DatasetMode::ALL, zip(zip(zip(zip(zip 1, 2, 2, - 2, - 3, })), framework::dataset::make("Dilation", { Size2D(1U, 1U), Size2D(1U, 1U), @@ -243,10 +231,8 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip Size2D(1U, 1U), Size2D(25U, 1U), Size2D(0U, 1U), - Size2D(1U, 1U), - Size2D(1U, 1U), })), - framework::dataset::make("Expected", { false, false, false, false, false, false,false, false, false, false })), + framework::dataset::make("Expected", { false, false, false, false, false, false, false, false})), input_info, weights_info, biases_info, output_info, conv_info, depth_multiplier,dilation, expected) { bool is_valid = bool(NEDepthwiseConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, depth_multiplier, ActivationLayerInfo(), dilation)); diff --git a/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp b/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp index 0e5024f6d3..d379ce728e 100644 --- a/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp +++ b/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h" +#include "src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h" #include "tests/NEON/Accessor.h" #include "tests/NEON/Helper.h" #include "tests/framework/Macros.h" @@ -98,77 +98,43 @@ const auto data_type_values = framework::dataset::make("data_type", { DataType:: /** Data layout values to test - All */ const auto data_layout_values = framework::dataset::make("data_layout", { DataLayout::NHWC }); - -/** Configuration test */ -void validate_configuration(size_t width_value, size_t height_value, size_t channel_value, size_t batch_value, Size2D kernel_sz_value, size_t depth_multiplier_value, Size2D dilation_value, Size2D stride_value, bool padding_valid_value, DataType data_type_value, DataLayout data_layout_value) -{ - TensorShape src_shape(width_value, height_value, channel_value, batch_value); - TensorShape weights_shape(kernel_sz_value.width, kernel_sz_value.height, channel_value * depth_multiplier_value); - TensorShape biases_shape(channel_value * depth_multiplier_value); - - if(data_layout_value == DataLayout::NHWC) - { - permute(src_shape, PermutationVector(2U, 0U, 1U, 3U)); - permute(weights_shape, PermutationVector(2U, 0U, 1U)); - } - - TensorInfo src_info(src_shape, 1, data_type_value); - TensorInfo weights_info(weights_shape, 1, data_type_value); - TensorInfo biases_info(biases_shape, 1, data_type_value); - - src_info.set_data_layout(data_layout_value); - weights_info.set_data_layout(data_layout_value); - biases_info.set_data_layout(data_layout_value); - - PadStrideInfo conv_info; - if(padding_valid_value) - { - conv_info = PadStrideInfo(); - } - else - { - conv_info = calculate_same_pad(src_shape, weights_shape, PadStrideInfo(stride_value.width, stride_value.height), data_layout_value, dilation_value); - } - - const TensorShape dst_shape = compute_depthwise_convolution_shape(src_info, weights_info, conv_info, depth_multiplier_value, dilation_value); - - // Create tensors - Tensor src = create_tensor(src_shape, 
data_type_value, 1, QuantizationInfo(), data_layout_value); - Tensor weights = create_tensor<Tensor>(weights_shape, data_type_value, 1, QuantizationInfo(), data_layout_value); - Tensor biases = create_tensor<Tensor>(biases_shape, data_type_value, 1, QuantizationInfo(), data_layout_value); - Tensor dst = create_tensor<Tensor>(dst_shape, data_type_value, 1, QuantizationInfo(), data_layout_value); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(biases.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEDepthwiseConvolutionLayerNative dwc; - dwc.configure(&src, &weights, &biases, &dst, conv_info, depth_multiplier_value, dilation_value); -} } // namespace TEST_SUITE(NEON) TEST_SUITE(DepthwiseConvolutionLayerNative) -TEST_SUITE(Float) -TEST_SUITE(FP32) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(width_values_precommit, - height_values_precommit), - channel_values_precommit), - batch_values_precommit), - kernel_sz_values_precommit), - depth_multiplier_values), - dilation_values), - stride_values), - padding_valid_values), - data_type_values), - data_layout_values), -width_value, height_value, channel_value, batch_value, kernel_sz_value, depth_multiplier_value, dilation_value, stride_value, padding_valid_value, data_type_value, data_layout_value) + +TEST_CASE(ValidateNoPadding, framework::DatasetMode::ALL) { - validate_configuration(width_value, height_value, channel_value, batch_value, kernel_sz_value, depth_multiplier_value, dilation_value, stride_value, padding_valid_value, data_type_value, data_layout_value); + // This test case ensures that the kernel does not add implicit padding + constexpr uint32_t vector_size = 8; // Assumed vector size of the current native kernel + constexpr auto depth = vector_size * 2 + 1; // misaligned depth to force padding if it exists.
+ constexpr auto data_layout = DataLayout::NHWC; + constexpr auto data_type = DataType::F32; + + const auto input_size = Size2D{ 100, 100 }; // random plane size of the input + const auto kernel_size = Size2D{ 4, 4 }; // random plane size of the kernel + const auto pad_stride_info = PadStrideInfo(3, 3); // random convolution information + + TensorShape src_shape{ depth, input_size.x(), input_size.y() }; + TensorShape weights_shape{ depth, kernel_size.x(), kernel_size.y() }; + TensorShape bias_shape{ depth }; + + auto src = create_tensor<Tensor>(src_shape, data_type, 1, QuantizationInfo(), data_layout); + auto weights = create_tensor<Tensor>(weights_shape, data_type, 1, QuantizationInfo(), data_layout); + auto biases = create_tensor<Tensor>(bias_shape, data_type, 1, QuantizationInfo(), data_layout); + auto dst = create_tensor<Tensor>(TensorShape(), data_type, 1, QuantizationInfo(), data_layout); + + NEDepthwiseConvolutionLayerNativeKernel dwc; + dwc.configure(&src, &weights, &biases, &dst, pad_stride_info); + + ARM_COMPUTE_EXPECT(src.info()->padding().empty(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(weights.info()->padding().empty(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(biases.info()->padding().empty(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(dst.info()->padding().empty(), framework::LogLevel::ERRORS); } +TEST_SUITE(Float) +TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerNativeFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(width_values_precommit, height_values_precommit), diff --git a/tests/validation/NEON/DequantizationLayer.cpp b/tests/validation/NEON/DequantizationLayer.cpp index f4defcd4e0..bce60c7891 100644 --- a/tests/validation/NEON/DequantizationLayer.cpp +++ b/tests/validation/NEON/DequantizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -117,32 +117,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip( // clang-format on // *INDENT-ON* -DATA_TEST_CASE(Configuration, - framework::DatasetMode::ALL, - combine(datasets::SmallShapes(), data_types), - shape, data_type) -{ - // Create tensors - Tensor src = create_tensor<Tensor>(shape, DataType::QASYMM8, 1, QuantizationInfo(0.5f, -10)); - Tensor dst = create_tensor<Tensor>(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEDequantizationLayer dequant_layer; - dequant_layer.configure(&src, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(src.info()->valid_region(), valid_region); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - validate(src.info()->padding(), PaddingSize()); - validate(dst.info()->padding(), PaddingSize()); -} - template <typename T> using NEDequantizationLayerFixture = DequantizationValidationFixture<Tensor, Accessor, NEDequantizationLayer, T>; diff --git a/tests/validation/NEON/Derivative.cpp b/tests/validation/NEON/Derivative.cpp index 0a047782ce..304ac824ed 100644 --- a/tests/validation/NEON/Derivative.cpp +++ b/tests/validation/NEON/Derivative.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited.
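The ValidateNoPadding case above picks depth = vector_size * 2 + 1 deliberately: a channel count that is not a multiple of the vector length forces a scalar left-over pass, which is exactly where a kernel might otherwise request implicit padding. The arithmetic, as a standalone sketch:

```cpp
#include <cstdio>

// Why depth = vector_size * 2 + 1 in ValidateNoPadding: a depth that is not
// a multiple of the vector length leaves a scalar tail after the full vector
// iterations, so any implicit padding the kernel added would be detected.
int main()
{
    constexpr unsigned int vector_size = 8;                   // assumed NEON vector width (F32 lanes)
    constexpr unsigned int depth       = vector_size * 2 + 1; // 17, deliberately misaligned

    const unsigned int full_iterations = depth / vector_size; // 2 vector iterations
    const unsigned int leftover        = depth % vector_size; // 1 scalar element

    std::printf("depth=%u -> %u vector iteration(s) + %u left-over element(s)\n",
                depth, full_iterations, leftover);
    return 0;
}
```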
* * SPDX-License-Identifier: MIT * @@ -44,42 +44,6 @@ TEST_SUITE(Derivative) using NEDerivativeFixture = DerivativeValidationFixture; -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format", - Format::U8)), - shape, border_mode, format) -{ - // Generate a random constant value - std::mt19937 gen(library->seed()); - std::uniform_int_distribution int_dist(0, 255); - const uint8_t constant_border_value = int_dist(gen); - - // Create tensors - Tensor src = create_tensor(shape, data_type_from_format(format)); - Tensor dst_x = create_tensor(shape, DataType::S16); - Tensor dst_y = create_tensor(shape, DataType::S16); - - src.info()->set_format(format); - dst_x.info()->set_format(Format::S16); - dst_y.info()->set_format(Format::S16); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst_x.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst_y.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create Derivative configure function - NEDerivative derivative; - derivative.configure(&src, &dst_x, &dst_y, border_mode, constant_border_value); - - // Validate valid region - constexpr BorderSize border_size{ 1 }; - const ValidRegion dst_valid_region = shape_to_valid_region(shape, border_mode == BorderMode::UNDEFINED, border_size); - - validate(dst_x.info()->valid_region(), dst_valid_region); - validate(dst_y.info()->valid_region(), dst_valid_region); - - // TODO(COMPMID-415) Validate padding after fixing x-access input bug in NEON kernel -} - FIXTURE_DATA_TEST_CASE(RunSmall, NEDerivativeFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format", Format::U8)), datasets::GradientDimensions())) diff --git a/tests/validation/NEON/Dilate.cpp b/tests/validation/NEON/Dilate.cpp index 668d2b01d7..9dc9dd57cd 100644 --- a/tests/validation/NEON/Dilate.cpp +++ b/tests/validation/NEON/Dilate.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -50,41 +50,6 @@ constexpr BorderSize border_size(filter_size / 2); /* Border size of the kerne TEST_SUITE(NEON) TEST_SUITE(Dilate) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), - datasets::BorderModes()), - shape, data_type, border_mode) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEDilate dilate; - dilate.configure(&src, &dst, border_mode); - - // Validate valid region - const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), border_size); - validate(dst.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 8); - calculator.set_border_size(1); - calculator.set_border_mode(border_mode); - - const PaddingSize dst_padding = calculator.required_padding(); - - calculator.set_accessed_elements(16); - calculator.set_access_offset(-1); - - const PaddingSize src_padding = calculator.required_padding(); - - validate(src.info()->padding(), src_padding); - validate(dst.info()->padding(), dst_padding); -} - template using NEDilateFixture = DilateValidationFixture; diff --git a/tests/validation/NEON/DilatedConvolutionLayer.cpp b/tests/validation/NEON/DilatedConvolutionLayer.cpp index 4c1e532e76..cf2f5f2ea4 100644 --- a/tests/validation/NEON/DilatedConvolutionLayer.cpp +++ b/tests/validation/NEON/DilatedConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -108,49 +108,6 @@ TEST_SUITE_END() // DilatedConvolutionLayer TEST_SUITE(GEMMDilatedConvolutionLayer) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallDilatedConvolutionLayerDataset(), - CNNDataTypes), - input_shape, weights_shape, bias_shape, output_shape, info, dilation, data_type) -{ - auto bias_data_type = is_data_type_quantized_asymmetric(data_type) ? 
DataType::S32 : data_type; - - // Create tensors - Tensor src = create_tensor(input_shape, data_type, 1, QuantizationInfo(2.f / 255.f, 127)); - Tensor weights = create_tensor(weights_shape, data_type, 1, QuantizationInfo(2.f / 255.f, 127)); - Tensor bias = create_tensor(bias_shape, bias_data_type, 1, QuantizationInfo(2.f / 255.f, 127)); - Tensor dst = create_tensor(output_shape, data_type, 1, QuantizationInfo(2.f / 255.f, 127)); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - const QuantizationInfo src_quantization_info = src.info()->quantization_info(); - const QuantizationInfo weights_quantization_info = weights.info()->quantization_info(); - - // Create and configure function - NEGEMMConvolutionLayer conv; - conv.configure(&src, &weights, &bias, &dst, info, WeightsInfo(), dilation); - - // Validate valid region - const ValidRegion src_valid_region = shape_to_valid_region(input_shape); - const ValidRegion weights_valid_region = shape_to_valid_region(weights_shape); - const ValidRegion bias_valid_region = shape_to_valid_region(bias_shape); - const ValidRegion dst_valid_region = shape_to_valid_region(output_shape); - - validate(src.info()->valid_region(), src_valid_region); - validate(weights.info()->valid_region(), weights_valid_region); - validate(bias.info()->valid_region(), bias_valid_region); - validate(dst.info()->valid_region(), dst_valid_region); - - // Validate QuantizationInfo - ARM_COMPUTE_EXPECT(src.info()->quantization_info() == src_quantization_info, framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(weights.info()->quantization_info() == weights_quantization_info, framework::LogLevel::ERRORS); - - // Validate padding - //TODO(COMPMID-415) Need to validate padding? -} - template using NEGEMMDilatedConvolutionLayerFixture = ConvolutionValidationFixture; diff --git a/tests/validation/NEON/DirectConvolutionLayer.cpp b/tests/validation/NEON/DirectConvolutionLayer.cpp index 88578ca586..afd9e3952f 100644 --- a/tests/validation/NEON/DirectConvolutionLayer.cpp +++ b/tests/validation/NEON/DirectConvolutionLayer.cpp @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ +#include "arm_compute/core/Helpers.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" #include "arm_compute/runtime/Tensor.h" @@ -78,12 +79,12 @@ const auto data_f16 = combine(datasets::SmallDirectConvolutionShapes(), combine(framework::dataset::make("StrideY", { 1, 2, 3 }), data_pad_f16))); -const auto data = combine(datasets::SmallDirectConvolutionShapes(), - combine(framework::dataset::make("StrideX", { 1 }), - combine(framework::dataset::make("StrideY", { 1 }), - combine(framework::dataset::make("PadX", { 1 }), - combine(framework::dataset::make("PadY", { 1 }), - framework::dataset::make("KernelSize", 3)))))); +const auto data_prec = combine(datasets::SmallDirectConvolutionShapes(), + combine(framework::dataset::make("StrideX", { 1 }), + combine(framework::dataset::make("StrideY", { 1 }), + combine(framework::dataset::make("PadX", { 1 }), + combine(framework::dataset::make("PadY", { 1 }), + framework::dataset::make("KernelSize", 3)))))); const auto data9x9 = combine(datasets::SmallDirectConvolutionShapes(), combine(framework::dataset::make("StrideX", { 1 }), @@ -95,7 +96,7 @@ const auto data9x9 = combine(datasets::SmallDirectConvolutionShapes(), const auto data_f32_nightly = combine(data_f32, framework::dataset::make("NumKernels", { 1, 4 })); const auto data_f16_nightly = combine(data_f16, framework::dataset::make("NumKernels", { 1, 4 })); -const auto data_precommit = combine(data, framework::dataset::make("NumKernels", { 1 })); +const auto data_precommit = combine(data_prec, framework::dataset::make("NumKernels", { 1 })); const auto data_precommit9x9 = combine(data9x9, framework::dataset::make("NumKernels", { 4 })); /* The following tests is from real use-case that made DirectConvolution @@ -195,7 +196,42 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip( // clang-format on // *INDENT-ON* -//TODO(COMPMID-415): Configuration tests? 
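+// The test below is meant to check zero-padding behaviour for the NHWC direct-convolution kernel: it derives the output shape from the input and weights, permutes the dataset's NCHW shapes into NHWC, configures NEDirectConvolutionLayer, and then validates that src, weights and dst all report zero padding after configure().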
+DATA_TEST_CASE(NoPaddingNHWCKernel, framework::DatasetMode::ALL, combine(combine(combine(data_precommit, + framework::dataset::make("DataType", DataType::F32)), + ActivationFunctionsDataset), + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + + shape, stride_x, stride_y, pad_x, pad_y, kernel_size, num_kernels, data_type, act_info, data_layout) +{ + TensorShape input_shape = TensorShape(shape); + TensorShape weights_shape(kernel_size, kernel_size, input_shape.z(), num_kernels); + const PadStrideInfo info(stride_x, stride_y, pad_x, pad_y, DimensionRoundingType::FLOOR); + + TensorInfo input_info = TensorInfo(input_shape, 1, data_type); + TensorInfo weights_info = TensorInfo(weights_shape, 1, data_type); + + TensorShape output_shape = compute_deep_convolution_shape(input_info, weights_info, info); + + if(data_layout == DataLayout::NHWC) + { + permute(input_shape, PermutationVector(2U, 0U, 1U)); + permute(weights_shape, PermutationVector(2U, 0U, 1U)); + permute(output_shape, PermutationVector(2U, 0U, 1U)); + } + + // Create tensors + Tensor src = create_tensor(input_shape, data_type, 1, QuantizationInfo(), data_layout); + Tensor weights = create_tensor(weights_shape, data_type, 1, QuantizationInfo(), data_layout); + Tensor dst = create_tensor(output_shape, data_type, 1, QuantizationInfo(), data_layout); + + // Create and configure function + NEDirectConvolutionLayer conv; + conv.configure(&src, &weights, nullptr, &dst, info, act_info); + + validate(src.info()->padding(), PaddingSize(0, 0, 0, 0)); + validate(weights.info()->padding(), PaddingSize(0, 0, 0, 0)); + validate(dst.info()->padding(), PaddingSize(0, 0, 0, 0)); +} template using NEDirectConvolutionLayerFixture = DirectConvolutionValidationFixture; diff --git a/tests/validation/NEON/ElementwiseAbsoluteValue.cpp b/tests/validation/NEON/ElementwiseAbsoluteValue.cpp index 000a6de609..f135ba9913 100644 --- a/tests/validation/NEON/ElementwiseAbsoluteValue.cpp +++ b/tests/validation/NEON/ElementwiseAbsoluteValue.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -50,26 +50,6 @@ RelativeTolerance tolerance_fp16(0.01f); TEST_SUITE(NEON) TEST_SUITE(AbsLayer) - -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), shape, data_type) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEAbsLayer neg_layer; - neg_layer.configure(&src, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(src.info()->valid_region(), valid_region); - validate(dst.info()->valid_region(), valid_region); -} - template using NEAbsLayerFixture = AbsValidationFixture; diff --git a/tests/validation/NEON/ElementwiseDivision.cpp b/tests/validation/NEON/ElementwiseDivision.cpp index f6e0a65e84..db34af1c05 100644 --- a/tests/validation/NEON/ElementwiseDivision.cpp +++ b/tests/validation/NEON/ElementwiseDivision.cpp @@ -44,6 +44,9 @@ namespace { RelativeTolerance tolerance_fp32(0.000001f); /** Input data sets **/ +const auto ElementwiseDivisionS32Dataset = combine(combine(framework::dataset::make("DataType", DataType::S32), + framework::dataset::make("DataType", DataType::S32)), + framework::dataset::make("DataType", DataType::S32)); #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC RelativeTolerance tolerance_fp16(static_cast(0.01f)); const auto ElementwiseDivisionFP16Dataset = combine(combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F16)), @@ -100,23 +103,6 @@ TEST_SUITE_END() // F16 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ TEST_SUITE(F32) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::concat(datasets::SmallShapes(), datasets::LargeShapes()), - shape) -{ - // Create tensors - Tensor ref_src1 = create_tensor(shape, DataType::F32); - Tensor ref_src2 = create_tensor(shape, DataType::F32); - Tensor dst = create_tensor(shape, DataType::F32); - - // Create and Configure function - NEElementwiseDivision add; - add.configure(&ref_src1, &ref_src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); -} - FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseDivisionFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseDivisionFP32Dataset)) { // Validate output @@ -135,6 +121,16 @@ FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEElementwiseDivisionBroadcastFixture< TEST_SUITE_END() // F32 TEST_SUITE_END() // Float +TEST_SUITE(Integer) +TEST_SUITE(S32) +FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseDivisionFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseDivisionS32Dataset)) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // S32 +TEST_SUITE_END() // Integer + TEST_SUITE_END() // ElementwiseDivision TEST_SUITE_END() // NEON } // namespace validation diff --git a/tests/validation/NEON/ElementwiseExpLayer.cpp b/tests/validation/NEON/ElementwiseExpLayer.cpp index 5b6f33ef96..3168b9ffe2 100644 --- a/tests/validation/NEON/ElementwiseExpLayer.cpp +++ b/tests/validation/NEON/ElementwiseExpLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. 
+ * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -50,25 +50,6 @@ RelativeTolerance tolerance_fp16(0.01f); TEST_SUITE(NEON) TEST_SUITE(ExpLayer) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), shape, data_type) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEExpLayer exp_layer; - exp_layer.configure(&src, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(src.info()->valid_region(), valid_region); - validate(dst.info()->valid_region(), valid_region); -} - template using NEExpLayerFixture = ExpValidationFixture; diff --git a/tests/validation/NEON/ElementwiseLog.cpp b/tests/validation/NEON/ElementwiseLog.cpp index 4c5a35d4ac..81e6dc87e8 100644 --- a/tests/validation/NEON/ElementwiseLog.cpp +++ b/tests/validation/NEON/ElementwiseLog.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -50,25 +50,6 @@ RelativeTolerance tolerance_fp16(0.01f); TEST_SUITE(NEON) TEST_SUITE(LogLayer) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), shape, data_type) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NELogLayer log_layer; - log_layer.configure(&src, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(src.info()->valid_region(), valid_region); - validate(dst.info()->valid_region(), valid_region); -} - template using NELogLayerFixture = LogValidationFixture; diff --git a/tests/validation/NEON/ElementwiseNegation.cpp b/tests/validation/NEON/ElementwiseNegation.cpp index e121b13583..ae7dca1ef0 100644 --- a/tests/validation/NEON/ElementwiseNegation.cpp +++ b/tests/validation/NEON/ElementwiseNegation.cpp @@ -50,25 +50,6 @@ RelativeTolerance tolerance_fp16(0.01f); TEST_SUITE(NEON) TEST_SUITE(NegLayer) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), shape, data_type) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NENegLayer neg_layer; - neg_layer.configure(&src, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(src.info()->valid_region(), valid_region); - validate(dst.info()->valid_region(), valid_region); -} - template using NENegLayerFixture = NegValidationInPlaceFixture; diff --git a/tests/validation/NEON/ElementwisePower.cpp b/tests/validation/NEON/ElementwisePower.cpp index bdca861c0a..beef1c874b 100644 --- 
a/tests/validation/NEON/ElementwisePower.cpp +++ b/tests/validation/NEON/ElementwisePower.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -100,22 +100,6 @@ TEST_SUITE_END() // F16 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ TEST_SUITE(F32) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::concat(datasets::SmallShapes(), datasets::LargeShapes()), - shape) -{ - // Create tensors - Tensor ref_src1 = create_tensor(shape, DataType::F32); - Tensor ref_src2 = create_tensor(shape, DataType::F32); - Tensor dst = create_tensor(shape, DataType::F32); - - // Create and Configure function - NEElementwisePower power; - power.configure(&ref_src1, &ref_src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); -} FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwisePowerFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwisePowerFP32Dataset)) { diff --git a/tests/validation/NEON/ElementwiseRound.cpp b/tests/validation/NEON/ElementwiseRound.cpp index fc194342fe..e0f24128f6 100644 --- a/tests/validation/NEON/ElementwiseRound.cpp +++ b/tests/validation/NEON/ElementwiseRound.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -43,25 +43,6 @@ namespace validation TEST_SUITE(NEON) TEST_SUITE(RoundLayer) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), shape, data_type) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NERoundLayer round_layer; - round_layer.configure(&src, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(src.info()->valid_region(), valid_region); - validate(dst.info()->valid_region(), valid_region); -} - template using NERoundLayerFixture = RoundValidationFixture; diff --git a/tests/validation/NEON/ElementwiseSin.cpp b/tests/validation/NEON/ElementwiseSin.cpp index 2e93ce3ee9..a2e6cb3760 100644 --- a/tests/validation/NEON/ElementwiseSin.cpp +++ b/tests/validation/NEON/ElementwiseSin.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -50,25 +50,6 @@ AbsoluteTolerance tolerance_fp16(0.0005f); TEST_SUITE(NEON) TEST_SUITE(SinLayer) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), shape, data_type) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NESinLayer sin_layer; - sin_layer.configure(&src, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(src.info()->valid_region(), valid_region); - validate(dst.info()->valid_region(), valid_region); -} - template using NESinLayerFixture = SinValidationFixture; diff --git a/tests/validation/NEON/ElementwiseSquareDiff.cpp b/tests/validation/NEON/ElementwiseSquareDiff.cpp index e81edf77dc..b50db3d4a3 100644 --- a/tests/validation/NEON/ElementwiseSquareDiff.cpp +++ b/tests/validation/NEON/ElementwiseSquareDiff.cpp @@ -109,23 +109,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( // *INDENT-ON* TEST_SUITE(S32) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(), - shape) -{ - // Create tensors - Tensor ref_src1 = create_tensor(shape, DataType::S32); - Tensor ref_src2 = create_tensor(shape, DataType::S32); - Tensor dst = create_tensor(shape, DataType::S32); - - // Create and Configure function - NEElementwiseSquaredDiff add; - add.configure(&ref_src1, &ref_src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); -} - FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), ElementwiseSquaredDiffS32Dataset)) { // Validate output @@ -134,23 +117,6 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture, frame TEST_SUITE_END() // S32 TEST_SUITE(S16) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::S16 })), - shape, data_type) -{ - // Create tensors - Tensor ref_src1 = create_tensor(shape, data_type); - Tensor ref_src2 = create_tensor(shape, DataType::S16); - Tensor dst = create_tensor(shape, DataType::S16); - - // Create and Configure function - NEElementwiseSquaredDiff add; - add.configure(&ref_src1, &ref_src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); -} - FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseSquaredDiffS16Dataset)) { // Validate output @@ -163,23 +129,6 @@ using NEElementwiseSquaredDiffQuantizedFixture = ElementwiseSquaredDiffValidatio TEST_SUITE(Quantized) TEST_SUITE(QASYMM8) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(), - shape) -{ - // Create tensors - Tensor ref_src1 = create_tensor(shape, DataType::QASYMM8); - Tensor ref_src2 = create_tensor(shape, DataType::QASYMM8); - Tensor dst = create_tensor(shape, DataType::QASYMM8); - - // Create and Configure function - NEElementwiseMin add; - add.configure(&ref_src1, &ref_src2, &dst); - - // Validate 
valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); -} - FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffQuantizedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(), ElementwiseSquaredDiffQASYMM8Dataset), framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })), @@ -231,23 +180,6 @@ TEST_SUITE_END() // F16 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ TEST_SUITE(F32) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, datasets::SmallShapes(), - shape) -{ - // Create tensors - Tensor ref_src1 = create_tensor(shape, DataType::F32); - Tensor ref_src2 = create_tensor(shape, DataType::F32); - Tensor dst = create_tensor(shape, DataType::F32); - - // Create and Configure function - NEElementwiseSquaredDiff add; - add.configure(&ref_src1, &ref_src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); -} - FIXTURE_DATA_TEST_CASE(RunSmall, NEElementwiseSquaredDiffFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), ElementwiseSquaredDiffFP32Dataset)) { // Validate output diff --git a/tests/validation/NEON/EqualizeHistogram.cpp b/tests/validation/NEON/EqualizeHistogram.cpp index e1d3986930..b844c3a1df 100644 --- a/tests/validation/NEON/EqualizeHistogram.cpp +++ b/tests/validation/NEON/EqualizeHistogram.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -38,31 +38,6 @@ namespace validation { TEST_SUITE(NEON) TEST_SUITE(EqualizeHistogram) - -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), framework::dataset::make("DataType", DataType::U8)), shape, data_type) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEEqualizeHistogram equalize_histogram; - equalize_histogram.configure(&src, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(src.info()->valid_region(), valid_region); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - validate(src.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - template using NEEqualizeHistogramFixture = EqualizeHistogramValidationFixture; diff --git a/tests/validation/NEON/Erode.cpp b/tests/validation/NEON/Erode.cpp index ff9c9270c5..67fb0fb1f7 100644 --- a/tests/validation/NEON/Erode.cpp +++ b/tests/validation/NEON/Erode.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -50,41 +50,6 @@ constexpr BorderSize border_size(filter_size / 2); /* Border size of the kerne TEST_SUITE(NEON) TEST_SUITE(Erode) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), - datasets::BorderModes()), - shape, data_type, border_mode) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEErode erode; - erode.configure(&src, &dst, border_mode); - - // Validate valid region - const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), border_size); - validate(dst.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 8); - calculator.set_border_size(1); - calculator.set_border_mode(border_mode); - - const PaddingSize dst_padding = calculator.required_padding(); - - calculator.set_accessed_elements(16); - calculator.set_access_offset(-1); - - const PaddingSize src_padding = calculator.required_padding(); - - validate(src.info()->padding(), src_padding); - validate(dst.info()->padding(), dst_padding); -} - template using NEErodeFixture = ErodeValidationFixture; diff --git a/tests/validation/NEON/FFT.cpp b/tests/validation/NEON/FFT.cpp index 7f1c7c52b4..bc528dd9a6 100644 --- a/tests/validation/NEON/FFT.cpp +++ b/tests/validation/NEON/FFT.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -72,30 +72,6 @@ constexpr float tolerance_num = 0.07f; /**< Tolerance number */ TEST_SUITE(NEON) TEST_SUITE(FFT1D) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(shapes_1d, data_types), - shape, data_type) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type, 2); - Tensor dst = create_tensor(shape, data_type, 2); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEFFT1D fft1d; - fft1d.configure(&src, &dst, FFT1DInfo()); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(src.info()->valid_region(), valid_region); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - validate(src.info()->padding(), PaddingSize()); - validate(dst.info()->padding(), PaddingSize()); -} - // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( @@ -140,31 +116,6 @@ TEST_SUITE_END() // Float TEST_SUITE_END() // FFT1D TEST_SUITE(FFT2D) - -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(shapes_2d, data_types), - shape, data_type) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type, 2); - Tensor dst = create_tensor(shape, data_type, 2); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEFFT2D fft2d; - fft2d.configure(&src, &dst, FFT2DInfo()); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - 
validate(src.info()->valid_region(), valid_region); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - validate(src.info()->padding(), PaddingSize()); - validate(dst.info()->padding(), PaddingSize()); -} - // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip( diff --git a/tests/validation/NEON/FastCorners.cpp b/tests/validation/NEON/FastCorners.cpp index 389aa604ca..a7e0411f28 100644 --- a/tests/validation/NEON/FastCorners.cpp +++ b/tests/validation/NEON/FastCorners.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -44,8 +44,6 @@ namespace validation { namespace { -/* Radius of the Bresenham circle around the candidate point */ -const unsigned int bresenham_radius = 3; /* Tolerance used to compare corner strengths */ const AbsoluteTolerance tolerance(0.5f); } // namespace @@ -53,41 +51,6 @@ const AbsoluteTolerance tolerance(0.5f); TEST_SUITE(NEON) TEST_SUITE(FastCorners) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(concat(datasets::Small2DShapes(), datasets::Large2DShapes()), - framework::dataset::make("Format", Format::U8)), - framework::dataset::make("SuppressNonMax", { false, true })), - framework::dataset::make("BorderMode", BorderMode::UNDEFINED)), - shape, format, suppress_nonmax, border_mode) -{ - std::mt19937 gen(library->seed()); - std::uniform_int_distribution int_dist(0, 255); - std::uniform_real_distribution real_dist(0, 255); - - const uint8_t constant_border_value = int_dist(gen); - const float threshold = real_dist(gen); - - // Create tensors - Tensor src = create_tensor(shape, data_type_from_format(format)); - src.info()->set_format(format); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - - KeyPointArray corners; - - // Create and configure function - NEFastCorners fast_corners; - fast_corners.configure(&src, threshold, suppress_nonmax, &corners, border_mode, constant_border_value); - - // Validate padding - PaddingCalculator calculator(shape.x(), 1); // elems_processed - - calculator.set_border_size(bresenham_radius); - calculator.set_access_offset(-bresenham_radius); - calculator.set_accessed_elements(8); // elems_read - - validate(src.info()->padding(), calculator.required_padding()); -} - template using NEFastCornersFixture = FastCornersValidationFixture; diff --git a/tests/validation/NEON/FillBorder.cpp b/tests/validation/NEON/FillBorder.cpp index b567b3f9b6..343ad831e4 100644 --- a/tests/validation/NEON/FillBorder.cpp +++ b/tests/validation/NEON/FillBorder.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" #include "tests/Globals.h" #include "tests/NEON/Accessor.h" #include "tests/datasets/BorderModeDataset.h" diff --git a/tests/validation/NEON/GEMM.cpp b/tests/validation/NEON/GEMM.cpp index f817390780..2d8c61164b 100644 --- a/tests/validation/NEON/GEMM.cpp +++ b/tests/validation/NEON/GEMM.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -21,12 +21,13 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/functions/NEGEMM.h" #include "arm_compute/runtime/Tensor.h" #include "arm_compute/runtime/TensorAllocator.h" +#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" +#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" +#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" #include "tests/NEON/Accessor.h" #include "tests/NEON/Helper.h" #include "tests/PaddingCalculator.h" @@ -67,35 +68,123 @@ const auto CNNDataTypes = framework::dataset::make("DataType", const auto data_interleave = framework::dataset::make("M", 8, 12) * framework::dataset::make("N", 8, 12); const auto data_transpose = framework::dataset::make("M", 8, 14) * framework::dataset::make("N", 7, 14); +/** Zero padding test */ +template +bool validate_zero_padding(unsigned int dim0_value, unsigned int dim1_value) +{ + const TensorShape in_shape(dim0_value, dim1_value); + + // Create tensors + Tensor in = create_tensor(in_shape, DataType::U32); + Tensor dst; + + ARM_COMPUTE_EXPECT(in.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Validate zero-padding + FunctionType func; + + func.configure(&in, &dst); + + return in.info()->padding().empty(); +} + +/* Zero padding test for GEMM kernels */ +bool validate_gemm_zero_padding(const TensorShape shape0, const TensorShape shape1) +{ + // Create tensors + Tensor in0 = create_tensor(shape0, DataType::F32); + Tensor in1 = create_tensor(shape1, DataType::F32); + Tensor dst; + + // Validate zero-padding + NEGEMMMatrixMultiplyKernel gemm; + gemm.configure(&in0, &in1, &dst, 1.0, false); + + return in0.info()->padding().empty() && in1.info()->padding().empty() && dst.info()->padding().empty(); +} } // namespace TEST_SUITE(NEON) TEST_SUITE(GEMM) TEST_SUITE(TRANSPOSE_1XW) -using NEGEMMTranspose1xW = NESynthetizeFunctionWithZeroConstantBorder; -using NEGEMMTranspose1xWFixture = GEMMTranspose1xWValidationFixture; -TEST_SUITE(FP32) -FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::F32)) +using NEGEMMTranspose1xW = NESynthetizeFunctionWithZeroConstantBorder; +DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip( + framework::dataset::make("N", { 1, 23, 63, 101 }), + framework::dataset::make("K", { 1, 47, 29, 27 })), + n_value, k_value) +{ + bool status = validate_zero_padding(n_value, k_value); + ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS); +} + +TEST_SUITE(U32) +using NEGEMMTranspose1xWFixture = GEMMTranspose1xWValidationFixture; +FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::U32)) { // Validate output validate(Accessor(_target), _reference); } -TEST_SUITE_END() // FP32 +TEST_SUITE_END() // U32 + +TEST_SUITE(U16) +using NEGEMMTranspose1xWFixture = GEMMTranspose1xWValidationFixture; +FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::U16)) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // U16 + +TEST_SUITE(U8) +using 
NEGEMMTranspose1xWFixture = GEMMTranspose1xWValidationFixture; +FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::U8)) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // U8 TEST_SUITE_END() // TRANSPOSE_1XW TEST_SUITE(INTERLEAVE_4X4) using NEGEMMInterleave4x4 = NESynthetizeFunctionWithZeroConstantBorder; -TEST_SUITE(FP32) -using NEGEMMInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture; -FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::F32)) +DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip( + framework::dataset::make("M", { 1, 23, 63, 101 }), + framework::dataset::make("K", { 1, 47, 29, 27 })), + m_value, k_value) +{ + bool status = validate_zero_padding(m_value, k_value); + ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS); +} + +TEST_SUITE(U32) +using NEGEMMInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture; +FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::U32)) { // Validate output validate(Accessor(_target), _reference); } -TEST_SUITE_END() // FP32 +TEST_SUITE_END() // U32 + +TEST_SUITE(U16) +using NEGEMMInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture; +FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::U16)) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // U16 + +TEST_SUITE(U8) +using NEGEMMInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture; +FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::QASYMM8)) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // U8 TEST_SUITE_END() // INTERLEAVE_4X4 @@ -108,6 +197,26 @@ template using NEGEMMFixtureDisabledC = GEMMValidationFixture; TEST_SUITE(Float) +DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(framework::dataset::make("In0", { TensorShape(21U, 13U), + TensorShape(31U, 1U), + TensorShape(31U, 1U), + TensorShape(8U, 2U), + TensorShape(38U, 12U), + TensorShape(32U, 1U) + }), + framework::dataset::make("In1", { TensorShape(33U, 21U), + TensorShape(23U, 31U), + TensorShape(23U, 31U), + TensorShape(16U, 8U), + TensorShape(21U, 38U), + TensorShape(17U, 32U) + })), + shape0, shape1) +{ + bool status = validate_gemm_zero_padding(shape0, shape1); + ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS); +} + #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_SUITE(FP16) FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMFixture, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallGEMMDataset(), diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp index 579499dd4e..04282c2c3c 100644 --- a/tests/validation/NEON/GEMMLowp.cpp +++ b/tests/validation/NEON/GEMMLowp.cpp @@ -22,7 +22,6 @@ * SOFTWARE. 
*/ #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" #include "arm_compute/runtime/Tensor.h" @@ -53,28 +52,6 @@ const auto data_matrix_multiply = framework::dataset::make("M", 12, 20) * framew } // namespace TEST_SUITE(NEON) -TEST_SUITE(ASSEMBLY_MATRIX_MULTIPLY) - -using NEGEMMAssemblyFixture_S8 = GEMMLowpAssemblyFixture; -using NEGEMMAssemblyFixture_U8 = GEMMLowpAssemblyFixture; - -TEST_SUITE(S8) -FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMAssemblyFixture_S8, framework::DatasetMode::PRECOMMIT, data_matrix_multiply) -{ - // Validate output - validate(Accessor(_target), _reference); -} -TEST_SUITE_END() - -TEST_SUITE(U8) -FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMAssemblyFixture_U8, framework::DatasetMode::PRECOMMIT, data_matrix_multiply) -{ - // Validate output - validate(Accessor(_target), _reference); -} -TEST_SUITE_END() -TEST_SUITE_END() - TEST_SUITE(GEMMLowp) TEST_SUITE(MatrixMultiplyCore) using NEGEMMLowpMatrixMultiplyCoreFixture = GEMMLowpMatrixMultiplyCoreValidationFixture; @@ -97,6 +74,11 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::c // Create and configure function NEGEMMLowpMatrixMultiplyCore gemmlowp_mm; gemmlowp_mm.configure(&a, &b, nullptr, &c); + + // Validate padding is zero + validate(a.info()->padding(), PaddingSize()); + validate(b.info()->padding(), PaddingSize()); + validate(c.info()->padding(), PaddingSize()); } // *INDENT-OFF* @@ -120,7 +102,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( TensorInfo(TensorShape(8U, 11U), 1, DataType::S32), TensorInfo(TensorShape(64U, 32U), 1, DataType::S32), })), - framework::dataset::make("Expected", { false, false, false, false, true })), + framework::dataset::make("Expected", { true, false, false, false, true })), a_info, b_info, output_info, expected) { // Lock tensors @@ -217,6 +199,27 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( // clang-format on // *INDENT-ON* +TEST_CASE(NoPaddingAdded, framework::DatasetMode::PRECOMMIT) +{ + Tensor input1 = create_tensor(TensorShape(21U, 13U), DataType::S32); + Tensor input2 = create_tensor(TensorShape(21U, 1U), DataType::S32); + Tensor output = create_tensor(TensorShape(21U, 13U), DataType::QASYMM8); + + GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo(); + output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN; + output_stage.gemmlowp_min_bound = 0; + output_stage.gemmlowp_max_bound = 205; + output_stage.output_data_type = DataType::QASYMM8; + + NEGEMMLowpOutputStage f; + f.configure(&input1, &input2, &output, output_stage); + + // Validate padding is zero + validate(input1.info()->padding(), PaddingSize()); + validate(input2.info()->padding(), PaddingSize()); + validate(output.info()->padding(), PaddingSize()); +} + FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_cases)) { // Validate output @@ -355,48 +358,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( // clang-format on // *INDENT-ON* -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), - quantize_down_int32_to_uint8_scale_by_fixedpoint_cases), - shape, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, 
min, max, add_bias) -{ - TensorShape shape_bias(shape[0]); - - // Create tensors - Tensor in = create_tensor(shape, DataType::S32); - Tensor bias = create_tensor(shape_bias, DataType::S32); - Tensor out = create_tensor(shape, DataType::QASYMM8); - - ARM_COMPUTE_EXPECT(in.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(out.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint output_stage; - output_stage.configure(&in, add_bias ? &bias : nullptr, &out, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); - - // Validate valid region input and output - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(in.info()->valid_region(), valid_region); - validate(out.info()->valid_region(), valid_region); - - // Validate valid region bias - if(add_bias) - { - const ValidRegion valid_region_bias = shape_to_valid_region(shape_bias); - validate(bias.info()->valid_region(), valid_region_bias); - } - - // Validate padding - const PaddingSize padding(0); - validate(in.info()->padding(), padding); - validate(out.info()->padding(), padding); - - if(add_bias) - { - validate(bias.info()->padding(), padding); - } -} - FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_by_fixedpoint_cases)) { @@ -479,47 +440,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( // clang-format on // *INDENT-ON* -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), - quantize_down_int32_to_int8_scale_by_fixedpoint_cases), - shape, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max, add_bias) -{ - TensorShape shape_bias(shape[0]); - - // Create tensors - Tensor in = create_tensor(shape, DataType::S32); - Tensor bias = create_tensor(shape_bias, DataType::S32); - Tensor out = create_tensor(shape, DataType::QASYMM8_SIGNED); - - ARM_COMPUTE_EXPECT(in.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(out.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint output_stage; - output_stage.configure(&in, add_bias ? 
&bias : nullptr, &out, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max); - - // Validate valid region input and output - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(in.info()->valid_region(), valid_region); - validate(out.info()->valid_region(), valid_region); - - // Validate valid region bias - if(add_bias) - { - const ValidRegion valid_region_bias = shape_to_valid_region(shape_bias); - validate(bias.info()->valid_region(), valid_region_bias); - } - - // Validate padding - const PaddingSize padding(0); - validate(in.info()->padding(), padding); - validate(out.info()->padding(), padding); - - if(add_bias) - { - validate(bias.info()->padding(), padding); - } -} FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_int8_scale_by_fixedpoint_cases)) { @@ -593,47 +513,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( // clang-format on // *INDENT-ON* -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), - quantize_down_int32_to_int16_scale_by_fixedpoint_cases), - shape, result_fixedpoint_multiplier, result_shift, min, max, add_bias) -{ - TensorShape shape_bias(shape[0]); - - // Create tensors - Tensor in = create_tensor(shape, DataType::S32); - Tensor bias = create_tensor(shape_bias, DataType::S32); - Tensor out = create_tensor(shape, DataType::QSYMM16); - - ARM_COMPUTE_EXPECT(in.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(out.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint output_stage; - output_stage.configure(&in, add_bias ? &bias : nullptr, &out, result_fixedpoint_multiplier, result_shift, min, max); - - // Validate valid region input and output - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(in.info()->valid_region(), valid_region); - validate(out.info()->valid_region(), valid_region); - - // Validate valid region bias - if(add_bias) - { - const ValidRegion valid_region_bias = shape_to_valid_region(shape_bias); - validate(bias.info()->valid_region(), valid_region_bias); - } - - // Validate padding - const PaddingSize padding(0); - validate(in.info()->padding(), padding); - validate(out.info()->padding(), padding); - - if(add_bias) - { - validate(bias.info()->padding(), padding); - } -} TEST_SUITE(NoRelu) TEST_SUITE(MultSmallerEq1) FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), diff --git a/tests/validation/NEON/Gather.cpp b/tests/validation/NEON/Gather.cpp index af534ba772..d2d5df802a 100644 --- a/tests/validation/NEON/Gather.cpp +++ b/tests/validation/NEON/Gather.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -97,26 +97,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( // clang-format on // *INDENT-ON* -DATA_TEST_CASE(Configuration, - framework::DatasetMode::ALL, - combine(arm_compute::test::datasets::SmallGatherDataset(), framework::dataset::make("DataType", { DataType::F32 })), - input_shape, indices_shape, axis, data_type) -{ - const uint32_t actual_axis = wrap_around(axis, static_cast(input_shape.num_dimensions())); - Tensor src = create_tensor(input_shape, data_type); - Tensor indices = create_tensor(indices_shape, DataType::U32); - TensorShape dst_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input_shape, indices_shape, actual_axis); - Tensor dst = create_tensor(dst_shape, data_type); - - // Create and Configure function - NEGather gather; - gather.configure(&src, &indices, &dst, axis); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(dst.info()->tensor_shape()); - validate(dst.info()->valid_region(), valid_region); -} - template using NEGatherFixture = GatherFixture; diff --git a/tests/validation/NEON/Gaussian3x3.cpp b/tests/validation/NEON/Gaussian3x3.cpp index 7396be7845..bcd9e0259a 100644 --- a/tests/validation/NEON/Gaussian3x3.cpp +++ b/tests/validation/NEON/Gaussian3x3.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -50,41 +50,6 @@ constexpr BorderSize border_size(filter_size / 2); /** Border size of the kern TEST_SUITE(NEON) TEST_SUITE(Gaussian3x3) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), - datasets::BorderModes()), - shape, data_type, border_mode) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEGaussian3x3 gaussian3x3; - gaussian3x3.configure(&src, &dst, border_mode); - - // Validate valid region - const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), border_size); - validate(dst.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 8); - calculator.set_border_size(1); - calculator.set_border_mode(border_mode); - - const PaddingSize dst_padding = calculator.required_padding(); - - calculator.set_accessed_elements(16); - calculator.set_access_offset(-1); - - const PaddingSize src_padding = calculator.required_padding(); - - validate(src.info()->padding(), src_padding); - validate(dst.info()->padding(), dst_padding); -} - template using NEGaussian3x3Fixture = Gaussian3x3ValidationFixture; diff --git a/tests/validation/NEON/Gaussian5x5.cpp b/tests/validation/NEON/Gaussian5x5.cpp index 6c4c480ee9..9b5ae401b0 100644 --- a/tests/validation/NEON/Gaussian5x5.cpp +++ b/tests/validation/NEON/Gaussian5x5.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -50,41 +50,6 @@ constexpr BorderSize border_size(filter_size / 2); /** Border size of the kern TEST_SUITE(NEON) TEST_SUITE(Gaussian5x5) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), - datasets::BorderModes()), - shape, data_type, border_mode) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEGaussian5x5 gaussian5x5; - gaussian5x5.configure(&src, &dst, border_mode); - - // Validate valid region - const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), border_size); - validate(dst.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 16); - calculator.set_border_size(2); - calculator.set_border_mode(border_mode); - - const PaddingSize dst_padding = calculator.required_padding(); - - calculator.set_processed_elements(8); - calculator.set_access_offset(-2); - - const PaddingSize src_padding = calculator.required_padding(); - - validate(src.info()->padding(), src_padding); - validate(dst.info()->padding(), dst_padding); -} - template using NEGaussian5x5Fixture = Gaussian5x5ValidationFixture; diff --git a/tests/validation/NEON/GaussianPyramid.cpp b/tests/validation/NEON/GaussianPyramid.cpp index ed8e43cea9..a6e6f43f8a 100644 --- a/tests/validation/NEON/GaussianPyramid.cpp +++ b/tests/validation/NEON/GaussianPyramid.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -20,7 +20,7 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. -*/ + */ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h" #include "arm_compute/runtime/Tensor.h" @@ -69,27 +69,6 @@ TEST_SUITE(NEON) TEST_SUITE(GaussianPyramid) TEST_SUITE(Half) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, large_gaussian_pyramid_levels, - shape, border_mode, num_levels) -{ - Tensor src = create_tensor(shape, DataType::U8); - - // Create pyramid - PyramidInfo pyramid_info(num_levels, SCALE_PYRAMID_HALF, shape, Format::U8); - Pyramid dst; - dst.init(pyramid_info); - - NEGaussianPyramidHalf gaussian_pyramid_half; - gaussian_pyramid_half.configure(&src, &dst, border_mode, 0); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - - for(size_t level = 0; level < pyramid_info.num_levels(); ++level) - { - ARM_COMPUTE_EXPECT(dst.get_pyramid_level(level)->info()->is_resizable(), framework::LogLevel::ERRORS); - } -} - template using NEGaussianPyramidHalfFixture = GaussianPyramidHalfValidationFixture; diff --git a/tests/validation/NEON/HarrisCorners.cpp b/tests/validation/NEON/HarrisCorners.cpp index e4c0827bb8..d88cad6564 100644 --- a/tests/validation/NEON/HarrisCorners.cpp +++ b/tests/validation/NEON/HarrisCorners.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -56,47 +56,6 @@ const auto data = combine(framework::dataset::make("GradientSize", { 3, 5, 7 }), TEST_SUITE(NEON) TEST_SUITE(HarrisCorners) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::Small2DShapes(), data), framework::dataset::make("Format", Format::U8)), shape, - gradient_size, block_size, border_mode, format) -{ - std::mt19937 gen(library->seed()); - std::uniform_real_distribution real_dist(0.f, 0.01f); - - const float threshold = real_dist(gen); - const float sensitivity = real_dist(gen); - - constexpr float max_euclidean_distance = 30.f; - real_dist = std::uniform_real_distribution(0.f, max_euclidean_distance); - const float min_dist = real_dist(gen); - - // Generate a random constant value - std::uniform_int_distribution int_dist(0, 255); - const uint8_t constant_border_value = int_dist(gen); - - // Create tensors - Tensor src = create_tensor(shape, data_type_from_format(format)); - src.info()->set_format(format); - KeyPointArray corners; - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create harris corners configure function - NEHarrisCorners harris_corners; - harris_corners.configure(&src, threshold, min_dist, sensitivity, gradient_size, block_size, &corners, border_mode, constant_border_value); - - // Validate padding - PaddingCalculator calculator(shape.x(), 8); - - calculator.set_border_mode(border_mode); - calculator.set_border_size(gradient_size / 2); - calculator.set_access_offset(-gradient_size / 2); - calculator.set_accessed_elements(16); - - const PaddingSize padding = calculator.required_padding(); - - validate(src.info()->padding(), padding); -} - template using NEHarrisCornersFixture = HarrisCornersValidationFixture; diff --git a/tests/validation/NEON/Histogram.cpp b/tests/validation/NEON/Histogram.cpp index 03b2e2b705..cd113857f3 100644 --- a/tests/validation/NEON/Histogram.cpp +++ b/tests/validation/NEON/Histogram.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -45,44 +45,6 @@ namespace validation TEST_SUITE(NEON) TEST_SUITE(Histogram) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), - framework::dataset::make("DataType", DataType::U8)), - shape, data_type) -{ - // Setup Distribution - std::mt19937 gen(library->seed()); - std::uniform_int_distribution distribution_size_t(1, 30); - const size_t num_bins = distribution_size_t(gen); - std::uniform_int_distribution distribution_int32_t(0, 125); - const size_t offset = distribution_int32_t(gen); - std::uniform_int_distribution distribution_uint32_t(1, 255 - offset); - const size_t range = distribution_uint32_t(gen); - Distribution1D distribution_dst(num_bins, offset, range); - - // Create tensors - Tensor src = create_tensor(shape, data_type); - TensorShape dst_shape(num_bins); - Tensor dst = create_tensor(dst_shape, DataType::U32); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEHistogram histogram; - histogram.configure(&src, &distribution_dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(src.info()->valid_region(), valid_region); - const ValidRegion valid_region_dst = shape_to_valid_region(dst_shape); - validate(dst.info()->valid_region(), valid_region_dst); - - // Validate padding - const PaddingSize padding; - validate(src.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - template using NEHistogramFixture = HistogramValidationFixture; diff --git a/tests/validation/NEON/IntegralImage.cpp b/tests/validation/NEON/IntegralImage.cpp index 2a8aa956b7..14e7df7152 100644 --- a/tests/validation/NEON/IntegralImage.cpp +++ b/tests/validation/NEON/IntegralImage.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
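The Histogram Configuration test deleted above drew random num_bins/offset/range parameters for a Distribution1D covering [offset, offset + range). As a hedged sketch of the conventional binning rule such a distribution implements (bin_index is a hypothetical helper, not the library's Distribution1D API):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Map a value v in [offset, offset + range) to one of num_bins bins.
static std::size_t bin_index(std::uint8_t v, std::size_t num_bins, std::int32_t offset, std::uint32_t range)
{
    return static_cast<std::size_t>(v - offset) * num_bins / range;
}

int main()
{
    const std::size_t   num_bins = 10;
    const std::int32_t  offset   = 50;
    const std::uint32_t range    = 200; // covers values 50..249
    std::vector<std::uint32_t> hist(num_bins, 0);
    for (int v : { 50, 149, 249 })
    {
        ++hist[bin_index(static_cast<std::uint8_t>(v), num_bins, offset, range)];
    }
    std::cout << hist[0] << ' ' << hist[4] << ' ' << hist[9] << '\n'; // 1 1 1
    return 0;
}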
* * SPDX-License-Identifier: MIT * @@ -38,31 +38,6 @@ namespace validation TEST_SUITE(NEON) TEST_SUITE(IntegralImage) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), shape, data_type) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, DataType::U32); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEIntegralImage integral_image; - integral_image.configure(&src, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize src_padding = PaddingCalculator(shape.x(), 16).required_padding(); - const PaddingSize dst_padding(1, src_padding.right, 0, 1); - - validate(src.info()->padding(), src_padding); - validate(dst.info()->padding(), dst_padding); -} - template using NEIntegralImageFixture = IntegralImageValidationFixture; diff --git a/tests/validation/NEON/L2NormalizeLayer.cpp b/tests/validation/NEON/L2NormalizeLayer.cpp index 37146f4015..82e4beb05a 100644 --- a/tests/validation/NEON/L2NormalizeLayer.cpp +++ b/tests/validation/NEON/L2NormalizeLayer.cpp @@ -100,7 +100,7 @@ TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunSmall, NEL2NormalizeLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), framework::dataset::make("Axis", { -1, 0, 1, 2 })), - framework::dataset::make("Epsilon", { 1e-12 }))) + framework::dataset::make("Epsilon", { 1e-6 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); @@ -109,7 +109,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEL2NormalizeLayerFixture, framework::Da FIXTURE_DATA_TEST_CASE(RunLarge, NEL2NormalizeLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), framework::dataset::make("Axis", { -1, 0, 2 })), - framework::dataset::make("Epsilon", { 1e-12 }))) + framework::dataset::make("Epsilon", { 1e-6 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); @@ -121,7 +121,7 @@ TEST_SUITE(FP16) FIXTURE_DATA_TEST_CASE(RunSmall, NEL2NormalizeLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), framework::dataset::make("Axis", { -1, 0, 1, 2 })), - framework::dataset::make("Epsilon", { 1e-12 }))) + framework::dataset::make("Epsilon", { 1e-6 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f16); @@ -130,7 +130,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEL2NormalizeLayerFixture, framework::Dat FIXTURE_DATA_TEST_CASE(RunLarge, NEL2NormalizeLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), framework::dataset::make("Axis", { -1, 0, 2 })), - 
framework::dataset::make("Epsilon", { 1e-12 }))) + framework::dataset::make("Epsilon", { 1e-6 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f16); diff --git a/tests/validation/NEON/LaplacianPyramid.cpp b/tests/validation/NEON/LaplacianPyramid.cpp index 0c03c70445..5ddd0e750f 100644 --- a/tests/validation/NEON/LaplacianPyramid.cpp +++ b/tests/validation/NEON/LaplacianPyramid.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -20,7 +20,7 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. -*/ + */ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/functions/NELaplacianPyramid.h" #include "arm_compute/runtime/Tensor.h" @@ -77,39 +77,6 @@ TEST_SUITE(LaplacianPyramid) // *INDENT-OFF* // clang-format off -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine( - concat(datasets::Medium2DShapes(), datasets::Large2DShapes()), - datasets::BorderModes()), - large_laplacian_pyramid_levels), - shape, border_mode, num_levels) -{ - // Create pyramid info - PyramidInfo pyramid_info(num_levels, SCALE_PYRAMID_HALF, shape, Format::S16); - Pyramid dst_pyramid{}; - dst_pyramid.init(pyramid_info); - - // Create Tensors - Tensor src = create_tensor(shape, Format::U8); - - // The first two dimensions of the output tensor must match the first two - // dimensions of the tensor in the last level of the pyramid - TensorShape dst_shape(shape); - dst_shape.set(0, dst_pyramid.get_pyramid_level(num_levels - 1)->info()->dimension(0)); - dst_shape.set(1, dst_pyramid.get_pyramid_level(num_levels - 1)->info()->dimension(1)); - Tensor dst = create_tensor(dst_shape, Format::S16); - - // Create and configure function - NELaplacianPyramid laplacian_pyramid; - laplacian_pyramid.configure(&src, &dst_pyramid, &dst, border_mode, 0); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - for(size_t level = 0; level < pyramid_info.num_levels(); ++level) - { - ARM_COMPUTE_EXPECT(dst_pyramid.get_pyramid_level(level)->info()->is_resizable(), framework::LogLevel::ERRORS); - } -} using NELaplacianPyramidFixture = LaplacianPyramidValidationFixture; diff --git a/tests/validation/NEON/LaplacianReconstruct.cpp b/tests/validation/NEON/LaplacianReconstruct.cpp index bc1151f700..e407ea0a22 100644 --- a/tests/validation/NEON/LaplacianReconstruct.cpp +++ b/tests/validation/NEON/LaplacianReconstruct.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -20,7 +20,7 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
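The L2NormalizeLayer hunks above raise the test epsilon from 1e-12 to 1e-6. Epsilon guards the denominator of the normalization, and 1e-12 sits far below the smallest normal FP16 value (about 6.1e-5), so it could never take effect in half-precision runs. A minimal sketch, assuming the common formulation x / sqrt(max(sum(x^2), epsilon)):

#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

// L2-normalize in place along the whole vector.
static void l2_normalize(std::vector<float> &x, float epsilon)
{
    float sq_sum = 0.f;
    for (float v : x)
    {
        sq_sum += v * v;
    }
    const float norm = std::sqrt(std::max(sq_sum, epsilon));
    for (float &v : x)
    {
        v /= norm;
    }
}

int main()
{
    std::vector<float> x{ 3.f, 4.f };
    l2_normalize(x, 1e-6f);
    std::cout << x[0] << ' ' << x[1] << '\n'; // 0.6 0.8
    return 0;
}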
-*/ + */ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/functions/NELaplacianPyramid.h" #include "arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h" @@ -68,48 +68,6 @@ TEST_SUITE(LaplacianReconstruct) // *INDENT-OFF* // clang-format off -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine( - concat(datasets::Medium2DShapes(), datasets::Large2DShapes()), - datasets::BorderModes()), - large_laplacian_reconstruct_levels), - shape, border_mode, num_levels) -{ - // Create pyramid info - PyramidInfo pyramid_info(num_levels, SCALE_PYRAMID_HALF, shape, Format::S16); - Pyramid dst_pyramid{}; - dst_pyramid.init(pyramid_info); - - // Create Tensors - Tensor src = create_tensor(shape, DataType::U8); - - // The first two dimensions of the output tensor must match the first two - // dimensions of the tensor in the last level of the pyramid - TensorShape dst_shape(shape); - dst_shape.set(0, dst_pyramid.get_pyramid_level(num_levels - 1)->info()->dimension(0)); - dst_shape.set(1, dst_pyramid.get_pyramid_level(num_levels - 1)->info()->dimension(1)); - Tensor dst = create_tensor(dst_shape, DataType::S16); - - // The dimensions of the reconstruct are the same as the src shape - Tensor rec_dst = create_tensor(shape, DataType::U8); - - // Create and configure pyramid function - NELaplacianPyramid laplacian_pyramid; - laplacian_pyramid.configure(&src, &dst_pyramid, &dst, border_mode, 0); - - // Create and configure reconstruct function - NELaplacianReconstruct laplacian_reconstruct; - laplacian_reconstruct.configure(&dst_pyramid, &dst, &rec_dst, border_mode, 0); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - for(size_t level = 0; level < pyramid_info.num_levels(); ++level) - { - ARM_COMPUTE_EXPECT(dst_pyramid.get_pyramid_level(level)->info()->is_resizable(), framework::LogLevel::ERRORS); - } - - ARM_COMPUTE_EXPECT(rec_dst.info()->is_resizable(), framework::LogLevel::ERRORS); -} using NELaplacianReconstructFixture = LaplacianReconstructValidationFixture; diff --git a/tests/validation/NEON/LocallyConnected.cpp b/tests/validation/NEON/LocallyConnected.cpp deleted file mode 100644 index 37c77520a0..0000000000 --- a/tests/validation/NEON/LocallyConnected.cpp +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright (c) 2017-2018 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h" -#include "arm_compute/runtime/Tensor.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "tests/NEON/Accessor.h" -#include "tests/PaddingCalculator.h" -#include "tests/datasets/LocallyConnectedDataset.h" -#include "tests/framework/Asserts.h" -#include "tests/framework/Macros.h" -#include "tests/framework/datasets/Datasets.h" -#include "tests/validation/Validation.h" -#include "tests/validation/fixtures/LocallyConnectedFixture.h" - -namespace arm_compute -{ -namespace test -{ -namespace validation -{ -namespace -{ -constexpr RelativeTolerance tolerance_f32(0.0001f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */ -} // namespace - -TEST_SUITE(NEON) -TEST_SUITE(LocallyConnected) - -// *INDENT-OFF* -// clang-format off -DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( - framework::dataset::make("InputInfo", { TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Mismatching data type input/weights - TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Mismatching data type input/bias - TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Mismatching data type input/output - TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Mismatching shape input/weights - TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Mismatching shape input/bias - TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Mismatching shape input/output - TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Asymmetric padding - TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32) - }), - framework::dataset::make("WeightsInfo",{ TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F16), - TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32), - TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32), - TensorInfo(TensorShape(3U, 3U, 5U, 21U, 274U), 1, DataType::F32), - TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32), - TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32), - TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32), - TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32) - })), - framework::dataset::make("BiasInfo", { TensorInfo(TensorShape(21U, 275U), 1, DataType::F32), - TensorInfo(TensorShape(21U, 275U), 1, DataType::F16), - TensorInfo(TensorShape(21U, 275U), 1, DataType::F32), - TensorInfo(TensorShape(21U, 275U), 1, DataType::F32), - TensorInfo(TensorShape(21U, 274U), 1, DataType::F32), - TensorInfo(TensorShape(21U, 275U), 1, DataType::F32), - TensorInfo(TensorShape(21U, 275U), 1, DataType::F32), - TensorInfo(TensorShape(21U, 275U), 1, DataType::F32) - })), - framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32), - TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32), - TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F16), - TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32), - TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32), - TensorInfo(TensorShape(11U, 25U, 22U), 1, DataType::F32), - TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32), - TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32) - })), - framework::dataset::make("PadStride", { PadStrideInfo(2, 1, 0, 0), - PadStrideInfo(2, 1, 0, 0), - PadStrideInfo(2, 1, 0, 0), - PadStrideInfo(2, 1, 0, 0), - PadStrideInfo(2, 1, 0, 0), - 
PadStrideInfo(2, 1, 0, 0), - PadStrideInfo(2, 1, 1, 0, 0, 0, DimensionRoundingType::FLOOR), - PadStrideInfo(2, 1, 0, 0) - })), - framework::dataset::make("Expected", { false, false, false, false, false, false, false, true })), - input_info, weights_info, bias_info, output_info, conv_info, expected) -{ - bool is_valid = bool(NELocallyConnectedLayer::validate(&input_info.clone()->set_is_resizable(false), - &weights_info.clone()->set_is_resizable(false), - &bias_info.clone()->set_is_resizable(false), - &output_info.clone()->set_is_resizable(false), - conv_info)); - ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); -} -// clang-format on -// *INDENT-ON* - -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallLocallyConnectedDataset(), datasets::LargeLocallyConnectedDataset()), - framework::dataset::make("DataType", DataType::F32)), - src_shape, weights_shape, bias_shape, dst_shape, info, dilation, data_type) -{ - ARM_COMPUTE_UNUSED(dilation); - - // Create tensors - Tensor src = create_tensor(src_shape, data_type); - Tensor weights = create_tensor(weights_shape, data_type); - Tensor bias = create_tensor(bias_shape, data_type); - Tensor dst = create_tensor(dst_shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function. - NELocallyConnectedLayer lc; - lc.configure(&src, &weights, &bias, &dst, info); - - // Validate valid region - const ValidRegion dst_valid_region = shape_to_valid_region(dst_shape); - validate(dst.info()->valid_region(), dst_valid_region); -} - -template -using NELocallyConnectedFixture = LocallyConnectedValidationFixture; -FIXTURE_DATA_TEST_CASE(RunSmall, NELocallyConnectedFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallLocallyConnectedDataset(), - framework::dataset::make("DataType", - DataType::F32))) -{ - // Validate output - validate(Accessor(_target), _reference, tolerance_f32); -} - -FIXTURE_DATA_TEST_CASE(RunLarge, NELocallyConnectedFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeLocallyConnectedDataset(), - framework::dataset::make("DataType", - DataType::F32))) -{ - // Validate output - validate(Accessor(_target), _reference, tolerance_f32); -} -TEST_SUITE_END() -TEST_SUITE_END() -} // namespace validation -} // namespace test -} // namespace arm_compute diff --git a/tests/validation/NEON/LogSoftmaxLayer.cpp b/tests/validation/NEON/LogSoftmaxLayer.cpp index 3f85e3f7a2..a7ab033359 100644 --- a/tests/validation/NEON/LogSoftmaxLayer.cpp +++ b/tests/validation/NEON/LogSoftmaxLayer.cpp @@ -71,7 +71,7 @@ TEST_SUITE(FP16) FIXTURE_DATA_TEST_CASE(RunSmall, NELogSoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) + framework::dataset::make("Axis", { 0, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f16); @@ -79,7 +79,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NELogSoftmaxLayerFixture, framework::Data FIXTURE_DATA_TEST_CASE(RunSmall4D, NELogSoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, 
combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) + framework::dataset::make("Axis", { 0, -3, 2 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f16); @@ -99,7 +99,7 @@ TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunSmall2D, NELogSoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) + framework::dataset::make("Axis", { 0, 1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); @@ -107,7 +107,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall2D, NELogSoftmaxLayerFixture, framework::D FIXTURE_DATA_TEST_CASE(RunSmall4D, NELogSoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) + framework::dataset::make("Axis", { 0, 2, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); @@ -132,7 +132,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall2D, NELogSoftmaxLayerQuantizedFixture, f framework::dataset::make("DataType", DataType::QASYMM8)), combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0 }))) + framework::dataset::make("Axis", { 0, 1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); @@ -141,7 +141,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall4D, NELogSoftmaxLayerQuantizedFixture, f framework::dataset::make("DataType", DataType::QASYMM8)), combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0 }))) + framework::dataset::make("Axis", { 0, -1, 1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); diff --git a/tests/validation/NEON/Logical.cpp b/tests/validation/NEON/Logical.cpp new file mode 100644 index 0000000000..6f1c55b33c --- /dev/null +++ b/tests/validation/NEON/Logical.cpp @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NELogical.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/LogicalFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(NEON)
+
+TEST_SUITE(LogicalAnd)
+template <typename T>
+using NELogicalAndFixture = LogicalAndValidationFixture<Tensor, Accessor, NELogicalAnd, T>;
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NELogicalAndFixture<uint8_t>, framework::DatasetMode::ALL, zip(datasets::SmallShapes(), datasets::SmallShapes()))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NELogicalAndFixture<uint8_t>, framework::DatasetMode::ALL, datasets::SmallShapesBroadcast())
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // LogicalAnd
+
+TEST_SUITE(LogicalOr)
+template <typename T>
+using NELogicalOrFixture = LogicalOrValidationFixture<Tensor, Accessor, NELogicalOr, T>;
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NELogicalOrFixture<uint8_t>, framework::DatasetMode::ALL, zip(datasets::SmallShapes(), datasets::SmallShapes()))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NELogicalOrFixture<uint8_t>, framework::DatasetMode::ALL, datasets::SmallShapesBroadcast())
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // LogicalOr
+
+TEST_SUITE(LogicalNot)
+
+template <typename T>
+using NELogicalNotFixture = LogicalNotValidationFixture<Tensor, Accessor, NELogicalNot, T>;
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NELogicalNotFixture<uint8_t>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                    DataType::U8)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // LogicalNot
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
\ No newline at end of file
diff --git a/tests/validation/NEON/Magnitude.cpp b/tests/validation/NEON/Magnitude.cpp
index e14b32a89e..9ba119b234 100644
--- a/tests/validation/NEON/Magnitude.cpp
+++ b/tests/validation/NEON/Magnitude.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
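Outside the test framework, the NELogicalAnd/NELogicalOr/NELogicalNot functions added by this patch follow the library's usual configure-then-run pattern; roughly like this sketch (shapes and data type are illustrative):

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NELogical.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor a, b, out;
    const TensorShape shape(16U, 4U);
    a.allocator()->init(TensorInfo(shape, 1, DataType::U8));
    b.allocator()->init(TensorInfo(shape, 1, DataType::U8));
    out.allocator()->init(TensorInfo(shape, 1, DataType::U8));

    NELogicalAnd logical_and;
    logical_and.configure(&a, &b, &out); // element-wise AND; zero is false, non-zero is true

    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();

    logical_and.run();
    return 0;
}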
* * SPDX-License-Identifier: MIT * @@ -49,32 +49,6 @@ AbsoluteTolerance tolerance(MagnitudeType magnitude_type) TEST_SUITE(NEON) TEST_SUITE(Magnitude) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::S16)), shape, data_type) -{ - // Create tensors - Tensor src1 = create_tensor(shape, data_type); - Tensor src2 = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function (default MagnitudeType::L2NORM) - NEMagnitude magnitude; - magnitude.configure(&src1, &src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - validate(src1.info()->padding(), padding); - validate(src2.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - template using NEMagnitudeFixture = MagnitudeValidationFixture; diff --git a/tests/validation/NEON/MeanStdDev.cpp b/tests/validation/NEON/MeanStdDev.cpp index a10939680c..d688719331 100644 --- a/tests/validation/NEON/MeanStdDev.cpp +++ b/tests/validation/NEON/MeanStdDev.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -44,26 +44,6 @@ RelativeTolerance tolerance_rel_low_error(0.0005f); TEST_SUITE(NEON) TEST_SUITE(MeanStdDev) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), framework::dataset::make("DataType", DataType::U8)), shape, data_type) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - - // Create output variables - float mean = 0.f; - float std_dev = 0.f; - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create configure function - NEMeanStdDev mean_std_dev_image; - mean_std_dev_image.configure(&src, &mean, &std_dev); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - validate(src.info()->padding(), padding); -} - template using NEMeanStdDevFixture = MeanStdDevValidationFixture; diff --git a/tests/validation/NEON/Median3x3.cpp b/tests/validation/NEON/Median3x3.cpp index 1924a448d6..f22a27713b 100644 --- a/tests/validation/NEON/Median3x3.cpp +++ b/tests/validation/NEON/Median3x3.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
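The Magnitude Configuration test removed above exercised the default MagnitudeType::L2NORM, i.e. out = sqrt(gx^2 + gy^2) computed from two S16 gradient planes. A scalar sketch of that computation (the saturation back to S16 is assumed):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

// L2-norm magnitude of two S16 gradient samples, saturated to S16.
static std::int16_t magnitude_l2(std::int16_t gx, std::int16_t gy)
{
    const double m = std::sqrt(static_cast<double>(gx) * gx + static_cast<double>(gy) * gy);
    return static_cast<std::int16_t>(std::min(m, 32767.0));
}

int main()
{
    std::cout << magnitude_l2(3, 4) << '\n';         // 5
    std::cout << magnitude_l2(32000, 32000) << '\n'; // saturates to 32767
    return 0;
}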
* * SPDX-License-Identifier: MIT * @@ -50,41 +50,6 @@ constexpr BorderSize border_size(filter_size / 2); /* Border size of the kerne TEST_SUITE(NEON) TEST_SUITE(Median3x3) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), - datasets::BorderModes()), - shape, data_type, border_mode) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEMedian3x3 median3x3; - median3x3.configure(&src, &dst, border_mode); - - // Validate valid region - const ValidRegion dst_valid_region = shape_to_valid_region(shape, (border_mode == BorderMode::UNDEFINED), border_size); - validate(dst.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 8); - calculator.set_border_size(1); - calculator.set_border_mode(border_mode); - - const PaddingSize dst_padding = calculator.required_padding(); - - calculator.set_accessed_elements(16); - calculator.set_access_offset(-1); - - const PaddingSize src_padding = calculator.required_padding(); - - validate(src.info()->padding(), src_padding); - validate(dst.info()->padding(), dst_padding); -} - template using NEMedian3x3Fixture = Median3x3ValidationFixture; diff --git a/tests/validation/NEON/MinMaxLocation.cpp b/tests/validation/NEON/MinMaxLocation.cpp index 973ea930bb..553159bc7c 100644 --- a/tests/validation/NEON/MinMaxLocation.cpp +++ b/tests/validation/NEON/MinMaxLocation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -43,34 +43,7 @@ TEST_SUITE(MinMaxLocation) template using NEMinMaxLocationFixture = MinMaxLocationValidationFixture, ArrayAccessor, NEMinMaxLocation, T>; -void validate_configuration(const Tensor &src, TensorShape shape) -{ - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create output storage - int32_t min{}; - int32_t max{}; - Coordinates2DArray min_loc(shape.total_size()); - Coordinates2DArray max_loc(shape.total_size()); - - // Create and configure function - NEMinMaxLocation min_max_loc; - min_max_loc.configure(&src, &min, &max, &min_loc, &max_loc); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 1).required_padding(); - validate(src.info()->padding(), padding); -} - TEST_SUITE(U8) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), framework::dataset::make("DataType", DataType::U8)), shape, data_type) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - src.info()->set_format(Format::U8); - - validate_configuration(src, shape); -} FIXTURE_DATA_TEST_CASE(RunSmall, NEMinMaxLocationFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("DataType", DataType::U8))) @@ -87,14 +60,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEMinMaxLocationFixture, framework::Da TEST_SUITE_END() // U8 TEST_SUITE(S16) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), framework::dataset::make("DataType", DataType::S16)), shape, data_type) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - src.info()->set_format(Format::S16); - - validate_configuration(src, shape); -} FIXTURE_DATA_TEST_CASE(RunSmall, NEMinMaxLocationFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("DataType", DataType::S16))) @@ -111,14 +76,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEMinMaxLocationFixture, framework::Da TEST_SUITE_END() // S16 TEST_SUITE(Float) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::Small2DShapes(), framework::dataset::make("DataType", DataType::F32)), shape, data_type) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - src.info()->set_format(Format::F32); - - validate_configuration(src, shape); -} FIXTURE_DATA_TEST_CASE(RunSmall, NEMinMaxLocationFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("DataType", DataType::F32))) diff --git a/tests/validation/NEON/NonLinearFilter.cpp b/tests/validation/NEON/NonLinearFilter.cpp index c54394d3a1..5074b028a9 100644 --- a/tests/validation/NEON/NonLinearFilter.cpp +++ b/tests/validation/NEON/NonLinearFilter.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -41,52 +41,6 @@ namespace validation TEST_SUITE(NEON) TEST_SUITE(NonLinearFilter) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), datasets::NonLinearFilterFunctions()), - framework::dataset::make("MaskSize", { 3U, 5U })), - datasets::MatrixPatterns()), - datasets::BorderModes()), - shape, function, mask_size, pattern, border_mode) -{ - std::mt19937 generator(library->seed()); - std::uniform_int_distribution distribution_u8(0, 255); - const uint8_t constant_border_value = distribution_u8(generator); - - // Create the mask - std::vector mask(mask_size * mask_size); - fill_mask_from_pattern(mask.data(), mask_size, mask_size, pattern); - const auto half_mask_size = static_cast(mask_size / 2); - - // Create tensors - Tensor src = create_tensor(shape, DataType::U8); - Tensor dst = create_tensor(shape, DataType::U8); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NENonLinearFilter filter; - filter.configure(&src, &dst, function, mask_size, pattern, mask.data(), border_mode, constant_border_value); - - // Validate valid region - const ValidRegion dst_valid_region = shape_to_valid_region(shape, border_mode == BorderMode::UNDEFINED, BorderSize(half_mask_size)); - validate(dst.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), ((MatrixPattern::OTHER == pattern) ? 1 : 8)); - calculator.set_border_mode(border_mode); - calculator.set_border_size(half_mask_size); - - const PaddingSize write_padding = calculator.required_padding(PaddingCalculator::Option::EXCLUDE_BORDER); - - calculator.set_accessed_elements(16); - calculator.set_access_offset(-half_mask_size); - - const PaddingSize read_padding = calculator.required_padding(PaddingCalculator::Option::INCLUDE_BORDER); - - validate(src.info()->padding(), read_padding); - validate(dst.info()->padding(), write_padding); -} - template using NENonLinearFilterFixture = NonLinearFilterValidationFixture; diff --git a/tests/validation/NEON/NormalizationLayer.cpp b/tests/validation/NEON/NormalizationLayer.cpp index 255a68df05..d910cbdc45 100644 --- a/tests/validation/NEON/NormalizationLayer.cpp +++ b/tests/validation/NEON/NormalizationLayer.cpp @@ -91,41 +91,15 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( // clang-format on // *INDENT-ON* -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), - shape, data_type) -{ - NormalizationLayerInfo info(NormType::IN_MAP_1D, 3U, 5.0f, 2.0f, 1.f, false); - - // Create tensors - Tensor src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NENormalizationLayer norm; - norm.configure(&src, &dst, info); - - validate(src.info()->padding(), PaddingSize(0, 0, 0, 0)); -} - template using NENormalizationLayerFixture = NormalizationValidationFixture; TEST_SUITE(Float) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_SUITE(FP16) -FIXTURE_DATA_TEST_CASE(RunSmall, NENormalizationLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(NormalizationDataset, - 
framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) -{ - // Validate output - validate(Accessor(_target), _reference, tolerance_f16); -} -FIXTURE_DATA_TEST_CASE(RunLarge, NENormalizationLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(NormalizationDataset, - framework::dataset::make("DataType", DataType::F16)), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) +FIXTURE_DATA_TEST_CASE(RunSmall, NENormalizationLayerFixture, framework::DatasetMode::ALL, combine(combine(NormalizationDataset, + framework::dataset::make("DataType", DataType::F16)), + framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f16); diff --git a/tests/validation/NEON/Permute.cpp b/tests/validation/NEON/Permute.cpp index d405582192..d897bbbe07 100644 --- a/tests/validation/NEON/Permute.cpp +++ b/tests/validation/NEON/Permute.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -42,6 +42,11 @@ namespace validation { namespace { +const auto PermuteVectors2 = framework::dataset::make("PermutationVector", +{ + PermutationVector(0U, 1U), + PermutationVector(1U, 0U), +}); const auto PermuteVectors3 = framework::dataset::make("PermutationVector", { PermutationVector(2U, 0U, 1U), @@ -61,7 +66,7 @@ const auto PermuteVectors4 = framework::dataset::make("PermutationVector", PermutationVector(3U, 0U, 2U, 1U), PermutationVector(0U, 3U, 2U, 1U) }); -const auto PermuteVectors = concat(PermuteVectors3, PermuteVectors4); +const auto PermuteVectors = concat(concat(PermuteVectors2, PermuteVectors3), PermuteVectors4); const auto PermuteParametersSmall = concat(concat(datasets::Small2DShapes(), datasets::Small3DShapes()), datasets::Small4DShapes()) * PermuteVectors; const auto PermuteParametersLarge = datasets::Large4DShapes() * PermuteVectors; } // namespace @@ -71,7 +76,7 @@ TEST_SUITE(Permute) // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( - framework::dataset::make("InputInfo",{ + framework::dataset::make("InputInfo",{ TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16), // permutation not supported TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16), // permutation not supported TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16), // permutation not supported @@ -85,26 +90,26 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( TensorInfo(TensorShape(27U, 13U, 37U, 2U), 1, DataType::F32) // permutation not supported }), - framework::dataset::make("OutputInfo", { - TensorInfo(TensorShape(5U, 7U, 7U, 3U), 1, DataType::U16), - TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16), + framework::dataset::make("OutputInfo", { + TensorInfo(TensorShape(5U, 7U, 7U, 3U), 1, DataType::U16), + TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16), TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16), TensorInfo(TensorShape(5U, 7U), 1, DataType::U8), - TensorInfo(TensorShape(5U, 7U, 7U, 3U), 1, DataType::U16), - TensorInfo(TensorShape(13U, 37U, 27U, 2U), 1, DataType::F32), - TensorInfo(TensorShape(5U, 7U, 7U, 3U), 1, DataType::U16), - TensorInfo(TensorShape(3U, 5U, 7U, 7U), 1, DataType::S16), - TensorInfo(TensorShape(13U, 37U, 27U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(5U, 7U, 7U, 3U), 1, DataType::U16), + 
TensorInfo(TensorShape(13U, 37U, 27U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(5U, 7U, 7U, 3U), 1, DataType::U16), + TensorInfo(TensorShape(3U, 5U, 7U, 7U), 1, DataType::S16), + TensorInfo(TensorShape(13U, 37U, 27U, 2U), 1, DataType::F32), TensorInfo(TensorShape(37U, 2U, 13U, 27U), 1, DataType::F32), TensorInfo(TensorShape(37U, 2U, 13U, 27U), 1, DataType::F32) })), - framework::dataset::make("PermutationVector", { + framework::dataset::make("PermutationVector", { PermutationVector(2U, 1U, 0U), PermutationVector(2U, 2U, 1U), PermutationVector(1U, 1U, 1U), PermutationVector(2U, 0U, 1U), - PermutationVector(2U, 0U, 1U), + PermutationVector(2U, 0U, 1U), PermutationVector(1U, 2U, 0U), PermutationVector(3U, 2U, 0U, 1U), PermutationVector(3U, 2U, 0U, 1U), @@ -120,29 +125,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( // clang-format on // *INDENT-ON* -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::Small4DShapes(), framework::dataset::make("DataType", { DataType::S8, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32 })), - shape, data_type) -{ - // Define permutation vector - const PermutationVector perm(2U, 0U, 1U); - - // Permute shapes - TensorShape output_shape = shape; - permute(output_shape, perm); - - // Create tensors - Tensor ref_src = create_tensor(shape, data_type); - Tensor dst = create_tensor(output_shape, data_type); - - // Create and Configure function - NEPermute perm_func; - perm_func.configure(&ref_src, &dst, perm); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(output_shape); - validate(dst.info()->valid_region(), valid_region); -} - template using NEPermuteFixture = PermuteValidationFixture; diff --git a/tests/validation/NEON/Phase.cpp b/tests/validation/NEON/Phase.cpp index 37b04f4cef..f63309f09e 100644 --- a/tests/validation/NEON/Phase.cpp +++ b/tests/validation/NEON/Phase.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
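The PermuteVectors2 dataset added above extends Permute coverage to 2D permutations. A tiny sketch of applying a permutation vector to a shape, assuming the convention dst[i] = src[perm[i]] (permute_shape is a hypothetical helper, not the library's permute()):

#include <array>
#include <cstddef>
#include <iostream>

template <std::size_t N>
static std::array<std::size_t, N> permute_shape(const std::array<std::size_t, N> &src,
                                                const std::array<std::size_t, N> &perm)
{
    std::array<std::size_t, N> dst{};
    for (std::size_t i = 0; i < N; ++i)
    {
        dst[i] = src[perm[i]]; // destination dimension i takes source dimension perm[i]
    }
    return dst;
}

int main()
{
    const std::array<std::size_t, 3> shape{ 7, 5, 3 };
    const std::array<std::size_t, 3> perm{ 2, 0, 1 };
    for (std::size_t d : permute_shape(shape, perm))
    {
        std::cout << d << ' '; // prints: 3 7 5
    }
    std::cout << '\n';
    return 0;
}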
* * SPDX-License-Identifier: MIT * @@ -45,32 +45,6 @@ constexpr AbsoluteTolerance tolerance_value(1); TEST_SUITE(NEON) TEST_SUITE(Phase) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::S16)), shape, data_type) -{ - // Create tensors - Tensor src1 = create_tensor(shape, data_type); - Tensor src2 = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, DataType::U8); - - ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEPhase phase; - phase.configure(&src1, &src2, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - validate(src1.info()->padding(), padding); - validate(src2.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} - template using NEPhaseFixture = PhaseValidationFixture; diff --git a/tests/validation/NEON/PixelWiseMultiplication.cpp b/tests/validation/NEON/PixelWiseMultiplication.cpp index 0b88628912..1bb0588919 100644 --- a/tests/validation/NEON/PixelWiseMultiplication.cpp +++ b/tests/validation/NEON/PixelWiseMultiplication.cpp @@ -111,12 +111,14 @@ using NEPixelWiseMultiplicationToU8Fixture = PixelWiseMultiplicationValidationFi template using NEPixelWiseMultiplicationToS16Fixture = PixelWiseMultiplicationValidationFixture; template +using NEPixelWiseMultiplicationToS32Fixture = PixelWiseMultiplicationValidationFixture; +template using NEPixelWiseMultiplicationToF16Fixture = PixelWiseMultiplicationValidationFixture; template using NEPixelWiseMultiplicationToF32Fixture = PixelWiseMultiplicationValidationFixture; using NEPixelWiseMultiplicationU8U8ToS16Fixture = PixelWiseMultiplicationValidationFixture; template -using NEPixelWiseMultiplicationBroadcastFixture = PixelWiseMultiplicationBroadcastValidationFixture; +using NEPixelWiseMultiplicationBroadcastFixture = PixelWiseMultiplicationBroadcastValidationFixture; using NEPixelWiseMultiplicationBroadcastQASYMM8Fixture = PixelWiseMultiplicationBroadcastValidationQuantizedFixture; using NEPixelWiseMultiplicationBroadcastQASYMM8SignedFixture = PixelWiseMultiplicationBroadcastValidationQuantizedFixture; @@ -139,6 +141,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8), //11 Mismatching data type TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8), //12 Ok TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED), //13 Quantized cannot do WRAP + TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32), //14 S32 does not support scale255 }), framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), @@ -153,6 +156,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED), + TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32), })), framework::dataset::make("OutputInfo",{ 
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), @@ -160,13 +164,14 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), + TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED), + TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32), })), framework::dataset::make("Scale",{ scale_unity, scale_unity, @@ -180,7 +185,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( scale_unity, scale_unity, scale_unity, - scale_unity})), + scale_unity, + scale_255})), framework::dataset::make("OverflowPolicy",{ ConvertPolicy::WRAP, ConvertPolicy::WRAP, @@ -195,9 +201,10 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( ConvertPolicy::WRAP, ConvertPolicy::SATURATE, ConvertPolicy::WRAP, + ConvertPolicy::SATURATE, })), - framework::dataset::make("Expected", { true, true, true, false, false, false, false, false, true , false, false, true, false })), + framework::dataset::make("Expected", { true, true, true, false, false, false, false, false, true , false, false, true, false, false})), input1_info, input2_info, output_info, scale, policy, expected) { bool has_error = bool(NEPixelWiseMultiplication::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), scale, policy, RoundingPolicy::TO_ZERO)); @@ -260,7 +267,7 @@ TEST_SUITE_END() // InPlaceValidate TEST_SUITE(Quantized) TEST_SUITE(QASYMM8_SIGNED) -TEST_SUITE(Scale255) +TEST_SUITE(ScaleUnity) FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8SignedFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataTypeIn1", DataType::QASYMM8_SIGNED)), framework::dataset::make("DataTypeIn2", DataType::QASYMM8_SIGNED)), @@ -273,8 +280,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8SignedFixture, // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } -TEST_SUITE_END() // Scale255 -TEST_SUITE_END() // QASYMM8 +TEST_SUITE_END() // ScaleUnity +TEST_SUITE_END() // QASYMM8_SIGNED TEST_SUITE(QASYMM8) TEST_SUITE(Scale255) @@ -476,6 +483,23 @@ TEST_SUITE_END() // ScaleOther TEST_SUITE_END() // S16toS16 +TEST_SUITE(S32toS32) + +TEST_SUITE(ScaleUnity) +PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS32Fixture, ALL, SmallShapes(), S32, S32, S32, scale_unity, TO_ZERO, InPlaceDataSet, WRAP_VALIDATE(int32_t, 1)) +TEST_SUITE_END() // ScaleUnity + +TEST_SUITE(ScaleOther) +PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS32Fixture, ALL, SmallShapes(), S32, S32, S32, scale_other, TO_ZERO, InPlaceDataSet, WRAP_VALIDATE(int32_t, 1)) +TEST_SUITE_END() // ScaleOther + +TEST_SUITE(Broadcast) +PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, BroadcastFixture, ALL, 
SmallShapesBroadcast(), S32, S32, S32, scale_unity, TO_ZERO, framework::dataset::make("InPlace", { false }), + WRAP_VALIDATE(int32_t, 1)) +TEST_SUITE_END() // Broadcast + +TEST_SUITE_END() // S32toS32 + #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_SUITE(F16toF16) diff --git a/tests/validation/NEON/QLSTMLayerNormalization.cpp b/tests/validation/NEON/QLSTMLayerNormalization.cpp index f3cd5fbb56..8925d0b39e 100644 --- a/tests/validation/NEON/QLSTMLayerNormalization.cpp +++ b/tests/validation/NEON/QLSTMLayerNormalization.cpp @@ -21,10 +21,10 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/Tensor.h" #include "arm_compute/runtime/TensorAllocator.h" +#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h" #include "tests/NEON/Accessor.h" #include "tests/NEON/Helper.h" #include "tests/PaddingCalculator.h" diff --git a/tests/validation/NEON/QuantizationLayer.cpp b/tests/validation/NEON/QuantizationLayer.cpp index 0156be275a..04b3a78972 100644 --- a/tests/validation/NEON/QuantizationLayer.cpp +++ b/tests/validation/NEON/QuantizationLayer.cpp @@ -74,29 +74,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip( // clang-format on // *INDENT-ON* -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(QuantizationSmallShapes, framework::dataset::make("DataType", DataType::F32)), shape, data_type) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, DataType::QASYMM8); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEQuantizationLayer quant_layer; - quant_layer.configure(&src, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(src.info()->valid_region(), valid_region); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - validate(src.info()->padding(), PaddingSize()); - validate(dst.info()->padding(), PaddingSize()); -} - template using NEQuantizationLayerQASYMM8Fixture = QuantizationValidationFixture; template diff --git a/tests/validation/NEON/ROIAlignLayer.cpp b/tests/validation/NEON/ROIAlignLayer.cpp index 3f6c9d2082..e475c46c7d 100644 --- a/tests/validation/NEON/ROIAlignLayer.cpp +++ b/tests/validation/NEON/ROIAlignLayer.cpp @@ -129,10 +129,10 @@ FIXTURE_DATA_TEST_CASE(SmallROIAlignLayerHalf, NEROIAlignLayerHalfFixture, frame TEST_SUITE_END() // Float TEST_SUITE(Quantized) -TEST_SUITE(QASYMM8) template using NEROIAlignLayerQuantizedFixture = ROIAlignLayerQuantizedFixture; +TEST_SUITE(QASYMM8) FIXTURE_DATA_TEST_CASE(Small, NEROIAlignLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallROIDataset(), framework::dataset::make("DataType", { DataType::QASYMM8 })), @@ -144,6 +144,19 @@ FIXTURE_DATA_TEST_CASE(Small, NEROIAlignLayerQuantizedFixture, framewor validate(Accessor(_target), _reference, tolerance_qasymm8); } TEST_SUITE_END() // QASYMM8 + +TEST_SUITE(QASYMM8_SIGNED) +FIXTURE_DATA_TEST_CASE(Small, NEROIAlignLayerQuantizedFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(datasets::SmallROIDataset(), + framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED })), + framework::dataset::make("DataLayout", { 
DataLayout::NCHW, DataLayout::NHWC })), + framework::dataset::make("InputQuantizationInfo", { QuantizationInfo(1.f / 255.f, 127) })), + framework::dataset::make("OutputQuantizationInfo", { QuantizationInfo(2.f / 255.f, 120) }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE_END() // QASYMM8_SIGNED TEST_SUITE_END() // Quantized TEST_SUITE_END() // RoiAlign diff --git a/tests/validation/NEON/ReduceMean.cpp b/tests/validation/NEON/ReduceMean.cpp index 23229a08ef..02cfcee262 100644 --- a/tests/validation/NEON/ReduceMean.cpp +++ b/tests/validation/NEON/ReduceMean.cpp @@ -47,7 +47,11 @@ constexpr AbsoluteTolerance tolerance_f32(0.001f); /**< Tolerance value f constexpr AbsoluteTolerance tolerance_f16(0.03f); /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */ #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC constexpr AbsoluteTolerance tolerance_u8(1); /**< Tolerance value for comparing reference's output against implementation's output for unsigned 8-bit asymmetric quantized type */ -constexpr AbsoluteTolerance tolerance_s8(1); /**< Tolerance value for comparing reference's output against implementation's output for signed 8-bit asymmetric quantized type */ +#ifdef __aarch64__ +constexpr AbsoluteTolerance tolerance_s8(1); /**< Tolerance value for comparing reference's output against implementation's output for signed 8-bit asymmetric quantized type */ +#else // __aarch64__ +constexpr AbsoluteTolerance tolerance_s8(2); /**< Tolerance value for comparing reference's output against implementation's output for signed 8-bit asymmetric quantized type */ +#endif // __aarch64__ const auto axis_keep = combine(framework::dataset::make("Axis", { Coordinates(0), Coordinates(1, 0), Coordinates(1, 2), Coordinates(0, 2), Coordinates(1, 3), Coordinates(0, 1, 2, 3) }), framework::dataset::make("KeepDims", { true })); @@ -83,28 +87,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( // clang-format on // *INDENT-ON* -DATA_TEST_CASE(Configuration, - framework::DatasetMode::ALL, - combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::F32 })), - shape, data_type) -{ - // Create tensors - Tensor ref_src = create_tensor(shape, data_type); - Tensor dst; - - Coordinates axis(1); - - // Create and Configure function - NEReduceMean reduce_mean; - reduce_mean.configure(&ref_src, axis, true, &dst); - - // Validate valid region - TensorShape output_shape = shape; - output_shape.set(1, 1); - const ValidRegion valid_region = shape_to_valid_region(output_shape); - validate(dst.info()->valid_region(), valid_region); -} - template using NEReduceMeanFixture = ReduceMeanFixture; @@ -160,16 +142,33 @@ TEST_SUITE(QASYMM8) FIXTURE_DATA_TEST_CASE(RunSmall, NEReduceMeanQuantizedFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), concat(axis_keep, axis_drop)), framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255, 5) }))) + combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), concat(axis_keep, axis_drop)), + framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.f / 255, 5) })), + framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.f / 255, 5) }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_u8); +} + 
+TEST_SUITE(Requant) +FIXTURE_DATA_TEST_CASE(RunSmall, + NEReduceMeanQuantizedFixture<uint8_t>, + framework::DatasetMode::PRECOMMIT, + combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), axis_drop), + framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.f / 255, 5) })), + framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.f / 200, 16) }))) { // Validate output validate(Accessor(_target), _reference, tolerance_u8); } +TEST_SUITE_END() // Requant FIXTURE_DATA_TEST_CASE(RunLarge, NEReduceMeanQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), concat(axis_keep, axis_drop)), framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255, 5) }))) + combine(combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), concat(axis_keep, axis_drop)), + framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.f / 255, 5) })), + framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.f / 255, 5) }))) { // Validate output validate(Accessor(_target), _reference, tolerance_u8); @@ -180,15 +179,32 @@ TEST_SUITE(QASYMM8_SIGNED) FIXTURE_DATA_TEST_CASE(RunSmall, NEReduceMeanQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), concat(axis_keep, axis_drop)), framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 127, -10), QuantizationInfo(1.f / 250, -20) }))) + combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), concat(axis_keep, axis_drop)), + framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.f / 127, -10), QuantizationInfo(1.f / 250, -20) })), + framework::dataset::make("QuantizationInfoInputOutput", { QuantizationInfo(1.f / 127, -10) }))) { // Validate output validate(Accessor(_target), _reference, tolerance_s8); } +TEST_SUITE(Requant) +FIXTURE_DATA_TEST_CASE(RunSmall, + NEReduceMeanQuantizedFixture<int8_t>, + framework::DatasetMode::PRECOMMIT, + combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), axis_drop), + framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.f / 102, 2) })), + framework::dataset::make("QuantizationInfoOutput", { QuantizationInfo(1.f / 113, 10) }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_s8); +} +TEST_SUITE_END() // Requant + FIXTURE_DATA_TEST_CASE(RunLarge, NEReduceMeanQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), concat(axis_keep, axis_drop)), framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 127, 0) }))) + combine(combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), concat(axis_keep, axis_drop)), + framework::dataset::make("QuantizationInfoInput", { QuantizationInfo(1.f / 127, -10) })), + framework::dataset::make("QuantizationInfoInputOutput", { QuantizationInfo(1.f / 127, -10) }))) { // Validate output validate(Accessor(_target), _reference, tolerance_s8); diff --git a/tests/validation/NEON/ReductionOperation.cpp 
b/tests/validation/NEON/ReductionOperation.cpp index 47b36c630c..ed17e6968e 100644 --- a/tests/validation/NEON/ReductionOperation.cpp +++ b/tests/validation/NEON/ReductionOperation.cpp @@ -106,6 +106,28 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( keep_dims)); ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); } + +DATA_TEST_CASE(ValidateNoPadding, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", +{ 0, 1 })), framework::dataset::make("ReductionOperation", {ReductionOperation::SUM,})), KeepDims), + shape, data_type, axis, op, keep_dims) +{ + TensorShape input_shape = TensorShape(shape); + TensorInfo input_info = TensorInfo(input_shape, 1, data_type); + const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); + const bool _keep_dims = keep_dims && !is_arg_min_max; + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(shape, axis, keep_dims); + + // Create tensors + Tensor src = create_tensor(input_shape, data_type, 1, QuantizationInfo()); + Tensor dst = create_tensor(output_shape, data_type, 1, QuantizationInfo()); + + // Create and configure function + NEReductionOperation reduction; + reduction.configure(&src, &dst, axis, op, _keep_dims); + + validate(src.info()->padding(), PaddingSize(0, 0, 0, 0)); + validate(dst.info()->padding(), PaddingSize(0, 0, 0, 0)); +} // clang-format on // *INDENT-ON* diff --git a/tests/validation/NEON/Remap.cpp b/tests/validation/NEON/Remap.cpp index f8d7a250f9..1e69973238 100644 --- a/tests/validation/NEON/Remap.cpp +++ b/tests/validation/NEON/Remap.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -50,40 +50,6 @@ constexpr float tolerance_number = 0.f; TEST_SUITE(NEON) TEST_SUITE(Remap) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })), - framework::dataset::make("DataType", DataType::U8)), - framework::dataset::make("BorderModes", { BorderMode::UNDEFINED, BorderMode::CONSTANT })), - shape, policy, data_type, border_mode) -{ - Tensor src = create_tensor(shape, data_type); - Tensor map_x = create_tensor(shape, DataType::F32); - Tensor map_y = create_tensor(shape, DataType::F32); - Tensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(map_x.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(map_y.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NERemap remap; - remap.configure(&src, &map_x, &map_y, &dst, policy, border_mode); - - // Validate valid region - const ValidRegion dst_valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), dst_valid_region); - - // Validate padding - const int total_right = ceil_to_multiple(shape[0], 16); - const int access_right = total_right + (((total_right - shape[0]) == 0) ? 
1 : 0); - - const PaddingSize read_padding(1, access_right - shape[0], 1, 1); - validate(src.info()->padding(), read_padding); - - PaddingCalculator calculator(shape.x(), 16); - validate(dst.info()->padding(), calculator.required_padding()); -} - template <typename T> using NERemapFixture = RemapValidationFixture<Tensor, Accessor, NERemap, T>; diff --git a/tests/validation/NEON/ReorgLayer.cpp b/tests/validation/NEON/ReorgLayer.cpp index e79a6717ba..cd8c10bd61 100644 --- a/tests/validation/NEON/ReorgLayer.cpp +++ b/tests/validation/NEON/ReorgLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -69,44 +69,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( // clang-format on // *INDENT-ON* -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(framework::dataset::concat(datasets::SmallReorgLayerDataset(), datasets::LargeReorgLayerDataset()), - framework::dataset::make("DataType", { DataType::F32, DataType::F16, DataType::QASYMM8 })), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })), - shape, stride, data_type, data_layout) -{ - // Permute the tensor shape in case of NHWC data layout - TensorShape shape_to_use = shape; - if(data_layout == DataLayout::NHWC) - { - permute(shape_to_use, PermutationVector(2U, 0U, 1U)); - } - - // Create tensors - Tensor src = create_tensor<Tensor>(shape_to_use, data_type, 1, QuantizationInfo(), data_layout); - Tensor dst; - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEReorgLayer reorg_layer; - - // Auto-initialize the output within the function - reorg_layer.configure(&src, &dst, stride); - - // Validate valid region - const ValidRegion src_valid_region = shape_to_valid_region(shape_to_use); - const ValidRegion dst_valid_region = shape_to_valid_region(dst.info()->tensor_shape()); - validate(src.info()->valid_region(), src_valid_region); - validate(dst.info()->valid_region(), dst_valid_region); - - // Validate padding - const int step = 1; - const PaddingSize src_padding = PaddingCalculator(shape_to_use.x(), step).required_padding(); - const PaddingSize dst_padding = PaddingCalculator(dst.info()->tensor_shape().x(), step).required_padding(); - validate(src.info()->padding(), src_padding); - validate(dst.info()->padding(), dst_padding); -} - template <typename T> using NEReorgLayerFixture = ReorgLayerValidationFixture<Tensor, Accessor, NEReorgLayer, T>; diff --git a/tests/validation/NEON/Scale.cpp b/tests/validation/NEON/Scale.cpp index 9d9a2821dc..9a1e9b01b3 100644 --- a/tests/validation/NEON/Scale.cpp +++ b/tests/validation/NEON/Scale.cpp @@ -79,7 +79,7 @@ const auto QuantizationInfoSet = framework::dataset::make("QuantizationInfo", /** Tolerance */ constexpr AbsoluteTolerance<uint8_t> tolerance_u8(1); constexpr AbsoluteTolerance<int16_t> tolerance_s16(1); -RelativeTolerance<float> tolerance_f32(0.01); +RelativeTolerance<float> tolerance_f32(0.05); #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC RelativeTolerance<half> tolerance_f16(half(0.1)); #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ @@ -117,9 +117,8 @@ const auto output_shape = TensorShape{ 4, 6, 3, 2 }; constexpr auto default_data_type = DataType::U8; constexpr auto default_data_layout = DataLayout::NHWC; constexpr auto default_interpolation_policy = InterpolationPolicy::NEAREST_NEIGHBOR; -constexpr auto default_border_mode = BorderMode::UNDEFINED; +constexpr auto default_border_mode = BorderMode::CONSTANT; constexpr auto default_sampling_policy = SamplingPolicy::CENTER; -constexpr bool
default_use_padding = false; TEST_CASE(NullPtr, framework::DatasetMode::ALL) { @@ -128,11 +127,11 @@ TEST_CASE(NullPtr, framework::DatasetMode::ALL) Status result{}; // nullptr is given as input - result = NEScale::validate(nullptr, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode }); + result = NEScale::validate(nullptr, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false }); ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS); // nullptr is given as output - result = NEScale::validate(&input, nullptr, ScaleKernelInfo{ default_interpolation_policy, default_border_mode }); + result = NEScale::validate(&input, nullptr, ScaleKernelInfo{ default_interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false }); ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS); } @@ -170,7 +169,7 @@ TEST_CASE(SupportDataType, framework::DatasetMode::ALL) const auto input = TensorInfo{ input_shape, 1, kv.first, default_data_layout }; const auto output = TensorInfo{ output_shape, 1, kv.first, default_data_layout }; - result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode }); + result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false }); ARM_COMPUTE_EXPECT(bool(result) == kv.second, framework::LogLevel::ERRORS); } } @@ -183,7 +182,7 @@ TEST_CASE(MissmatchingDataType, framework::DatasetMode::ALL) const auto output = TensorInfo{ output_shape, 1, non_default_data_type, default_data_layout }; Status result{}; - result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode }); + result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false }); ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS); } @@ -193,9 +192,9 @@ TEST_CASE(UsePadding, framework::DatasetMode::ALL) const auto output = TensorInfo{ output_shape, 1, default_data_type, default_data_layout }; Status result{}; - // When use padding is false, border mode should be constant - constexpr auto border_mode = BorderMode::UNDEFINED; - constexpr bool use_padding = false; + // Padding is not supported anymore + constexpr auto border_mode = BorderMode::CONSTANT; + constexpr bool use_padding = true; result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, border_mode, PixelValue(), default_sampling_policy, use_padding }); ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS); @@ -211,7 +210,7 @@ TEST_CASE(AreaWithNHWC, framework::DatasetMode::ALL) const auto output = TensorInfo{ output_shape, 1, default_data_type, data_layout }; Status result{}; - result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode }); + result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false }); ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS); } @@ -226,7 +225,7 @@ TEST_CASE(AreaWithNonU8, framework::DatasetMode::ALL) const auto output = TensorInfo{ output_shape, 1, data_type, data_layout }; Status result{}; - result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode }); + 
result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false }); ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS); } @@ -241,11 +240,80 @@ TEST_CASE(AlignedCornerNotSupported, framework::DatasetMode::ALL) const auto output = TensorInfo{ output_shape, 1, default_data_type, default_data_layout }; Status result{}; - result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode, PixelValue(), sampling_policy, default_use_padding, align_corners }); + result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false, align_corners }); ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS); } TEST_SUITE_END() // Validate +DATA_TEST_CASE(CheckNoPadding, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::Medium4DShapes(), + framework::dataset::make("DataType", { DataType::F32, DataType::QASYMM8 })), + framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::BILINEAR, InterpolationPolicy::NEAREST_NEIGHBOR })), + framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER, SamplingPolicy::TOP_LEFT })), + framework::dataset::make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW })), + shape, data_type, interpolation_policy, sampling_policy, data_layout) +{ + constexpr auto default_border_mode = BorderMode::CONSTANT; + ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false); + + // Create tensors + Tensor src = create_tensor(shape, data_type); + src.info()->set_data_layout(data_layout); + + const float scale_x = 0.5f; + const float scale_y = 0.5f; + TensorShape shape_scaled(shape); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + shape_scaled.set(idx_width, shape[idx_width] * scale_x, /* apply_dim_correction = */ false); + shape_scaled.set(idx_height, shape[idx_height] * scale_y, /* apply_dim_correction = */ false); + Tensor dst = create_tensor(shape_scaled, data_type); + + ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Create and configure function + NEScale scale; + scale.configure(&src, &dst, info); + + validate(src.info()->padding(), PaddingSize(0, 0, 0, 0)); + validate(dst.info()->padding(), PaddingSize(0, 0, 0, 0)); +} + +DATA_TEST_CASE(CheckNoPaddingInterpAREA, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::Medium4DShapes(), + framework::dataset::make("DataType", { DataType::U8 })), + framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::AREA })), + framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER, SamplingPolicy::TOP_LEFT })), + framework::dataset::make("DataLayout", { DataLayout::NCHW })), + shape, data_type, interpolation_policy, sampling_policy, data_layout) +{ + constexpr auto default_border_mode = BorderMode::CONSTANT; + ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false); + + // Create tensors + Tensor src = create_tensor(shape, data_type); + src.info()->set_data_layout(data_layout); + + const float scale_x = 0.5f; + const float scale_y = 0.5f; + TensorShape shape_scaled(shape); + const int 
idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + shape_scaled.set(idx_width, shape[idx_width] * scale_x, /* apply_dim_correction = */ false); + shape_scaled.set(idx_height, shape[idx_height] * scale_y, /* apply_dim_correction = */ false); + + Tensor dst = create_tensor<Tensor>(shape, data_type); + + ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Create and configure function + NEScale scale; + scale.configure(&src, &dst, info); + + validate(src.info()->padding(), PaddingSize(0, 0, 0, 0)); + validate(dst.info()->padding(), PaddingSize(0, 0, 0, 0)); +} + template <typename T> using NEScaleFixture = ScaleValidationFixture<Tensor, Accessor, NEScale, T>; template <typename T> diff --git a/tests/validation/NEON/Schaar.cpp b/tests/validation/NEON/Schaar.cpp index 85a85cce0a..c093121fca 100644 --- a/tests/validation/NEON/Schaar.cpp +++ b/tests/validation/NEON/Schaar.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -44,57 +44,6 @@ TEST_SUITE(Scharr) TEST_SUITE(W3x3) using NEScharr3x3Fixture = ScharrValidationFixture<Tensor, Accessor, NEScharr3x3, uint8_t, int16_t>; -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(concat(datasets::Small2DShapes(), datasets::Large2DShapes()), datasets::BorderModes()), framework::dataset::make("Format", - Format::U8)), - shape, border_mode, format) -{ - // Generate a random constant value - std::mt19937 gen(library->seed()); - std::uniform_int_distribution<uint8_t> int_dist(0, 255); - const uint8_t constant_border_value = int_dist(gen); - - // Create tensors - Tensor src = create_tensor<Tensor>(shape, data_type_from_format(format)); - Tensor dst_x = create_tensor<Tensor>(shape, DataType::S16); - Tensor dst_y = create_tensor<Tensor>(shape, DataType::S16); - - src.info()->set_format(format); - dst_x.info()->set_format(Format::S16); - dst_y.info()->set_format(Format::S16); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst_x.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst_y.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create scharr 3x3 configure function - NEScharr3x3 scharr; - scharr.configure(&src, &dst_x, &dst_y, border_mode, constant_border_value); - - // Validate valid region - constexpr BorderSize border_size{ 1 }; - const ValidRegion dst_valid_region = shape_to_valid_region(shape, border_mode == BorderMode::UNDEFINED, border_size); - - validate(dst_x.info()->valid_region(), dst_valid_region); - validate(dst_y.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 8); - - calculator.set_border_mode(border_mode); - calculator.set_border_size(1); - - const PaddingSize dst_padding = calculator.required_padding(); - - calculator.set_accessed_elements(16); - calculator.set_access_offset(-1); - - const PaddingSize src_padding = calculator.required_padding(); - - validate(src.info()->padding(), src_padding); - validate(dst_x.info()->padding(), dst_padding); - validate(dst_y.info()->padding(), dst_padding); -} - FIXTURE_DATA_TEST_CASE(RunSmall, NEScharr3x3Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format", Format::U8)), datasets::GradientDimensions())) diff --git a/tests/validation/NEON/Select.cpp 
b/tests/validation/NEON/Select.cpp index 4fe422bda0..199b520ac8 100644 --- a/tests/validation/NEON/Select.cpp +++ b/tests/validation/NEON/Select.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -18,7 +18,7 @@ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONCLCTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "arm_compute/core/Types.h" @@ -41,8 +41,6 @@ namespace validation { namespace { -auto configuration_dataset = combine(framework::dataset::concat(datasets::SmallShapes(), datasets::LargeShapes()), - framework::dataset::make("has_same_rank", { false, true })); auto run_small_dataset = combine(datasets::SmallShapes(), framework::dataset::make("has_same_rank", { false, true })); auto run_large_dataset = combine(datasets::LargeShapes(), framework::dataset::make("has_same_rank", { false, true })); } // namespace @@ -100,26 +98,6 @@ TEST_SUITE(Float) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_SUITE(F16) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, configuration_dataset, - shape, same_rank) -{ - const DataType dt = DataType::F16; - - // Create tensors - Tensor ref_c = create_tensor(detail::select_condition_shape(shape, same_rank), DataType::U8); - Tensor ref_x = create_tensor(shape, dt); - Tensor ref_y = create_tensor(shape, dt); - Tensor dst = create_tensor(shape, dt); - - // Create and Configure function - NESelect select; - select.configure(&ref_c, &ref_x, &ref_y, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); -} - FIXTURE_DATA_TEST_CASE(RunSmall, NESelectFixture, framework::DatasetMode::PRECOMMIT, @@ -141,26 +119,6 @@ TEST_SUITE_END() // F16 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ TEST_SUITE(FP32) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, configuration_dataset, - shape, same_rank) -{ - const DataType dt = DataType::F32; - - // Create tensors - Tensor ref_c = create_tensor(detail::select_condition_shape(shape, same_rank), DataType::U8); - Tensor ref_x = create_tensor(shape, dt); - Tensor ref_y = create_tensor(shape, dt); - Tensor dst = create_tensor(shape, dt); - - // Create and Configure function - NESelect select; - select.configure(&ref_c, &ref_x, &ref_y, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); -} - FIXTURE_DATA_TEST_CASE(RunSmall, NESelectFixture, framework::DatasetMode::PRECOMMIT, diff --git a/tests/validation/NEON/Slice.cpp b/tests/validation/NEON/Slice.cpp index 1b35bfa30c..54b0fbf9d2 100644 --- a/tests/validation/NEON/Slice.cpp +++ b/tests/validation/NEON/Slice.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -63,24 +63,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( // clang-format on // *INDENT-ON* -DATA_TEST_CASE(Configuration, - framework::DatasetMode::ALL, - combine(arm_compute::test::datasets::SmallSliceDataset(), framework::dataset::make("DataType", { DataType::QASYMM8, DataType::F32 })), - shape, starts, ends, data_type) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - Tensor dst; - - // Create and Configure function - NESlice slice; - slice.configure(&src, &dst, starts, ends); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(dst.info()->tensor_shape()); - validate(dst.info()->valid_region(), valid_region); -} - template using NESliceFixture = SliceFixture; diff --git a/tests/validation/NEON/Sobel.cpp b/tests/validation/NEON/Sobel.cpp index 2765057ae7..e090bcd6e2 100644 --- a/tests/validation/NEON/Sobel.cpp +++ b/tests/validation/NEON/Sobel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -49,57 +49,6 @@ TEST_SUITE(Sobel) TEST_SUITE(W3x3) using NESobel3x3Fixture = SobelValidationFixture; -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format", - Format::U8)), - shape, border_mode, format) -{ - // Generate a random constant value - std::mt19937 gen(library->seed()); - std::uniform_int_distribution int_dist(0, 255); - const uint8_t constant_border_value = int_dist(gen); - - // Create tensors - Tensor src = create_tensor(shape, data_type_from_format(format)); - Tensor dst_x = create_tensor(shape, DataType::S16); - Tensor dst_y = create_tensor(shape, DataType::S16); - - src.info()->set_format(format); - dst_x.info()->set_format(Format::S16); - dst_y.info()->set_format(Format::S16); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst_x.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst_y.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create sobel 3x3 configure function - NESobel3x3 sobel; - sobel.configure(&src, &dst_x, &dst_y, border_mode, constant_border_value); - - // Validate valid region - constexpr BorderSize border_size{ 1 }; - const ValidRegion dst_valid_region = shape_to_valid_region(shape, border_mode == BorderMode::UNDEFINED, border_size); - - validate(dst_x.info()->valid_region(), dst_valid_region); - validate(dst_y.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 8); - - calculator.set_border_mode(border_mode); - calculator.set_border_size(1); - - const PaddingSize dst_padding = calculator.required_padding(); - - calculator.set_accessed_elements(16); - calculator.set_access_offset(-1); - - const PaddingSize src_padding = calculator.required_padding(); - - validate(src.info()->padding(), src_padding); - validate(dst_x.info()->padding(), dst_padding); - validate(dst_y.info()->padding(), dst_padding); -} - TEST_SUITE(X) FIXTURE_DATA_TEST_CASE(RunSmall, NESobel3x3Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format", Format::U8)), @@ -168,56 +117,6 @@ TEST_SUITE_END() TEST_SUITE(W5x5) using NESobel5x5Fixture = SobelValidationFixture; -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, 
combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format", - Format::U8)), - shape, border_mode, format) -{ - // Generate a random constant value - std::mt19937 gen(library->seed()); - std::uniform_int_distribution int_dist(0, 255); - const uint8_t constant_border_value = int_dist(gen); - - // Create tensors - Tensor src = create_tensor(shape, data_type_from_format(format)); - Tensor dst_x = create_tensor(shape, DataType::S16); - Tensor dst_y = create_tensor(shape, DataType::S16); - - src.info()->set_format(format); - dst_x.info()->set_format(Format::S16); - dst_y.info()->set_format(Format::S16); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst_x.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst_y.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create sobel 5x5 configure function - NESobel5x5 sobel; - sobel.configure(&src, &dst_x, &dst_y, border_mode, constant_border_value); - - // Validate valid region - constexpr BorderSize border_size{ 2 }; - const ValidRegion dst_valid_region = shape_to_valid_region(shape, border_mode == BorderMode::UNDEFINED, border_size); - - validate(dst_x.info()->valid_region(), dst_valid_region); - validate(dst_y.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 16); - - calculator.set_border_mode(border_mode); - calculator.set_border_size(2); - - const PaddingSize dst_padding = calculator.required_padding(); - - calculator.set_processed_elements(8); - calculator.set_access_offset(-2); - - const PaddingSize src_padding = calculator.required_padding(); - - validate(src.info()->padding(), src_padding); - validate(dst_x.info()->padding(), dst_padding); - validate(dst_y.info()->padding(), dst_padding); -} TEST_SUITE(X) FIXTURE_DATA_TEST_CASE(RunSmall, NESobel5x5Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format", Format::U8)), @@ -285,57 +184,6 @@ TEST_SUITE_END() TEST_SUITE(W7x7) using NESobel7x7Fixture = SobelValidationFixture; - -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format", - Format::U8)), - shape, border_mode, format) -{ - // Generate a random constant value - std::mt19937 gen(library->seed()); - std::uniform_int_distribution int_dist(0, 255); - const uint8_t constant_border_value = int_dist(gen); - - // Create tensors - Tensor src = create_tensor(shape, data_type_from_format(format)); - Tensor dst_x = create_tensor(shape, DataType::S32); - Tensor dst_y = create_tensor(shape, DataType::S32); - - src.info()->set_format(format); - dst_x.info()->set_format(Format::S32); - dst_y.info()->set_format(Format::S32); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst_x.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst_y.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create sobel 7x7 configure function - NESobel7x7 sobel; - sobel.configure(&src, &dst_x, &dst_y, border_mode, constant_border_value); - - // Validate valid region - constexpr BorderSize border_size{ 3 }; - const ValidRegion dst_valid_region = shape_to_valid_region(shape, border_mode == BorderMode::UNDEFINED, border_size); - - validate(dst_x.info()->valid_region(), dst_valid_region); - 
validate(dst_y.info()->valid_region(), dst_valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 8); - - calculator.set_border_mode(border_mode); - calculator.set_border_size(3); - - const PaddingSize dst_padding = calculator.required_padding(); - - calculator.set_accessed_elements(16); - calculator.set_access_offset(-3); - - const PaddingSize src_padding = calculator.required_padding(); - - validate(src.info()->padding(), src_padding); - validate(dst_x.info()->padding(), dst_padding); - validate(dst_y.info()->padding(), dst_padding); -} TEST_SUITE(X) FIXTURE_DATA_TEST_CASE(RunSmall, NESobel7x7Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(), datasets::BorderModes()), framework::dataset::make("Format", Format::U8)), diff --git a/tests/validation/NEON/SoftmaxLayer.cpp b/tests/validation/NEON/SoftmaxLayer.cpp index 70203d9ce9..2a9e30604e 100644 --- a/tests/validation/NEON/SoftmaxLayer.cpp +++ b/tests/validation/NEON/SoftmaxLayer.cpp @@ -73,6 +73,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 12)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis high QuantizationInfo(1.f/256, 12)), TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis low @@ -85,6 +86,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 0)), + TensorInfo(TensorShape(32U, 13U), 1, DataType::F32), TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 0)), TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, @@ -95,18 +97,20 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( 1.0, 2.0, 1.0, + 1.0, 2.0, 1.0, })), framework::dataset::make("axis", { 0, 0, 0, + 1, 0, - 0, + -1, 2, -3, })), - framework::dataset::make("Expected", { false, false, false, true, true, false, false })), + framework::dataset::make("Expected", { false, false, false, true, true, true, false, false })), input_info, output_info, beta, axis, expected) { ARM_COMPUTE_EXPECT(bool(NESoftmaxLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), beta, axis)) == expected, framework::LogLevel::ERRORS); @@ -123,7 +127,7 @@ TEST_SUITE(FP16) FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) + framework::dataset::make("Axis", { 0, 1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f16); @@ -131,7 +135,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture, framework::Dataset FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) + framework::dataset::make("Axis", { 0, 2, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f16); @@ -151,7 +155,7 @@ 
TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) + framework::dataset::make("Axis", { 0, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); @@ -159,7 +163,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture, framework::Data FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Beta", { 1.0f, 2.0f })), - framework::dataset::make("Axis", { 0 }))) + framework::dataset::make("Axis", { 0, -2, 3 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); @@ -184,7 +188,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture, fram framework::dataset::make("DataType", DataType::QASYMM8)), combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0 }))) + framework::dataset::make("Axis", { 0, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); @@ -193,7 +197,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture, fram framework::dataset::make("DataType", DataType::QASYMM8)), combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0 }))) + framework::dataset::make("Axis", { 0, 1, -2 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); @@ -214,7 +218,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture, frame framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0 }))) + framework::dataset::make("Axis", { 0, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8_signed); @@ -223,7 +227,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture, frame framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), framework::dataset::make("Beta", { 1.0f, 2.f }))), - framework::dataset::make("Axis", { 0 }))) + framework::dataset::make("Axis", { 0, 1, -1 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8_signed); diff --git a/tests/validation/NEON/Split.cpp b/tests/validation/NEON/Split.cpp index a80f9acc88..e7133fa530 100644 --- a/tests/validation/NEON/Split.cpp +++ b/tests/validation/NEON/Split.cpp @@ -91,66 +91,6 @@ DATA_TEST_CASE(ValidateSplitShapes, framework::DatasetMode::ALL, zip(zip(zip( // clang-format on // *INDENT-ON* -DATA_TEST_CASE(Configuration, - framework::DatasetMode::ALL, - combine(datasets::SmallSplitDataset(), framework::dataset::make("DataType", { DataType::QASYMM8, DataType::F32 })), - shape, axis, splits, data_type) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - std::vector dsts(splits); - std::vector dsts_ptrs; - dsts_ptrs.reserve(splits); - for(auto &dst : dsts) - 
{ - dsts_ptrs.emplace_back(&dst); - } - - // Create and Configure function - NESplit split; - split.configure(&src, dsts_ptrs, axis); - - // Validate valid regions - for(auto &dst : dsts) - { - const ValidRegion valid_region = shape_to_valid_region(dst.info()->tensor_shape()); - validate(dst.info()->valid_region(), valid_region); - } -} - -DATA_TEST_CASE(ConfigurationSplitShapes, - framework::DatasetMode::ALL, - combine(datasets::SmallSplitShapesDataset(), framework::dataset::make("DataType", { DataType::F16, DataType::F32 })), - shape, axis, split_shapes, data_type) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - std::vector dsts; - - for(const auto &split_shape : split_shapes) - { - Tensor dst = create_tensor(split_shape, data_type); - dsts.push_back(std::move(dst)); - } - - std::vector dsts_ptrs; - for(auto &dst : dsts) - { - dsts_ptrs.emplace_back(&dst); - } - - // Create and Configure function - NESplit split; - split.configure(&src, dsts_ptrs, axis); - - // Validate valid regions - for(auto &dst : dsts) - { - const ValidRegion valid_region = shape_to_valid_region(dst.info()->tensor_shape()); - validate(dst.info()->valid_region(), valid_region); - } -} - template using NESplitFixture = SplitFixture; diff --git a/tests/validation/NEON/StackLayer.cpp b/tests/validation/NEON/StackLayer.cpp index 9ba709a3d5..df0de81b4f 100644 --- a/tests/validation/NEON/StackLayer.cpp +++ b/tests/validation/NEON/StackLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -77,35 +77,6 @@ const auto shapes_3d_large = combine(datasets::Medium3DShapes(), framework::data /** Shapes 4D to test */ const auto shapes_4d_large = combine(datasets::Medium4DShapes(), framework::dataset::make("Axis", -4, 5)); - -/** Configuration test */ -void validate_configuration(TensorShape shape_in, int axis, DataType data_type, int num_tensors) -{ - // Wrap around negative values - const unsigned int axis_u = wrap_around(axis, static_cast(shape_in.num_dimensions() + 1)); - - const TensorShape shape_dst = compute_stack_shape(TensorInfo(shape_in, 1, data_type), axis_u, num_tensors); - - std::vector tensors(num_tensors); - std::vector src(num_tensors); - - // Create vector of input tensors - for(int i = 0; i < num_tensors; ++i) - { - tensors[i] = create_tensor(shape_in, data_type); - src[i] = &(tensors[i]); - ARM_COMPUTE_EXPECT(src[i]->info()->is_resizable(), framework::LogLevel::ERRORS); - } - - // Create tensors - Tensor dst = create_tensor(shape_dst, data_type); - - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEStackLayer stack; - stack.configure(src, axis, &dst); -} } // namespace /** Fixture to use */ @@ -148,15 +119,6 @@ input_info, output_info, axis, expected) } TEST_SUITE(Shapes1D) - -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(shapes_1d_small, - data_types), - n_values), -shape_in, axis, data_type, num_tensors) -{ - validate_configuration(shape_in, axis, data_type, num_tensors); -} - TEST_SUITE(S32) FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture, framework::DatasetMode::ALL, combine(combine(shapes_1d_small, @@ -219,15 +181,6 @@ TEST_SUITE_END() // S8 TEST_SUITE_END() // Shapes1D TEST_SUITE(Shapes2D) - -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(shapes_2d_small, - data_types), - n_values), -shape_in, axis, data_type, num_tensors) -{ - validate_configuration(shape_in, 
axis, data_type, num_tensors); -} - TEST_SUITE(S32) FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture, framework::DatasetMode::ALL, combine(combine(shapes_2d_small, @@ -290,14 +243,6 @@ TEST_SUITE_END() // S8 TEST_SUITE_END() // Shapes2D TEST_SUITE(Shapes3D) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(shapes_3d_small, - data_types), - n_values), -shape_in, axis, data_type, num_tensors) -{ - validate_configuration(shape_in, axis, data_type, num_tensors); -} - TEST_SUITE(S32) FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture, framework::DatasetMode::ALL, combine(combine(shapes_3d_small, @@ -360,14 +305,6 @@ TEST_SUITE_END() // S8 TEST_SUITE_END() // Shapes3D TEST_SUITE(Shapes4D) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(shapes_4d_small, - data_types), - n_values), -shape_in, axis, data_type, num_tensors) -{ - validate_configuration(shape_in, axis, data_type, num_tensors); -} - TEST_SUITE(S32) FIXTURE_DATA_TEST_CASE(RunSmall, NEStackLayerFixture, framework::DatasetMode::ALL, combine(combine(shapes_4d_small, diff --git a/tests/validation/NEON/StridedSlice.cpp b/tests/validation/NEON/StridedSlice.cpp index 91d5a64f76..8332134b1b 100644 --- a/tests/validation/NEON/StridedSlice.cpp +++ b/tests/validation/NEON/StridedSlice.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -65,24 +65,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( // clang-format on // *INDENT-ON* -DATA_TEST_CASE(Configuration, - framework::DatasetMode::ALL, - combine(arm_compute::test::datasets::SmallStridedSliceDataset(), framework::dataset::make("DataType", { DataType::QASYMM8, DataType::F32 })), - shape, starts, ends, strides, begin_mask, end_mask, shrink_mask, data_type) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - Tensor dst; - - // Create and Configure function - NEStridedSlice strided_slice; - strided_slice.configure(&src, &dst, starts, ends, strides, begin_mask, end_mask, shrink_mask); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(dst.info()->tensor_shape()); - validate(dst.info()->valid_region(), valid_region); -} - template using NEStridedSliceFixture = StridedSliceFixture; diff --git a/tests/validation/NEON/TableLookup.cpp b/tests/validation/NEON/TableLookup.cpp index 647c486f7a..cbd16c99f3 100644 --- a/tests/validation/NEON/TableLookup.cpp +++ b/tests/validation/NEON/TableLookup.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -50,42 +50,7 @@ TEST_SUITE(TableLookup) template using NETableLookupFixture = TableLookupValidationFixture, Lut, T>; TEST_SUITE(U8) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::U8, DataType::S16 })), - shape, data_type) -{ - // Create Lut - const int num_elem = (data_type == DataType::U8) ? 
std::numeric_limits::max() + 1 : std::numeric_limits::max() - std::numeric_limits::lowest() + 1; - Lut lut(num_elem, data_type); - - switch(data_type) - { - case DataType::U8: - fill_lookuptable(LutAccessor(lut)); - break; - case DataType::S16: - fill_lookuptable(LutAccessor(lut)); - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - } - - // Create tensors - Tensor src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, data_type); - // Create and Configure function - NETableLookup table_lookup; - table_lookup.configure(&src, &lut, &dst); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - validate(src.info()->padding(), padding); - validate(dst.info()->padding(), padding); -} FIXTURE_DATA_TEST_CASE(RunSmallU8, NETableLookupFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8))) { // Validate output diff --git a/tests/validation/NEON/Threshold.cpp b/tests/validation/NEON/Threshold.cpp index 917a8a2b90..97e98d7224 100644 --- a/tests/validation/NEON/Threshold.cpp +++ b/tests/validation/NEON/Threshold.cpp @@ -40,30 +40,6 @@ namespace validation TEST_SUITE(NEON) TEST_SUITE(Threshold) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), datasets::MixedThresholdDataset()), - framework::dataset::make("DataType", DataType::U8)), - shape, threshold, false_value, true_value, type, upper, data_type) -{ - // Create tensors - Tensor src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEThreshold thrsh; - thrsh.configure(&src, &dst, ThresholdKernelInfo(threshold, false_value, true_value, type, upper)); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - validate(src.info()->padding(), PaddingSize()); - validate(dst.info()->padding(), PaddingSize()); -} - template using ThresholdFixture = ThresholdValidationFixture; diff --git a/tests/validation/NEON/Upsample.cpp b/tests/validation/NEON/Upsample.cpp index 221f6904d4..799e513fb1 100644 --- a/tests/validation/NEON/Upsample.cpp +++ b/tests/validation/NEON/Upsample.cpp @@ -41,31 +41,6 @@ namespace validation TEST_SUITE(NEON) TEST_SUITE(UpsampleLayer) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, (combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32))), - input_shape, data_type) -{ - InterpolationPolicy policy = InterpolationPolicy::NEAREST_NEIGHBOR; - Size2D info = Size2D(2, 2); - - // Create tensors - Tensor src = create_tensor(input_shape, data_type, 1); - Tensor dst; - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEUpsampleLayer upsample; - upsample.configure(&src, &dst, info, policy); - - // Validate valid region - const ValidRegion src_valid_region = shape_to_valid_region(src.info()->tensor_shape()); - const ValidRegion dst_valid_region = 
shape_to_valid_region(dst.info()->tensor_shape()); - - validate(src.info()->valid_region(), src_valid_region); - validate(dst.info()->valid_region(), dst_valid_region); -} - // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( diff --git a/tests/validation/NEON/WarpAffine.cpp b/tests/validation/NEON/WarpAffine.cpp index ce5360b2cf..92dfe23736 100644 --- a/tests/validation/NEON/WarpAffine.cpp +++ b/tests/validation/NEON/WarpAffine.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -53,48 +53,6 @@ constexpr AbsoluteTolerance tolerance(1); TEST_SUITE(NEON) TEST_SUITE(WarpAffine) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), - framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })), - datasets::BorderModes()), - shape, data_type, policy, border_mode) -{ - // Generate a random constant value if border_mode is constant - std::mt19937 gen(library->seed()); - std::uniform_int_distribution distribution_u8(0, 255); - uint8_t constant_border_value = distribution_u8(gen); - - // Create the matrix - std::array matrix{ {} }; - fill_warp_matrix<9>(matrix); - - // Create tensors - Tensor src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEWarpAffine warp_affine; - warp_affine.configure(&src, &dst, matrix, policy, border_mode, constant_border_value); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 1); - calculator.set_border_mode(border_mode); - calculator.set_border_size(1); - - const PaddingSize read_padding(1); - const PaddingSize write_padding = calculator.required_padding(); - - validate(src.info()->padding(), read_padding); - validate(dst.info()->padding(), write_padding); -} - template using NEWarpAffineFixture = WarpAffineValidationFixture; diff --git a/tests/validation/NEON/WarpPerspective.cpp b/tests/validation/NEON/WarpPerspective.cpp index d146bda705..1c56220a21 100644 --- a/tests/validation/NEON/WarpPerspective.cpp +++ b/tests/validation/NEON/WarpPerspective.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -50,54 +50,6 @@ constexpr float tolerance_number = 0.2f; TEST_SUITE(NEON) TEST_SUITE(WarpPerspective) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), - framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })), - datasets::BorderModes()), - shape, data_type, policy, border_mode) -{ - uint8_t constant_border_value = 0; - - // Generate a random constant value if border_mode is constant - if(border_mode == BorderMode::CONSTANT) - { - std::mt19937 gen(library->seed()); - std::uniform_int_distribution distribution_u8(0, 255); - constant_border_value = distribution_u8(gen); - } - - // Create the matrix - std::array matrix = { { 0 } }; - fill_warp_matrix<9>(matrix); - - // Create tensors - Tensor src = create_tensor(shape, data_type); - Tensor dst = create_tensor(shape, data_type); - - ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - NEWarpPerspective warp_perspective; - warp_perspective.configure(&src, &dst, matrix, policy, border_mode, constant_border_value); - - // Validate valid region - const ValidRegion valid_region = shape_to_valid_region(shape); - - validate(src.info()->valid_region(), valid_region); - validate(dst.info()->valid_region(), valid_region); - - // Validate padding - PaddingCalculator calculator(shape.x(), 1); - calculator.set_border_mode(border_mode); - calculator.set_border_size(1); - - const PaddingSize read_padding(1); - const PaddingSize write_padding = calculator.required_padding(); - - validate(src.info()->padding(), read_padding); - validate(dst.info()->padding(), write_padding); -} - template using NEWarpPerspectiveFixture = WarpPerspectiveValidationFixture; diff --git a/tests/validation/fixtures/ConvolutionLayerFixture.h b/tests/validation/fixtures/ConvolutionLayerFixture.h index ec13e1d3e0..e1452f5dfc 100644 --- a/tests/validation/fixtures/ConvolutionLayerFixture.h +++ b/tests/validation/fixtures/ConvolutionLayerFixture.h @@ -42,12 +42,22 @@ namespace arm_compute { -class NEConvolutionLayer; - namespace test { namespace validation { +namespace detail +{ +template +void configure_conv_function(ConvolutionFunction &func, + TensorType *src, const TensorType *weights, const TensorType *bias, TensorType *dst, + const PadStrideInfo &info, const WeightsInfo &weights_info, + const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups) +{ + func.configure(src, weights, bias, dst, info, weights_info, dilation, act_info, num_groups); +} +} // namespace detail + template class ConvolutionValidationGenericFixture : public framework::Fixture { @@ -171,7 +181,7 @@ class ConvolutionValidationGenericFixture : public framework::Fixture // Create and configure function FunctionType conv; - conv.configure(&src, &weights, &bias, &dst, info, weights_info, dilation, act_info, num_groups); + detail::configure_conv_function(conv, &src, &weights, &bias, &dst, info, weights_info, dilation, act_info, num_groups); ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS); diff --git a/tests/validation/fixtures/DirectConvolutionLayerFixture.h b/tests/validation/fixtures/DirectConvolutionLayerFixture.h index 
3da5158e97..e37063e2e5 100644 --- a/tests/validation/fixtures/DirectConvolutionLayerFixture.h +++ b/tests/validation/fixtures/DirectConvolutionLayerFixture.h @@ -51,13 +51,10 @@ class DirectConvolutionValidationGenericFixture : public framework::Fixture public: using TBias = typename std::conditional < std::is_same::value || std::is_same::value, int32_t, T >::type; -public: template void setup(TensorShape input_shape, int stride_x, int stride_y, int pad_x, int pad_y, unsigned int kernel_size, unsigned int num_kernels, DataType data_type, QuantizationInfo quantization_info, ActivationLayerInfo act_info, DataLayout data_layout) { - ARM_COMPUTE_ERROR_ON(data_layout == DataLayout::UNKNOWN); - _quantization_info = quantization_info; _data_type = data_type; diff --git a/tests/validation/fixtures/ElementwiseOperationsFixture.h b/tests/validation/fixtures/ElementwiseOperationsFixture.h index ebc52d5083..dcb408c801 100644 --- a/tests/validation/fixtures/ElementwiseOperationsFixture.h +++ b/tests/validation/fixtures/ElementwiseOperationsFixture.h @@ -59,16 +59,23 @@ class ArithmeticOperationsGenericFixture : public framework::Fixture template void fill(U &&tensor, int i) { - switch(_op) + if(is_data_type_float(tensor.data_type())) { - case ArithmeticOperation::DIV: - library->fill_tensor_uniform_ranged(tensor, i, { std::pair(-0.001f, 0.001f) }); - break; - case ArithmeticOperation::POWER: - library->fill_tensor_uniform(tensor, i, 0.0f, 5.0f); - break; - default: - library->fill_tensor_uniform(tensor, i); + switch(_op) + { + case ArithmeticOperation::DIV: + library->fill_tensor_uniform_ranged(tensor, i, { std::pair(-0.001f, 0.001f) }); + break; + case ArithmeticOperation::POWER: + library->fill_tensor_uniform(tensor, i, 0.0f, 5.0f); + break; + default: + library->fill_tensor_uniform(tensor, i); + } + } + else + { + library->fill_tensor_uniform(tensor, i); } } diff --git a/tests/validation/fixtures/GEMMFixture.h b/tests/validation/fixtures/GEMMFixture.h index 0a964a7114..358056ad65 100644 --- a/tests/validation/fixtures/GEMMFixture.h +++ b/tests/validation/fixtures/GEMMFixture.h @@ -702,8 +702,11 @@ class GEMMMatrixMultiplyReshapedValidationFixture : public framework::Fixture broadcast_bias ? 1 : m, broadcast_bias ? 
1 : batch_size); - _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, broadcast_bias, act_info); - _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, broadcast_bias, act_info); + _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, broadcast_bias, act_info); + if(validate_result) + { + _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, broadcast_bias, act_info); + } } protected: @@ -748,6 +751,14 @@ class GEMMMatrixMultiplyReshapedValidationFixture : public framework::Fixture ReshapeLHSFunctionType reshape_lhs; ReshapeRHSFunctionType reshape_rhs; GEMMFunctionType gemm; + + validate_result = bool(reshape_rhs.validate(rhs.info(), rhs_reshaped.info(), rhs_info)); + validate_result = validate_result || !rhs_info.export_to_cl_image; + if(!validate_result) + { + return nullptr; + } + reshape_lhs.configure(&lhs, &lhs_reshaped, lhs_info); reshape_rhs.configure(&rhs, &rhs_reshaped, rhs_info); gemm.configure(&lhs_reshaped, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info); @@ -824,6 +835,7 @@ class GEMMMatrixMultiplyReshapedValidationFixture : public framework::Fixture } } + bool validate_result = true; TensorType _target{}; SimpleTensor _reference{}; }; @@ -859,8 +871,11 @@ class GEMMMatrixMultiplyReshaped3DValidationFixture : public framework::Fixture const TensorShape rhs_shape(n, k, batch_size); const TensorShape bias_shape(n, 1, 1); - _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, m_h, act_info); - _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, m_h, act_info); + _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, m_h, act_info); + if(validate_result) + { + _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, m_h, act_info); + } } protected: @@ -901,6 +916,14 @@ class GEMMMatrixMultiplyReshaped3DValidationFixture : public framework::Fixture ReshapeLHSFunctionType reshape_lhs; ReshapeRHSFunctionType reshape_rhs; GEMMFunctionType gemm; + + validate_result = bool(reshape_rhs.validate(rhs.info(), rhs_reshaped.info(), rhs_info)); + validate_result = validate_result || !rhs_info.export_to_cl_image; + if(!validate_result) + { + return nullptr; + } + reshape_lhs.configure(&lhs, &lhs_reshaped, lhs_info); reshape_rhs.configure(&rhs, &rhs_reshaped, rhs_info); gemm.configure(&lhs_reshaped, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info); @@ -976,6 +999,7 @@ class GEMMMatrixMultiplyReshaped3DValidationFixture : public framework::Fixture } } + bool validate_result = true; TensorType _target{}; SimpleTensor _reference{}; }; @@ -1007,8 +1031,11 @@ class GEMMMatrixMultiplyReshapedOnlyRHSValidationFixture : public framework::Fix broadcast_bias ? 1 : m, broadcast_bias ? 
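The validate_result guard added to the GEMM fixtures above is easy to misread, so here is the same logic restated as a stand-alone sketch: the fixture skips both target and reference computation only when RHS-reshape validation fails and export_to_cl_image was requested, because that combination just means the device cannot export to a CL image; any other validation failure still reaches configure() and fails visibly.

// Restatement of the guard (sketch, not library code):
inline bool should_run_test_case(bool reshape_rhs_validate_ok, bool export_to_cl_image)
{
    // validate_ok | export_to_cl_image | result
    // true        | any                | run normally
    // false       | true               | skip (device lacks cl_image support)
    // false       | false              | run, so the genuine error surfaces
    return reshape_rhs_validate_ok || !export_to_cl_image;
}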
1 : batch_size); - _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, broadcast_bias, act_info); - _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, broadcast_bias, act_info); + _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, broadcast_bias, act_info); + if(validate_result) + { + _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, broadcast_bias, act_info); + } } protected: @@ -1050,6 +1077,14 @@ class GEMMMatrixMultiplyReshapedOnlyRHSValidationFixture : public framework::Fix // Create and configure function ReshapeRHSFunctionType reshape_rhs; GEMMFunctionType gemm; + + validate_result = bool(reshape_rhs.validate(rhs.info(), rhs_reshaped.info(), rhs_info)); + validate_result = validate_result || !rhs_info.export_to_cl_image; + if(!validate_result) + { + return nullptr; + } + reshape_rhs.configure(&rhs, &rhs_reshaped, rhs_info); gemm.configure(&lhs, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info); @@ -1115,6 +1150,7 @@ class GEMMMatrixMultiplyReshapedOnlyRHSValidationFixture : public framework::Fix return reference::activation_layer(reference::gemm(lhs, rhs, bias, alpha, beta), act_info); } + bool validate_result = true; TensorType _target{}; SimpleTensor _reference{}; }; @@ -1125,7 +1161,7 @@ class GEMMMatrixMultiplyReshapedOnlyRHS3DValidationFixture : public framework::F public: template void setup(unsigned int m_w, unsigned int m_h, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int h0, - bool interleave_rhs, bool transpose_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, const ActivationLayerInfo &act_info) + bool interleave_rhs, bool transpose_rhs, bool export_to_cl_image, bool has_pad_y, DataType data_type, float alpha, float beta, const ActivationLayerInfo &act_info) { GEMMLHSMatrixInfo lhs_info; lhs_info.m0 = m0; @@ -1147,8 +1183,11 @@ class GEMMMatrixMultiplyReshapedOnlyRHS3DValidationFixture : public framework::F const TensorShape rhs_shape(n, k, batch_size); const TensorShape bias_shape(n, 1, 1); - _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, m_h, act_info); - _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, m_h, act_info); + _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, m_h, act_info, has_pad_y); + if(validate_result) + { + _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, m_h, act_info); + } } protected: @@ -1161,7 +1200,7 @@ class GEMMMatrixMultiplyReshapedOnlyRHS3DValidationFixture : public framework::F TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, DataType data_type, float alpha, float beta, - unsigned int m_h, const ActivationLayerInfo &act_info) + unsigned int m_h, const ActivationLayerInfo &act_info, bool has_pad_y) { // Create tensors TensorType lhs = create_tensor(lhs_shape, data_type, 1); @@ -1181,15 +1220,30 @@ class GEMMMatrixMultiplyReshapedOnlyRHS3DValidationFixture : public framework::F kernel_info.reinterpret_input_as_3d = false; kernel_info.broadcast_bias = true; kernel_info.activation_info = act_info; + kernel_info.has_pad_y = has_pad_y; // The output tensor will be 
auto-initialized within the function - // Create and configure function ReshapeRHSFunctionType reshape_rhs; GEMMFunctionType gemm; + + validate_result = bool(reshape_rhs.validate(rhs.info(), rhs_reshaped.info(), rhs_info)); + validate_result = validate_result || !rhs_info.export_to_cl_image; + if(!validate_result) + { + return nullptr; + } + reshape_rhs.configure(&rhs, &rhs_reshaped, rhs_info); gemm.configure(&lhs, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info); + if(has_pad_y) + { + // Add dummy padding into lhs to validate has_pad_y path + lhs.info()->extend_padding(PaddingSize(2, 0, 2, 0)); + dst.info()->extend_padding(PaddingSize(2, 0, 1, 0)); + } + ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(rhs.info()->is_resizable(), framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS); @@ -1251,6 +1305,7 @@ class GEMMMatrixMultiplyReshapedOnlyRHS3DValidationFixture : public framework::F return reference::activation_layer(reference::gemm(lhs, rhs, bias, alpha, beta), act_info); } + bool validate_result = true; TensorType _target{}; SimpleTensor _reference{}; }; diff --git a/tests/validation/fixtures/LocallyConnectedFixture.h b/tests/validation/fixtures/LocallyConnectedFixture.h deleted file mode 100644 index 2e2b71665b..0000000000 --- a/tests/validation/fixtures/LocallyConnectedFixture.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2017-2018 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
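For the has_pad_y path exercised above: PaddingSize is an alias of BorderSize, whose constructor takes (top, right, bottom, left), so both extend_padding() calls add padding in y only, which is exactly the situation the has_pad_y kernel path must cope with. A commented restatement of those two calls:

// PaddingSize(top, right, bottom, left) -- y-only padding for lhs and dst:
lhs.info()->extend_padding(PaddingSize(2 /* top */, 0 /* right */, 2 /* bottom */, 0 /* left */));
dst.info()->extend_padding(PaddingSize(2 /* top */, 0 /* right */, 1 /* bottom */, 0 /* left */));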
- */ -#ifndef ARM_COMPUTE_TEST_LOCALLY_CONNECTED_FIXTURE -#define ARM_COMPUTE_TEST_LOCALLY_CONNECTED_FIXTURE - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorShape.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "tests/AssetsLibrary.h" -#include "tests/Globals.h" -#include "tests/IAccessor.h" -#include "tests/framework/Asserts.h" -#include "tests/framework/Fixture.h" -#include "tests/validation/Helpers.h" -#include "tests/validation/reference/LocallyConnected.h" -#include "tests/validation/reference/Utils.h" - -#include - -namespace arm_compute -{ -class NELocallyConnected; - -namespace test -{ -namespace validation -{ -template -class LocallyConnectedValidationFixture : public framework::Fixture -{ -public: - using TBias = typename std::conditional::type, uint8_t>::value, int32_t, T>::type; - -public: - template - void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, DataType data_type) - { - ARM_COMPUTE_UNUSED(dilation); - - _data_type = data_type; - _bias_data_type = data_type; - - _target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info); - _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info); - } - -protected: - template - void fill(U &&tensor, int i) - { - std::uniform_real_distribution<> distribution(-1.0f, 1.0f); - library->fill(tensor, distribution, i); - } - - TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info) - { - TensorShape reshaped_weights_shape(weights_shape); - - // Create tensors - TensorType src = create_tensor(input_shape, _data_type); - TensorType weights = create_tensor(reshaped_weights_shape, _data_type); - TensorType bias = create_tensor(bias_shape, _bias_data_type); - TensorType dst = create_tensor(output_shape, _data_type); - - // Create and configure function - FunctionType locally_connected; - locally_connected.configure(&src, &weights, &bias, &dst, info); - - // Allocate tensors - src.allocator()->allocate(); - weights.allocator()->allocate(); - bias.allocator()->allocate(); - dst.allocator()->allocate(); - - ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(!weights.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Fill tensors - fill(AccessorType(src), 0); - fill(AccessorType(weights), 1); - fill(AccessorType(bias), 2); - - locally_connected.run(); - - return dst; - } - - SimpleTensor compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info) - { - // Create reference - SimpleTensor src(input_shape, _data_type); - SimpleTensor weights(weights_shape, _data_type); - SimpleTensor bias(bias_shape, _bias_data_type); - - // Fill reference - fill(src, 0); - fill(weights, 1); - fill(bias, 2); - - return reference::locally_connected(src, weights, bias, output_shape, info); - } - - TensorType _target{}; - SimpleTensor _reference{}; - DataType _data_type{}; - DataType _bias_data_type{}; -}; - -} // namespace validation -} // namespace test -} // namespace arm_compute -#endif 
/* ARM_COMPUTE_TEST_LOCALLY_CONNECTED_FIXTURE */ diff --git a/tests/validation/fixtures/LogicalFixture.h b/tests/validation/fixtures/LogicalFixture.h new file mode 100644 index 0000000000..4bedb378bb --- /dev/null +++ b/tests/validation/fixtures/LogicalFixture.h @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_TEST_LOGICAL_FIXTURE +#define ARM_COMPUTE_TEST_LOGICAL_FIXTURE + +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/Types.h" +#include "tests/AssetsLibrary.h" +#include "tests/Globals.h" +#include "tests/IAccessor.h" +#include "tests/framework/Asserts.h" +#include "tests/framework/Fixture.h" +#include "tests/validation/reference/Logical.h" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +template +class LogicalOperationValidationFixtureBase : public framework::Fixture +{ +protected: + template + void fill(U &&tensor, int i) + { + constexpr auto zero = (uint8_t)0; + constexpr auto one = (uint8_t)0x1; + constexpr auto mixed = (uint8_t)0xAA; + constexpr auto mixed_bitwise_not = (uint8_t) ~(0xAA); + + library->fill_static_values(tensor, i == 0 ? 
+ std::vector { zero, one, zero, one, mixed, zero, mixed } : + std::vector { zero, zero, one, one, zero, mixed, mixed_bitwise_not }); + } + + void allocate_tensor(std::initializer_list tensors) + { + for(auto t : tensors) + { + ARM_COMPUTE_EXPECT(t->info()->is_resizable(), framework::LogLevel::ERRORS); + t->allocator()->allocate(); + ARM_COMPUTE_EXPECT(!t->info()->is_resizable(), framework::LogLevel::ERRORS); + } + } + + TensorType _target{}; + SimpleTensor _reference{}; +}; + +template +using LogicalBinaryRefFunctionPtrType = SimpleTensor(const SimpleTensor &, const SimpleTensor &); + +template RefFunction> +class LogicalBinaryOperationValidationFixture : public LogicalOperationValidationFixtureBase +{ + using Parent = LogicalOperationValidationFixtureBase; + +public: + template + void setup(TensorShape shape0, TensorShape shape1) + { + Parent::_target = compute_target(shape0, shape1); + Parent::_reference = compute_reference(shape0, shape1); + } + +private: + TensorType compute_target(const TensorShape &shape0, const TensorShape &shape1) + { + TensorType src0 = create_tensor(shape0, _data_type); + TensorType src1 = create_tensor(shape1, _data_type); + TensorType dst = create_tensor(TensorShape::broadcast_shape(shape0, shape1), _data_type); + + FunctionType logical_binary_op; + + logical_binary_op.configure(&src0, &src1, &dst); + + Parent::allocate_tensor({ &src0, &src1, &dst }); + + Parent::fill(AccessorType(src0), 0); + Parent::fill(AccessorType(src1), 1); + + logical_binary_op.run(); + + return dst; + } + + SimpleTensor compute_reference(const TensorShape &shape0, const TensorShape &shape1) + { + // Create reference + SimpleTensor src0{ shape0, _data_type }; + SimpleTensor src1{ shape1, _data_type }; + + // Fill reference + Parent::fill(src0, 0); + Parent::fill(src1, 1); + + return RefFunction(src0, src1); + } + + static constexpr auto _data_type = DataType::U8; +}; + +template +using LogicalOrValidationFixture = LogicalBinaryOperationValidationFixture>; + +template +using LogicalAndValidationFixture = LogicalBinaryOperationValidationFixture>; + +template +class LogicalNotValidationFixture : public LogicalOperationValidationFixtureBase +{ + using Parent = LogicalOperationValidationFixtureBase; + +public: + template + void setup(TensorShape shape, DataType data_type) + { + Parent::_target = compute_target(shape, data_type); + Parent::_reference = compute_reference(shape, data_type); + } + +private: + TensorType compute_target(const TensorShape &shape, DataType data_type) + { + TensorType src = create_tensor(shape, data_type); + TensorType dst = create_tensor(shape, data_type); + + FunctionType logical_not; + + logical_not.configure(&src, &dst); + + Parent::allocate_tensor({ &src, &dst }); + + Parent::fill(AccessorType(src), 0); + + logical_not.run(); + + return dst; + } + + SimpleTensor compute_reference(const TensorShape &shape, DataType data_type) + { + // Create reference + SimpleTensor src{ shape, data_type }; + + // Fill reference + Parent::fill(src, 0); + + return reference::logical_not(src); + } +}; +} // namespace validation +} // namespace test +} // namespace arm_compute +#endif /* ARM_COMPUTE_TEST_LOGICAL_FIXTURE */ diff --git a/tests/validation/fixtures/ReduceMeanFixture.h b/tests/validation/fixtures/ReduceMeanFixture.h index d10292182f..72887616fe 100644 --- a/tests/validation/fixtures/ReduceMeanFixture.h +++ b/tests/validation/fixtures/ReduceMeanFixture.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
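The static fill values used by the logical fixtures above are chosen to catch an implementation that computes bitwise rather than logical operations; the pair (0xAA, ~0xAA) is the discriminating case. A short worked example in plain C++, using the fixture's values:

#include <cstdint>
// 0xAA and its bitwise complement are both non-zero, i.e. logically true...
const uint8_t mixed             = 0xAA;           // 1010'1010
const uint8_t mixed_bitwise_not = (uint8_t)~0xAA; // 0101'0101
// ...yet their bitwise AND is zero:
static_assert((0xAA & ~0xAAu & 0xFFu) == 0u, "bitwise AND of the pair is false");
// logical:  mixed && mixed_bitwise_not == true,  mixed || mixed_bitwise_not == true
// bitwise:  mixed &  mixed_bitwise_not == 0x00,  mixed |  mixed_bitwise_not == 0xFF
// A kernel implemented with '&'/'|' instead of '&&'/'||' fails on this pair.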
* * SPDX-License-Identifier: MIT * @@ -26,6 +26,7 @@ #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/Tensor.h" #include "tests/AssetsLibrary.h" #include "tests/Globals.h" @@ -47,10 +48,10 @@ class ReduceMeanValidationFixture : public framework::Fixture { public: template - void setup(TensorShape shape, DataType data_type, Coordinates axis, bool keep_dims, QuantizationInfo quantization_info) + void setup(TensorShape shape, DataType data_type, Coordinates axis, bool keep_dims, QuantizationInfo quantization_info_input, QuantizationInfo quantization_info_output) { - _target = compute_target(shape, data_type, axis, keep_dims, quantization_info); - _reference = compute_reference(shape, data_type, axis, keep_dims, quantization_info); + _target = compute_target(shape, data_type, axis, keep_dims, quantization_info_input, quantization_info_output); + _reference = compute_reference(shape, data_type, axis, keep_dims, quantization_info_input, quantization_info_output); } protected: @@ -71,11 +72,12 @@ class ReduceMeanValidationFixture : public framework::Fixture } } - TensorType compute_target(TensorShape &src_shape, DataType data_type, Coordinates axis, bool keep_dims, QuantizationInfo quantization_info) + TensorType compute_target(TensorShape &src_shape, DataType data_type, Coordinates axis, bool keep_dims, QuantizationInfo quantization_info_input, QuantizationInfo quantization_info_output) { // Create tensors - TensorType src = create_tensor(src_shape, data_type, 1, quantization_info); - TensorType dst; + TensorType src = create_tensor(src_shape, data_type, 1, quantization_info_input); + TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(src.info(), axis, keep_dims); + TensorType dst = create_tensor(dst_shape, data_type, 1, quantization_info_output); // Create and configure function FunctionType reduction_mean; @@ -100,10 +102,10 @@ class ReduceMeanValidationFixture : public framework::Fixture return dst; } - SimpleTensor compute_reference(TensorShape &src_shape, DataType data_type, Coordinates axis, bool keep_dims, QuantizationInfo quantization_info) + SimpleTensor compute_reference(TensorShape &src_shape, DataType data_type, Coordinates axis, bool keep_dims, QuantizationInfo quantization_info_input, QuantizationInfo quantization_info_output) { // Create reference - SimpleTensor src{ src_shape, data_type, 1, quantization_info }; + SimpleTensor src{ src_shape, data_type, 1, quantization_info_input }; // Fill reference fill(src); @@ -113,7 +115,7 @@ class ReduceMeanValidationFixture : public framework::Fixture { TensorShape output_shape = i == 0 ? src_shape : out.shape(); output_shape.set(axis[i], 1); - out = reference::reduction_operation(i == 0 ? src : out, output_shape, axis[i], ReductionOperation::MEAN_SUM); + out = reference::reduction_operation(i == 0 ? 
src : out, output_shape, axis[i], ReductionOperation::MEAN_SUM, quantization_info_output); } if(!keep_dims) @@ -139,9 +141,9 @@ class ReduceMeanQuantizedFixture : public ReduceMeanValidationFixture - void setup(TensorShape shape, DataType data_type, Coordinates axis, bool keep_dims, QuantizationInfo quantization_info = QuantizationInfo()) + void setup(TensorShape shape, DataType data_type, Coordinates axis, bool keep_dims, QuantizationInfo quantization_info_input, QuantizationInfo quantization_info_output) { - ReduceMeanValidationFixture::setup(shape, data_type, axis, keep_dims, quantization_info); + ReduceMeanValidationFixture::setup(shape, data_type, axis, keep_dims, quantization_info_input, quantization_info_output); } }; @@ -152,7 +154,7 @@ class ReduceMeanFixture : public ReduceMeanValidationFixture void setup(TensorShape shape, DataType data_type, Coordinates axis, bool keep_dims) { - ReduceMeanValidationFixture::setup(shape, data_type, axis, keep_dims, QuantizationInfo()); + ReduceMeanValidationFixture::setup(shape, data_type, axis, keep_dims, QuantizationInfo(), QuantizationInfo()); } }; } // namespace validation diff --git a/tests/validation/fixtures/ReductionOperationFixture.h b/tests/validation/fixtures/ReductionOperationFixture.h index 3fb854454b..646518d2e8 100644 --- a/tests/validation/fixtures/ReductionOperationFixture.h +++ b/tests/validation/fixtures/ReductionOperationFixture.h @@ -126,7 +126,7 @@ class ReductionOperationValidationFixture : public framework::Fixture // Fill reference fill(src); - return reference::reduction_operation(src, dst_shape, axis, op); + return reference::reduction_operation(src, dst_shape, axis, op, quantization_info); } TensorType _target{}; diff --git a/tests/validation/fixtures/ScaleFixture.h b/tests/validation/fixtures/ScaleFixture.h index e2ed3ab6f9..1e66306715 100644 --- a/tests/validation/fixtures/ScaleFixture.h +++ b/tests/validation/fixtures/ScaleFixture.h @@ -137,7 +137,7 @@ class ScaleValidationGenericFixture : public framework::Fixture // Create and configure function FunctionType scale; - scale.configure(&src, &dst, ScaleKernelInfo{ _policy, _border_mode, _constant_border_value, _sampling_policy, /* use_padding */ true, _align_corners }); + scale.configure(&src, &dst, ScaleKernelInfo{ _policy, _border_mode, _constant_border_value, _sampling_policy, /* use_padding */ false, _align_corners }); ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); diff --git a/tests/validation/fixtures/SoftmaxLayerFixture.h b/tests/validation/fixtures/SoftmaxLayerFixture.h index 29a3ed2cd0..30356d648d 100644 --- a/tests/validation/fixtures/SoftmaxLayerFixture.h +++ b/tests/validation/fixtures/SoftmaxLayerFixture.h @@ -32,7 +32,6 @@ #include "tests/IAccessor.h" #include "tests/framework/Asserts.h" #include "tests/framework/Fixture.h" -#include "tests/validation/reference/LogSoftmaxLayer.h" #include "tests/validation/reference/SoftmaxLayer.h" #include @@ -52,8 +51,8 @@ class SoftmaxValidationGenericFixture : public framework::Fixture { _quantization_info = quantization_info; - _target = compute_target(shape, data_type, quantization_info, beta, axis); _reference = compute_reference(shape, data_type, quantization_info, beta, axis); + _target = compute_target(shape, data_type, quantization_info, beta, axis); } protected: @@ -62,7 +61,7 @@ class SoftmaxValidationGenericFixture : public framework::Fixture { if(!is_data_type_quantized(tensor.data_type())) { - 
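The ReduceMean changes above let input and output carry different QuantizationInfo, and the reference now requantises the float mean into the output's scale and offset instead of reusing the input's. A sketch of that requantisation for a single reduced slice, with QASYMM8 conventions and illustrative helper names:

#include <algorithm>
#include <cmath>
#include <cstdint>
// Dequantise with the input qinfo, average in float, requantise with the
// output qinfo -- the path taken whenever the two qinfos differ.
inline uint8_t requantized_mean(const uint8_t *q, int n,
                                float scale_in, int32_t offset_in,
                                float scale_out, int32_t offset_out)
{
    float acc = 0.f;
    for(int i = 0; i < n; ++i)
    {
        acc += (static_cast<int32_t>(q[i]) - offset_in) * scale_in; // dequantise
    }
    const float   mean = acc / n;
    const int32_t out  = static_cast<int32_t>(std::lround(mean / scale_out)) + offset_out;
    return static_cast<uint8_t>(std::min(255, std::max(0, out))); // saturate to U8
}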
std::uniform_real_distribution<> distribution(-1000.f, 1000.f); + std::uniform_real_distribution<> distribution(-10.f, 10.f); library->fill(tensor, distribution, 0); } else // data type is quantized_asymmetric (signed or unsigned) @@ -111,14 +110,7 @@ class SoftmaxValidationGenericFixture : public framework::Fixture // Fill reference fill(src); - if(IS_LOG) - { - return reference::log_softmax_layer(src, beta, axis); - } - else - { - return reference::softmax_layer(src, beta, axis); - } + return reference::softmax_layer(src, beta, axis, IS_LOG); } TensorType _target{}; @@ -155,6 +147,7 @@ class SoftmaxValidationQuantizedFixture : public SoftmaxValidationGenericFixture axis); } }; + } // namespace validation } // namespace test } // namespace arm_compute diff --git a/tests/validation/reference/Convolution3d.h b/tests/validation/reference/Convolution3d.h index 03a2f5371d..34e27f499b 100644 --- a/tests/validation/reference/Convolution3d.h +++ b/tests/validation/reference/Convolution3d.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,8 +24,8 @@ #ifndef ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H #define ARM_COMPUTE_TEST_VALIDATION_CONVOLUTION_H -#include "arm_compute/core/utils/misc/Requires.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "support/Requires.h" #include "tests/validation/Helpers.h" #include "tests/validation/reference/UtilsQuantizedAsymm.h" diff --git a/tests/validation/reference/DepthConvertLayer.cpp b/tests/validation/reference/DepthConvertLayer.cpp index 30b7e57a4a..94c719ade7 100644 --- a/tests/validation/reference/DepthConvertLayer.cpp +++ b/tests/validation/reference/DepthConvertLayer.cpp @@ -25,8 +25,8 @@ #include "tests/validation/Helpers.h" -#include "arm_compute/core/utils/misc/Rounding.h" -#include "arm_compute/core/utils/misc/SaturateCast.h" +#include "support/Rounding.h" +#include "support/SaturateCast.h" #include "tests/Types.h" diff --git a/tests/validation/reference/ElementwiseOperations.cpp b/tests/validation/reference/ElementwiseOperations.cpp index aab9d9d00c..f22c84e153 100644 --- a/tests/validation/reference/ElementwiseOperations.cpp +++ b/tests/validation/reference/ElementwiseOperations.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -74,6 +74,15 @@ T arithm_op(ArithmeticOperation op, T src1, T src2, ConvertPolicy convert_policy case ArithmeticOperation::DIV: { val = (static_cast(src1) / static_cast(src2)); + if(std::is_integral::value) + { + // Implement flooring division + val = (src2 == 0) ? 0 : val; + if(static_cast(src1) % static_cast(src2) != 0 && ((src1 < 0) != (src2 < 0))) + { + --val; + } + } break; } case ArithmeticOperation::POWER: diff --git a/tests/validation/reference/LocallyConnected.cpp b/tests/validation/reference/LocallyConnected.cpp deleted file mode 100644 index a5141f291f..0000000000 --- a/tests/validation/reference/LocallyConnected.cpp +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2017-2018 Arm Limited. 
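The integral DIV fix above makes the reference floor the quotient (round toward negative infinity) instead of inheriting C++'s truncation toward zero, and defines division by zero as 0. The same rule as a stand-alone worked example:

#include <cstdint>
// C++ '/' truncates toward zero; the reference now floors instead.
inline int32_t flooring_div(int32_t a, int32_t b)
{
    if(b == 0)
    {
        return 0; // convention adopted by the reference for x / 0
    }
    int32_t q = a / b;                       // truncated quotient
    if((a % b != 0) && ((a < 0) != (b < 0))) // inexact and operands differ in sign
    {
        --q; // e.g. -7 / 2: truncation gives -3, flooring gives -4
    }
    return q;
}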
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "LocallyConnected.h" - -#include "tests/validation/Helpers.h" -#include "tests/validation/reference/Convolution3d.h" -#include "tests/validation/reference/Utils.h" - -#include "tests/framework/Asserts.h" - -namespace arm_compute -{ -namespace test -{ -namespace validation -{ -namespace reference -{ -template -SimpleTensor locally_connected(const SimpleTensor &src, const SimpleTensor &weights, const SimpleTensor &bias, const TensorShape &output_shape, const PadStrideInfo &info) -{ - // Create reference - SimpleTensor dst{ output_shape, src.data_type(), 1, src.quantization_info() }; - - // Compute reference - const int width_in = src.shape().x(); - const int height_in = src.shape().y(); - const int depth_in = src.shape().z(); - - const int width_out = dst.shape().x(); - const int height_out = dst.shape().y(); - const int depth_out = dst.shape().z(); - - const int width_weights = weights.shape().x(); - const int height_weights = weights.shape().y(); - const int depth_weights = weights.shape().z(); - - const int pad_left = info.pad_left(); - const int pad_top = info.pad_top(); - const int stride_xi = info.stride().first; - const int stride_yi = info.stride().second; - - auto output_wh = scaled_dimensions(width_in, height_in, width_weights, height_weights, info); - - const int start_xi = width_weights / 2 - pad_left; - const int start_yi = height_weights / 2 - pad_top; - const int end_xi = output_wh.first * stride_xi; - const int end_yi = output_wh.second * stride_yi; - const int num_batches = src.shape().total_size() / (width_in * height_in * depth_in); - - for(int r = 0; r < num_batches; ++r) - { - int count = 0; - for(int yi = start_yi; yi < start_yi + end_yi; yi += stride_yi) - { - for(int xi = start_xi; xi < start_xi + end_xi; xi += stride_xi) - { - for(int ofm = 0; ofm < depth_out; ++ofm) - { - // Compute input and output offsets - const int offset_in = r * width_in * height_in * depth_in; - const int xo = (xi - start_xi) / stride_xi; - const int yo = (yi - start_yi) / stride_yi; - const int offset_out = xo + yo * width_out + ofm * width_out * height_out + r * width_out * height_out * depth_out; - - ARM_COMPUTE_ASSERT(xo < width_out); - ARM_COMPUTE_ASSERT(yo < height_out); - - // Compute 3D convolution - convolution_3d::detail::convolution3d(src, weights, bias, dst, - offset_in, count * width_weights * height_weights * depth_weights, count, offset_out, - xi, yi, - 
width_in, height_in, depth_in, - width_weights, height_weights); - count++; - } - } - } - } - - return dst; -} - -// Locally Connected only supports F32 -template SimpleTensor locally_connected(const SimpleTensor &src, const SimpleTensor &weights, const SimpleTensor &bias, const TensorShape &output_shape, - const PadStrideInfo &info); -} // namespace reference -} // namespace validation -} // namespace test -} // namespace arm_compute diff --git a/tests/validation/reference/LogSoftmaxLayer.cpp b/tests/validation/reference/LogSoftmaxLayer.cpp deleted file mode 100644 index 8d3b8f7579..0000000000 --- a/tests/validation/reference/LogSoftmaxLayer.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "LogSoftmaxLayer.h" -#include "SoftmaxLayer.h" - -#include "arm_compute/core/Types.h" - -namespace arm_compute -{ -namespace test -{ -namespace validation -{ -namespace reference -{ -template ::value, int>::type> -SimpleTensor log_softmax_layer(const SimpleTensor &src, float beta, int32_t reduce_end_axis) -{ - return softmax_layer_generic(src, beta, reduce_end_axis, true); -} - -template < typename T, typename std::enable_if < std::is_same::value || std::is_same::value, int >::type > -SimpleTensor log_softmax_layer(const SimpleTensor &src, float beta, int32_t reduce_end_axis) -{ - const QuantizationInfo output_quantization_info = arm_compute::get_softmax_output_quantization_info(src.data_type(), true); - - SimpleTensor src_tmp = convert_from_asymmetric(src); - SimpleTensor dst_tmp = log_softmax_layer(src_tmp, beta, reduce_end_axis); - SimpleTensor dst = convert_to_asymmetric(dst_tmp, output_quantization_info); - return dst; -} - -template SimpleTensor log_softmax_layer(const SimpleTensor &src, float beta, int32_t reduce_end_axis); -template SimpleTensor log_softmax_layer(const SimpleTensor &src, float beta, int32_t reduce_end_axis); -template SimpleTensor log_softmax_layer(const SimpleTensor &src, float beta, int32_t reduce_end_axis); -template SimpleTensor log_softmax_layer(const SimpleTensor &src, float beta, int32_t reduce_end_axis); -} // namespace reference -} // namespace validation -} // namespace test -} // namespace arm_compute diff --git a/tests/validation/reference/Logical.cpp b/tests/validation/reference/Logical.cpp new file mode 100644 index 0000000000..9989ec841e --- /dev/null +++ b/tests/validation/reference/Logical.cpp @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "tests/validation/reference/Logical.h" +#include "src/core/KernelTypes.h" +#include "tests/framework/Asserts.h" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +namespace reference +{ +template <typename T> +T logical_binary_op(arm_compute::kernels::LogicalOperation op, T src1, T src2) +{ + switch(op) + { + case arm_compute::kernels::LogicalOperation::And: + return src1 && src2; + case arm_compute::kernels::LogicalOperation::Or: + return src1 || src2; + // The following operators are either invalid or not binary operator + case arm_compute::kernels::LogicalOperation::Not: + /* fall through */ + case arm_compute::kernels::LogicalOperation::Unknown: + /* fall through */ + default: + ARM_COMPUTE_ASSERT(false); + } + return T{}; +} + +template <int dim> +struct BroadcastUnroll +{ + template <typename T> + static void unroll(arm_compute::kernels::LogicalOperation op, const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, SimpleTensor<T> &dst, + Coordinates &id_src1, Coordinates &id_src2, Coordinates &id_dst) + { + const bool src1_is_broadcast = (src1.shape()[dim - 1] != dst.shape()[dim - 1]); + const bool src2_is_broadcast = (src2.shape()[dim - 1] != dst.shape()[dim - 1]); + + id_src1.set(dim - 1, 0); + id_src2.set(dim - 1, 0); + id_dst.set(dim - 1, 0); +#if defined(_OPENMP) + #pragma omp parallel for +#endif /* _OPENMP */ + for(size_t i = 0; i < dst.shape()[dim - 1]; ++i) + { + BroadcastUnroll < dim - 1 >::unroll(op, src1, src2, dst, id_src1, id_src2, id_dst); + + id_src1[dim - 1] += !src1_is_broadcast; + id_src2[dim - 1] += !src2_is_broadcast; + ++id_dst[dim - 1]; + } + } +}; + +template <> +struct BroadcastUnroll<0> +{ + template <typename T> + static void unroll(arm_compute::kernels::LogicalOperation op, const SimpleTensor<T> &src1, const SimpleTensor<T> &src2, SimpleTensor<T> &dst, + Coordinates &id_src1, Coordinates &id_src2, Coordinates &id_dst) + { + dst[coord2index(dst.shape(), id_dst)] = logical_binary_op(op, src1[coord2index(src1.shape(), id_src1)], src2[coord2index(src2.shape(), id_src2)]); + } +}; + +template <typename T> +SimpleTensor<T> logical_or(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2) +{ + Coordinates id_src1{}; + Coordinates id_src2{}; + Coordinates id_dst{}; + SimpleTensor<T> dst{ TensorShape::broadcast_shape(src1.shape(), src2.shape()), src1.data_type() }; + + BroadcastUnroll<Coordinates::num_max_dimensions>::unroll(arm_compute::kernels::LogicalOperation::Or, src1, src2, dst, id_src1, id_src2, id_dst); + + return dst; +} + +template <typename T> +SimpleTensor<T> logical_and(const SimpleTensor<T> &src1, const SimpleTensor<T> &src2) +{ + Coordinates id_src1{}; + Coordinates id_src2{}; + Coordinates id_dst{}; + SimpleTensor<T> dst{ TensorShape::broadcast_shape(src1.shape(), src2.shape()), src1.data_type() }; + + BroadcastUnroll<Coordinates::num_max_dimensions>::unroll(arm_compute::kernels::LogicalOperation::And, src1, src2, dst, id_src1, id_src2, id_dst); + + return dst; +} + +template <typename T> +SimpleTensor<T> logical_not(const SimpleTensor<T> &src) +{ + SimpleTensor<T> dst(src.shape(), src.data_type()); +#if defined(_OPENMP) + #pragma omp parallel for +#endif /* _OPENMP */ + for(int i = 0; i < src.num_elements(); ++i) + { + dst[i] = !src[i]; + } + + return dst; +} + +template SimpleTensor<uint8_t> logical_or(const SimpleTensor<uint8_t> &src1, const SimpleTensor<uint8_t> &src2); +template SimpleTensor<uint8_t> logical_and(const SimpleTensor<uint8_t> &src1, const SimpleTensor<uint8_t> &src2); +template SimpleTensor<uint8_t> logical_not(const SimpleTensor<uint8_t> &src1); +} // namespace reference +} // namespace validation +} // namespace test +} // namespace arm_compute diff --git a/tests/validation/reference/LocallyConnected.h b/tests/validation/reference/Logical.h similarity index 75%
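Hypothetical usage of the Logical.cpp reference added above, showing the broadcasting that BroadcastUnroll provides (the shapes are illustrative, not taken from the test datasets):

// src2's x-dimension is 1, so it is broadcast across src1's 8 columns.
SimpleTensor<uint8_t> src1{ TensorShape(8U, 4U), DataType::U8 };
SimpleTensor<uint8_t> src2{ TensorShape(1U, 4U), DataType::U8 };
// ... fill src1 and src2 ...
SimpleTensor<uint8_t> dst = reference::logical_and(src1, src2);
// dst.shape() == TensorShape(8U, 4U): for the broadcast input the unroll pins
// the x-coordinate at 0, so every element of a row in src1 is combined with
// the single element src2 holds for that row.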
rename from tests/validation/reference/LocallyConnected.h rename to tests/validation/reference/Logical.h index c85d0e9827..0d2bef9a43 100644 --- a/tests/validation/reference/LocallyConnected.h +++ b/tests/validation/reference/Logical.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,11 +21,10 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_LOCALLY_CONNECTED_H -#define ARM_COMPUTE_TEST_LOCALLY_CONNECTED_H +#ifndef ARM_COMPUTE_TEST_LOGICAL_H +#define ARM_COMPUTE_TEST_LOGICAL_H #include "tests/SimpleTensor.h" -#include "tests/validation/Helpers.h" namespace arm_compute { @@ -35,10 +34,14 @@ namespace validation { namespace reference { -template -SimpleTensor locally_connected(const SimpleTensor &src, const SimpleTensor &weights, const SimpleTensor &bias, const TensorShape &output_shape, const PadStrideInfo &info); +template +SimpleTensor logical_or(const SimpleTensor &src1, const SimpleTensor &src2); +template +SimpleTensor logical_and(const SimpleTensor &src1, const SimpleTensor &src2); +template +SimpleTensor logical_not(const SimpleTensor &src1); } // namespace reference } // namespace validation } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_LOCALLY_CONNECTED_H */ +#endif /* ARM_COMPUTE_TEST_LOGICAL_H */ diff --git a/tests/validation/reference/PixelWiseMultiplication.cpp b/tests/validation/reference/PixelWiseMultiplication.cpp index 9f70b1c2af..0450991f61 100644 --- a/tests/validation/reference/PixelWiseMultiplication.cpp +++ b/tests/validation/reference/PixelWiseMultiplication.cpp @@ -43,6 +43,8 @@ struct is_floating_point namespace { +constexpr float scale1_constant = 1.f; + /** Compute the result of `src1 * src2 * scale`. The result type always matches the type of @p src2. * * @param[in] src1 An input value. Data types supported: U8/S16/F16/F32. @@ -89,6 +91,90 @@ T3 mul(const T1 src1, const T2 src2, float scale, ConvertPolicy convert_policy, } } +template <> +int32_t mul(const int32_t src1, const int32_t src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy) +{ + const int64_t intermediate_val = static_cast(src1) * static_cast(src2); + + if(std::abs(scale - scale1_constant) < 0.00001f) + { + // Use bit-accurate integer arithmetic for scale == 1 + // Apply conversion + if(convert_policy == ConvertPolicy::SATURATE) + { + return saturate_cast(intermediate_val); + } + else + { + // Correct wrapping behaviour for int32_t + const auto i32_hi = static_cast(std::numeric_limits::max()); + const auto i32_lo = static_cast(std::numeric_limits::lowest()); + const auto i32_wi = static_cast(1) << 32; + int64_t wrapped_rounded_val = intermediate_val - i32_wi * static_cast(support::cpp11::trunc(static_cast(intermediate_val) / i32_wi)); + if(wrapped_rounded_val <= i32_hi) + { + return static_cast(wrapped_rounded_val); + } + else + { + // Values beyond i32_hi wrap around to negatives + return static_cast((wrapped_rounded_val - i32_hi) + i32_lo - 1); + } + } + } + else + { + // Use double arithmetic for scale != 1; may not be bit-accurate + // Apply scaling + // scale == 1 / 2^scale_exponent + int scale_exponent = 0; + std::frexp(scale, &scale_exponent); + // Store the positive exponent. 
We know that we compute 1/2^n + // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5 + scale_exponent = std::abs(scale_exponent - 1); + const double scale_inv = static_cast(1) << scale_exponent; + const double val = intermediate_val / scale_inv; + // Apply rounding + double rounded_val = 0; + switch(rounding_policy) + { + case(RoundingPolicy::TO_ZERO): + rounded_val = support::cpp11::trunc(val); + break; + case(RoundingPolicy::TO_NEAREST_UP): + rounded_val = round_half_up(val); + break; + case(RoundingPolicy::TO_NEAREST_EVEN): + rounded_val = round_half_even(val); + break; + default: + ARM_COMPUTE_ERROR("Unsupported rounding policy"); + } + // Apply conversion + if(convert_policy == ConvertPolicy::SATURATE) + { + return saturate_cast(rounded_val); + } + else + { + // Correct wrapping behaviour for int32_t + const auto i32_hi = static_cast(std::numeric_limits::max()); + const auto i32_lo = static_cast(std::numeric_limits::lowest()); + const auto i32_wi = static_cast(static_cast(1) << 32); + double wrapped_rounded_val = rounded_val - i32_wi * std::floor(rounded_val / i32_wi); + if(wrapped_rounded_val <= i32_hi) + { + return static_cast(wrapped_rounded_val); + } + else + { + // Values beyond i32_hi wrap around to negatives + return static_cast((wrapped_rounded_val - i32_hi) + i32_lo - 1); + } + } + } +} + template struct BroadcastUnroll { @@ -264,6 +350,7 @@ SimpleTensor pixel_wise_multiplication(const SimpleTensor &src // clang-format off template SimpleTensor pixel_wise_multiplication(const SimpleTensor &src1, const SimpleTensor &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, DataType dt_out, const QuantizationInfo &qout); template SimpleTensor pixel_wise_multiplication(const SimpleTensor &src1, const SimpleTensor &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, DataType dt_out, const QuantizationInfo &qout); +template SimpleTensor pixel_wise_multiplication(const SimpleTensor &src1, const SimpleTensor &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, DataType dt_out, const QuantizationInfo &qout); template SimpleTensor pixel_wise_multiplication(const SimpleTensor &src1, const SimpleTensor &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, DataType dt_out, const QuantizationInfo &qout); template SimpleTensor pixel_wise_multiplication(const SimpleTensor &src1, const SimpleTensor &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, DataType dt_out, const QuantizationInfo &qout); // clang-format on diff --git a/tests/validation/reference/ROIAlignLayer.cpp b/tests/validation/reference/ROIAlignLayer.cpp index b75415c6cb..2c176de407 100644 --- a/tests/validation/reference/ROIAlignLayer.cpp +++ b/tests/validation/reference/ROIAlignLayer.cpp @@ -40,21 +40,20 @@ namespace reference namespace { /** Average pooling over an aligned window */ -template -inline T roi_align_1x1(const T *input, TensorShape input_shape, - float region_start_x, - float bin_size_x, - int grid_size_x, - float region_end_x, - float region_start_y, - float bin_size_y, - int grid_size_y, - float region_end_y, - int pz) +inline float roi_align_1x1(const float *input, TensorShape input_shape, + float region_start_x, + float bin_size_x, + int grid_size_x, + float region_end_x, + float region_start_y, + float bin_size_y, + int grid_size_y, + float region_end_y, + int pz) { if((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) { 
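The WRAP branches added to the int32 pixel-wise multiplication above emulate two's-complement wrap-around on a wide intermediate. The same fold, extracted into a small sketch with one worked case:

#include <cstdint>
#include <limits>
// Reduce a 64-bit product modulo 2^32, then map the upper half of that range
// onto the negative int32_t values -- the fold the reference performs.
inline int32_t wrap_to_int32(int64_t v)
{
    const int64_t hi = std::numeric_limits<int32_t>::max();
    const int64_t lo = std::numeric_limits<int32_t>::lowest();
    const int64_t wi = int64_t(1) << 32;
    const int64_t w  = v - wi * (v / wi); // modulo 2^32 (integer division truncates)
    return w <= hi ? static_cast<int32_t>(w)
                   : static_cast<int32_t>((w - hi) + lo - 1); // fold upper half
}
// Worked case: wrap_to_int32(int64_t(65536) * 32768) == INT32_MIN.
// 65536 * 32768 = 2^31; modulo 2^32 it is unchanged, exceeds INT32_MAX, and
// folds to (2^31 - (2^31 - 1)) + (-2^31) - 1 = -2^31 -- the same result a
// hardware 32-bit wrap-around multiply would give.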
- return T(0); + return 0; } else { @@ -85,16 +84,16 @@ inline T roi_align_1x1(const T *input, TensorShape input_shape, const float w4 = ly * lx; const size_t idx1 = coord2index(input_shape, Coordinates(x_low, y_low, pz)); - T data1 = input[idx1]; + float data1 = input[idx1]; const size_t idx2 = coord2index(input_shape, Coordinates(x_high, y_low, pz)); - T data2 = input[idx2]; + float data2 = input[idx2]; const size_t idx3 = coord2index(input_shape, Coordinates(x_low, y_high, pz)); - T data3 = input[idx3]; + float data3 = input[idx3]; const size_t idx4 = coord2index(input_shape, Coordinates(x_high, y_high, pz)); - T data4 = input[idx4]; + float data4 = input[idx4]; avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; } @@ -102,15 +101,22 @@ inline T roi_align_1x1(const T *input, TensorShape input_shape, avg /= grid_size_x * grid_size_y; - return T(avg); + return avg; } } -/** Clamp the value between lower and upper */ -template -T clamp(T value, T lower, T upper) +template +SimpleTensor float_converter(const SimpleTensor &tensor, DataType dst_dt) { - return std::max(lower, std::min(value, upper)); + SimpleTensor dst{ tensor.shape(), dst_dt, 1, QuantizationInfo(), tensor.data_layout() }; +#if defined(_OPENMP) + #pragma omp parallel for +#endif /* _OPENMP */ + for(int i = 0; i < tensor.num_elements(); ++i) + { + dst[i] = tensor[i]; + } + return dst; } SimpleTensor convert_rois_from_asymmetric(SimpleTensor rois) @@ -129,8 +135,9 @@ SimpleTensor convert_rois_from_asymmetric(SimpleTensor rois) return dst; } } // namespace -template -SimpleTensor roi_align_layer(const SimpleTensor &src, const SimpleTensor &rois, const ROIPoolingLayerInfo &pool_info, const QuantizationInfo &output_qinfo) + +template <> +SimpleTensor roi_align_layer(const SimpleTensor &src, const SimpleTensor &rois, const ROIPoolingLayerInfo &pool_info, const QuantizationInfo &output_qinfo) { ARM_COMPUTE_UNUSED(output_qinfo); @@ -138,11 +145,11 @@ SimpleTensor roi_align_layer(const SimpleTensor &src, const SimpleTensor(rois.data()); + const auto *rois_ptr = static_cast(rois.data()); - TensorShape input_shape = src.shape(); - TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), src.shape()[2], num_rois); - SimpleTensor dst(output_shape, dst_data_type); + TensorShape input_shape = src.shape(); + TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), src.shape()[2], num_rois); + SimpleTensor dst(output_shape, dst_data_type); // Iterate over every pixel of the input image for(size_t px = 0; px < pool_info.pooled_width(); ++px) @@ -169,10 +176,10 @@ SimpleTensor roi_align_layer(const SimpleTensor &src, const SimpleTensor 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_x)); const int roi_bin_grid_y = (pool_info.sampling_ratio() > 0) ? 
pool_info.sampling_ratio() : int(ceil(bin_size_y)); @@ -180,8 +187,8 @@ SimpleTensor roi_align_layer(const SimpleTensor &src, const SimpleTensor roi_align_layer(const SimpleTensor &src, const SimpleTensor roi_align_layer(const SimpleTensor &src, const SimpleTensor &rois, const ROIPoolingLayerInfo &pool_info, const QuantizationInfo &output_qinfo); -template SimpleTensor roi_align_layer(const SimpleTensor &src, const SimpleTensor &rois, const ROIPoolingLayerInfo &pool_info, const QuantizationInfo &output_qinfo); +template <> +SimpleTensor roi_align_layer(const SimpleTensor &src, const SimpleTensor &rois, const ROIPoolingLayerInfo &pool_info, const QuantizationInfo &output_qinfo) +{ + SimpleTensor src_tmp = float_converter(src, DataType::F32); + SimpleTensor rois_tmp = float_converter(rois, DataType::F32); + SimpleTensor dst_tmp = roi_align_layer(src_tmp, rois_tmp, pool_info, output_qinfo); + SimpleTensor dst = float_converter(dst_tmp, DataType::F16); + return dst; +} template <> SimpleTensor roi_align_layer(const SimpleTensor &src, const SimpleTensor &rois, const ROIPoolingLayerInfo &pool_info, const QuantizationInfo &output_qinfo) diff --git a/tests/validation/reference/ReductionOperation.cpp b/tests/validation/reference/ReductionOperation.cpp index 5bdd4f7e95..ffb79f86c5 100644 --- a/tests/validation/reference/ReductionOperation.cpp +++ b/tests/validation/reference/ReductionOperation.cpp @@ -269,18 +269,19 @@ SimpleTensor compute_reduction_operation(const SimpleTensor &src, const T } template -SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op) +SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info_output) { + ARM_COMPUTE_UNUSED(quantization_info_output); return compute_reduction_operation(src, dst_shape, axis, op); } template <> -SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op) +SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info_output) { if(src.data_type() == DataType::QASYMM8) { // If the operation is MEAN_SUM, we can directly use the uint8 implementation without taking into account scale and offset - if(op == ReductionOperation::MEAN_SUM) + if(op == ReductionOperation::MEAN_SUM && src.quantization_info() == quantization_info_output) { return compute_reduction_operation(src, dst_shape, axis, op); } @@ -288,7 +289,7 @@ SimpleTensor reduction_operation(const SimpleTensor &src, cons { SimpleTensor src_f = convert_from_asymmetric(src); SimpleTensor dst_f = reference::reduction_operation(src_f, dst_shape, axis, op); - return convert_to_asymmetric(dst_f, src.quantization_info()); + return convert_to_asymmetric(dst_f, quantization_info_output); } } else @@ -298,12 +299,12 @@ SimpleTensor reduction_operation(const SimpleTensor &src, cons } template <> -SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op) +SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info_output) { if(src.data_type() == DataType::QASYMM8_SIGNED) { // If the operation is MEAN_SUM, we can directly use the int8 implementation without taking into account scale and 
offset - if(op == ReductionOperation::MEAN_SUM) + if(op == ReductionOperation::MEAN_SUM && src.quantization_info() == quantization_info_output) { return compute_reduction_operation(src, dst_shape, axis, op); } @@ -311,7 +312,7 @@ SimpleTensor reduction_operation(const SimpleTensor &src, const { SimpleTensor src_f = convert_from_asymmetric(src); SimpleTensor dst_f = reference::reduction_operation(src_f, dst_shape, axis, op); - return convert_to_asymmetric(dst_f, src.quantization_info()); + return convert_to_asymmetric(dst_f, quantization_info_output); } } else @@ -320,14 +321,21 @@ SimpleTensor reduction_operation(const SimpleTensor &src, const } } -template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op); -template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op); +template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, + QuantizationInfo quantization_info_output = QuantizationInfo()); +template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, + QuantizationInfo quantization_info_output = QuantizationInfo()); -template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op); -template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op); -template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op); -template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op); -template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op); +template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, + QuantizationInfo quantization_info_output = QuantizationInfo()); +template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, + QuantizationInfo quantization_info_output = QuantizationInfo()); +template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, + QuantizationInfo quantization_info_output = QuantizationInfo()); +template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, + QuantizationInfo quantization_info_output = QuantizationInfo()); +template SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, + QuantizationInfo quantization_info_output = QuantizationInfo()); } // namespace reference } // namespace validation diff --git a/tests/validation/reference/ReductionOperation.h b/tests/validation/reference/ReductionOperation.h index 56d37e4f4d..9c9e721b29 100644 --- a/tests/validation/reference/ReductionOperation.h +++ b/tests/validation/reference/ReductionOperation.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -36,7 +36,8 @@ namespace validation namespace reference { template -SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op); +SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, + QuantizationInfo quantization_info_output = QuantizationInfo()); } // namespace reference } // namespace validation } // namespace test diff --git a/tests/validation/reference/Scale.cpp b/tests/validation/reference/Scale.cpp index aa265c26c6..71e98fd776 100644 --- a/tests/validation/reference/Scale.cpp +++ b/tests/validation/reference/Scale.cpp @@ -25,9 +25,9 @@ #include "Scale.h" #include "Utils.h" -#include "arm_compute/core/utils/misc/Rounding.h" #include "arm_compute/core/utils/misc/Utility.h" #include "src/core/utils/ScaleUtils.h" +#include "support/Rounding.h" namespace arm_compute { diff --git a/tests/validation/reference/SliceOperations.cpp b/tests/validation/reference/SliceOperations.cpp index 50c5c68882..222b1463dd 100644 --- a/tests/validation/reference/SliceOperations.cpp +++ b/tests/validation/reference/SliceOperations.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/tests/validation/reference/SoftmaxLayer.cpp b/tests/validation/reference/SoftmaxLayer.cpp index 00206766f8..3fbac32a9b 100644 --- a/tests/validation/reference/SoftmaxLayer.cpp +++ b/tests/validation/reference/SoftmaxLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Types.h" +#include "utils/TypePrinter.h" namespace arm_compute { @@ -35,39 +36,43 @@ namespace validation namespace reference { template ::value, int>::type> -SimpleTensor softmax_layer_generic(const SimpleTensor &src, float beta, int32_t reduce_end_axis, bool is_log) +SimpleTensor softmax_layer_generic(const SimpleTensor &src, float beta, int32_t axis, bool is_log) { // Create reference SimpleTensor dst{ src.shape(), src.data_type(), 1 }; - // Convert reduce-before axis (inclusive) to first n axes to reduce - const size_t first_n_reduce_axes = dim_index_2_num_dims(reduce_end_axis, static_cast(src.shape().num_dimensions())); + const int32_t n_dims = static_cast(src.shape().num_dimensions()); + ARM_COMPUTE_ERROR_ON(axis < -n_dims || axis >= n_dims); - // Compute reference. Lower dims are the collapsing of the first axis - // dimensions (i.e., the flattened dimension of each batch). 
-    // instead the batches we want to normalize
+    const unsigned int actual_axis    = static_cast<unsigned int>(wrap_around(axis, n_dims));
+    Window             window;
+    window.use_tensor_dimensions(src.shape());
+    const unsigned int axis_dimension = src.shape()[actual_axis];
+    window.set(actual_axis, Window::Dimension(0, 1, 1));
 
-    const int lower_dims = src.shape().total_size_lower(first_n_reduce_axes);
-
-    const int upper_dims = src.shape().total_size_upper(first_n_reduce_axes);
-
-#if defined(_OPENMP)
-    #pragma omp parallel for
-#endif /* _OPENMP */
-    for(int r = 0; r < upper_dims; ++r)
+    execute_window_loop(window, [&](const Coordinates & id)
     {
-        const T *src_row_ptr = src.data() + r * lower_dims;
-        T       *dst_row_ptr = dst.data() + r * lower_dims;
-
-        // Find max
-        const T max = *std::max_element(src_row_ptr, src_row_ptr + lower_dims);
+        // Find max along axis
+        Coordinates offset(id);
+        offset.set(actual_axis, 0);
+        T max = *reinterpret_cast<const T *>(src(offset));
+        for(unsigned int axis_id = 1; axis_id < axis_dimension; ++axis_id)
+        {
+            offset.set(actual_axis, axis_id);
+            const T val = *reinterpret_cast<const T *>(src(offset));
+            if(val > max)
+            {
+                max = val;
+            }
+        }
 
         // Regularize
         T sum(0.f);
-        std::transform(src_row_ptr, src_row_ptr + lower_dims, dst_row_ptr, [&sum, max, beta, is_log](T val)
+        for(unsigned int axis_id = 0; axis_id < axis_dimension; ++axis_id)
         {
-            T res{ (val - max) *beta };
-
+            offset.set(actual_axis, axis_id);
+            const T val = *reinterpret_cast<const T *>(src(offset));
+            T       res{ (val - max) *beta };
             if(is_log)
             {
                 sum += std::exp(res);
@@ -77,50 +82,52 @@ SimpleTensor<T> softmax_layer_generic(const SimpleTensor<T> &src, float beta, in
                 res = std::exp(res);
                 sum += res;
             }
-            return res;
-        });
+            *reinterpret_cast<T *>(dst(offset)) = res;
+        }
 
         // Normalize
-        std::transform(dst_row_ptr, dst_row_ptr + lower_dims, dst_row_ptr, [sum, is_log](T val)
+        for(unsigned int axis_id = 0; axis_id < axis_dimension; ++axis_id)
         {
+            offset.set(actual_axis, axis_id);
+            const T val = *reinterpret_cast<const T *>(dst(offset));
             if(is_log)
             {
-                return val - static_cast<T>(std::log(sum));
+                *reinterpret_cast<T *>(dst(offset)) = val - static_cast<T>(std::log(sum));
             }
             else
             {
-                return val / sum;
+                *reinterpret_cast<T *>(dst(offset)) = val / sum;
             }
-        });
-    }
-
+        }
+    });
     return dst;
 }
 
-template SimpleTensor<float> softmax_layer_generic(const SimpleTensor<float> &src, float beta, int32_t reduce_end_axis, bool is_log);
-template SimpleTensor<half> softmax_layer_generic(const SimpleTensor<half> &src, float beta, int32_t reduce_end_axis, bool is_log);
+template SimpleTensor<float> softmax_layer_generic(const SimpleTensor<float> &src, float beta, int32_t axis, bool is_log);
+template SimpleTensor<half> softmax_layer_generic(const SimpleTensor<half> &src, float beta, int32_t axis, bool is_log);
 
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
-SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis)
+SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t axis, bool is_log)
 {
-    return softmax_layer_generic<T>(src, beta, reduce_end_axis, false);
+    return softmax_layer_generic<T>(src, beta, axis, is_log);
 }
 
 template < typename T, typename std::enable_if < std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int >::type >
-SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis)
+SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t axis, bool is_log)
 {
-    const QuantizationInfo output_quantization_info = arm_compute::get_softmax_output_quantization_info(src.data_type(), false);
+    const QuantizationInfo output_quantization_info = arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log);
 
     SimpleTensor<float> src_tmp = convert_from_asymmetric(src);
-    SimpleTensor<float> dst_tmp = softmax_layer<float>(src_tmp, beta, reduce_end_axis);
+    SimpleTensor<float> dst_tmp = softmax_layer<float>(src_tmp, beta, axis, is_log);
     SimpleTensor<T>     dst     = convert_to_asymmetric<T>(dst_tmp, output_quantization_info);
     return dst;
 }
 
-template SimpleTensor<float> softmax_layer(const SimpleTensor<float> &src, float beta, int32_t reduce_end_axis);
-template SimpleTensor<half> softmax_layer(const SimpleTensor<half> &src, float beta, int32_t reduce_end_axis);
-template SimpleTensor<uint8_t> softmax_layer(const SimpleTensor<uint8_t> &src, float beta, int32_t reduce_end_axis);
-template SimpleTensor<int8_t> softmax_layer(const SimpleTensor<int8_t> &src, float beta, int32_t reduce_end_axis);
+template SimpleTensor<float> softmax_layer(const SimpleTensor<float> &src, float beta, int32_t axis, bool is_log);
+template SimpleTensor<half> softmax_layer(const SimpleTensor<half> &src, float beta, int32_t axis, bool is_log);
+template SimpleTensor<uint8_t> softmax_layer(const SimpleTensor<uint8_t> &src, float beta, int32_t axis, bool is_log);
+template SimpleTensor<int8_t> softmax_layer(const SimpleTensor<int8_t> &src, float beta, int32_t axis, bool is_log);
+
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/SoftmaxLayer.h b/tests/validation/reference/SoftmaxLayer.h
index 2af0b6d36a..3362f195c9 100644
--- a/tests/validation/reference/SoftmaxLayer.h
+++ b/tests/validation/reference/SoftmaxLayer.h
@@ -36,13 +36,13 @@ namespace validation
 namespace reference
 {
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
-SimpleTensor<T> softmax_layer_generic(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis, bool is_log = false);
+SimpleTensor<T> softmax_layer_generic(const SimpleTensor<T> &src, float beta, int32_t axis, bool is_log = false);
 
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
-SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis = 0);
+SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t axis = 0, bool is_log = false);
 
 template < typename T, typename std::enable_if < std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int >::type = 0 >
-SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis = 0);
+SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t axis = 0, bool is_log = false);
 } // namespace reference
 } // namespace validation
 } // namespace test
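Note on the SoftmaxLayer rewrite above: instead of flattening all dimensions up to reduce_end_axis, the reference now accepts a (possibly negative) axis, pins it in a Window, and runs three passes per 1-D slice: find the max, accumulate the shifted exponentials, then normalize (or subtract the log-sum when is_log is set). A compact sketch of the same three-pass scheme on a plain row-major array, assuming the softmax axis is the innermost dimension (the actual reference walks arbitrary axes via Coordinates):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Softmax (or log-softmax) along the last axis of a rows x cols matrix.
    static std::vector<float> softmax_last_axis(const std::vector<float> &src,
                                                std::size_t rows, std::size_t cols,
                                                float beta, bool is_log)
    {
        std::vector<float> dst(src.size());
        for(std::size_t r = 0; r < rows; ++r)
        {
            const float *in  = &src[r * cols];
            float       *out = &dst[r * cols];

            // Pass 1: max along the axis, for numerical stability.
            float max_val = in[0];
            for(std::size_t c = 1; c < cols; ++c)
            {
                max_val = std::max(max_val, in[c]);
            }

            // Pass 2: shifted exponentials and their running sum.
            float sum = 0.f;
            for(std::size_t c = 0; c < cols; ++c)
            {
                const float res = (in[c] - max_val) * beta;
                if(is_log)
                {
                    out[c] = res;
                    sum += std::exp(res);
                }
                else
                {
                    out[c] = std::exp(res);
                    sum += out[c];
                }
            }

            // Pass 3: normalize, or subtract the log-sum for log-softmax.
            for(std::size_t c = 0; c < cols; ++c)
            {
                out[c] = is_log ? out[c] - std::log(sum) : out[c] / sum;
            }
        }
        return dst;
    }

    int main()
    {
        const std::vector<float> x{ 1.f, 2.f, 3.f, 1.f, 1.f, 1.f };
        for(float v : softmax_last_axis(x, 2, 3, 1.f, false))
        {
            std::cout << v << ' ';  // first row ~ 0.09 0.24 0.67, second row uniform
        }
        std::cout << '\n';
    }

Subtracting the per-slice maximum before exponentiating does not change the result but keeps std::exp from overflowing, which is why both the old and new reference keep that first pass.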
diff --git a/tests/validation/reference/UpsampleLayer.cpp b/tests/validation/reference/UpsampleLayer.cpp
index 4e06ad45b1..e5eb3760a7 100644
--- a/tests/validation/reference/UpsampleLayer.cpp
+++ b/tests/validation/reference/UpsampleLayer.cpp
@@ -23,7 +23,7 @@
  */
 #include "UpsampleLayer.h"
 
-#include "arm_compute/core/utils/misc/Requires.h"
+#include "support/Requires.h"
 #include "tests/validation/Helpers.h"
 
 namespace arm_compute
diff --git a/utils/CommonGraphOptions.cpp b/utils/CommonGraphOptions.cpp
index bcfb865753..d262ea86e9 100644
--- a/utils/CommonGraphOptions.cpp
+++ b/utils/CommonGraphOptions.cpp
@@ -23,6 +23,7 @@
  */
 #include "CommonGraphOptions.h"
 
+#include "arm_compute/core/Utils.h"
 #include "arm_compute/graph/TypeLoader.h"
 #include "arm_compute/graph/TypePrinter.h"
 
@@ -169,7 +170,11 @@ CommonGraphOptions::CommonGraphOptions(CommandLineParser &parser)
     data_layout->set_help("Data layout to use");
     enable_tuner->set_help("Enable OpenCL dynamic tuner");
     enable_cl_cache->set_help("Enable OpenCL program caches");
-    tuner_mode->set_help("Configures the time taken by the tuner to tune. Slow tuner produces the most performant LWS configuration");
+    tuner_mode->set_help(
+        "Configures the time taken by the tuner to tune. "
+        "Exhaustive: slowest but produces the most performant LWS configuration. "
+        "Normal: slow but produces the LWS configurations on par with Exhaustive most of the time. "
+        "Rapid: fast but produces less performant LWS configurations");
     fast_math_hint->set_help("Enable fast math");
     data_path->set_help("Path where graph parameters reside");
     image->set_help("Input image for the graph");
diff --git a/utils/CommonGraphOptions.h b/utils/CommonGraphOptions.h
index ab7125eac9..dac2e10b19 100644
--- a/utils/CommonGraphOptions.h
+++ b/utils/CommonGraphOptions.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -55,6 +55,10 @@ namespace utils
 * --validation-range : The range of the images to validate from the validation file (e.g 0,9).
 *                      If not specified all the images will be validated.
 * --tuner-file       : The file to store the OpenCL dynamic tuner tuned parameters.
+ * --tuner-mode       : Select tuner mode. Supported modes: Exhaustive,Normal,Rapid
+ *                      * Exhaustive: slowest but produces the most performant LWS configuration.
+ *                      * Normal: slow but produces the LWS configurations on par with Exhaustive most of the time.
+ *                      * Rapid: fast but produces less performant LWS configurations
 *
 * Note that data, image and labels options should be provided to perform an inference run on an image.
 * Note that validation-file and validation-path should be provided to perform a graph accuracy estimation.
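For orientation, the three tuner modes documented above trade tuning time against the quality of the chosen local work-group size (LWS). A hypothetical invocation of one of the graph examples, putting the flags together (the binary name, flag spellings, and file name are assumptions for illustration, not taken from this patch):

    ./graph_mobilenet --target=CL --enable-tuner --tuner-mode=rapid --tuner-file=acl_tuner.csv

The intent of --tuner-file, per its help text, is to persist the tuned parameters so later runs need not pay the tuning cost again.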
diff --git a/utils/GraphUtils.cpp b/utils/GraphUtils.cpp
index 84f04161a8..e543cabea9 100644
--- a/utils/GraphUtils.cpp
+++ b/utils/GraphUtils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
 * SPDX-License-Identifier: MIT
 *
@@ -272,9 +272,16 @@ bool ImageAccessor::access_tensor(ITensor &tensor)
     {
         std::tie(permuted_shape, perm) = compute_permutation_parameters(tensor.info()->tensor_shape(), tensor.info()->data_layout());
     }
+
+#ifdef __arm__
     ARM_COMPUTE_EXIT_ON_MSG_VAR(image_loader->width() != permuted_shape.x() || image_loader->height() != permuted_shape.y(),
                                 "Failed to load image file: dimensions [%d,%d] not correct, expected [%" PRIu32 ",%" PRIu32 "].",
                                 image_loader->width(), image_loader->height(), permuted_shape.x(), permuted_shape.y());
+#else  // __arm__
+    ARM_COMPUTE_EXIT_ON_MSG_VAR(image_loader->width() != permuted_shape.x() || image_loader->height() != permuted_shape.y(),
+                                "Failed to load image file: dimensions [%d,%d] not correct, expected [%" PRIu64 ",%" PRIu64 "].",
+                                image_loader->width(), image_loader->height(), permuted_shape.x(), permuted_shape.y());
+#endif // __arm__
 
     // Fill the tensor with the PPM content (BGR)
     image_loader->fill_planar_tensor(tensor, _bgr);
@@ -348,9 +355,16 @@ bool ValidationInputAccessor::access_tensor(arm_compute::ITensor &tensor)
         std::tie(permuted_shape, perm) = compute_permutation_parameters(tensor.info()->tensor_shape(),
                                                                         tensor.info()->data_layout());
     }
+
+#ifdef __arm__
     ARM_COMPUTE_EXIT_ON_MSG_VAR(jpeg.width() != permuted_shape.x() || jpeg.height() != permuted_shape.y(),
                                 "Failed to load image file: dimensions [%d,%d] not correct, expected [%" PRIu32 ",%" PRIu32 "].",
                                 jpeg.width(), jpeg.height(), permuted_shape.x(), permuted_shape.y());
+#else  // __arm__
+    ARM_COMPUTE_EXIT_ON_MSG_VAR(jpeg.width() != permuted_shape.x() || jpeg.height() != permuted_shape.y(),
+                                "Failed to load image file: dimensions [%d,%d] not correct, expected [%" PRIu64 ",%" PRIu64 "].",
+                                jpeg.width(), jpeg.height(), permuted_shape.x(), permuted_shape.y());
+#endif // __arm__
 
     // Fill the tensor with the JPEG content (BGR)
     jpeg.fill_planar_tensor(tensor, _bgr);
diff --git a/utils/Utils.cpp b/utils/Utils.cpp
index 754e7d0734..7380ad7909 100644
--- a/utils/Utils.cpp
+++ b/utils/Utils.cpp
@@ -37,6 +37,12 @@
 #pragma GCC diagnostic ignored "-Wswitch-default"
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wstrict-overflow"
+#if (defined(__GNUC__) && (__GNUC__ >= 7))
+#pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
+#endif // (defined(__GNUC__) && (__GNUC__ >= 7))
+#if defined(__clang__)
+#pragma GCC diagnostic ignored "-Wparentheses-equality"
+#endif // defined(__clang__)
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb/stb_image.h"
 #pragma GCC diagnostic pop
diff --git a/utils/Utils.h b/utils/Utils.h
index c5db56d6f0..e44d978b24 100644
--- a/utils/Utils.h
+++ b/utils/Utils.h
@@ -24,10 +24,13 @@
 #ifndef __UTILS_UTILS_H__
 #define __UTILS_UTILS_H__
 
+/** @dir .
+ *  @brief Boilerplate code used by examples. Various utilities to print types, load / store assets, etc.
+ */
+
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/runtime/Tensor.h"
 
 #pragma GCC diagnostic push
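Note on the PRIu32/PRIu64 split in the GraphUtils.cpp hunks above: as the #ifdef implies, the printed shape coordinates are 32 bits wide on 32-bit Arm builds and 64 bits wide elsewhere, and a printf-style format specifier must match its argument width exactly. A standalone sketch of the usual portable alternative, casting to a fixed width so one format string serves both builds (whether the varargs of ARM_COMPUTE_EXIT_ON_MSG_VAR could simply be cast this way in GraphUtils is untested; treat this as an illustration only):

    #include <cinttypes>
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // Stand-ins for permuted_shape.x()/y(); the real type's width varies per build.
        const std::size_t width = 224, height = 224;

        // Casting the arguments to uint64_t lets a single PRIu64 format string
        // work on both 32-bit and 64-bit targets, avoiding the #ifdef entirely.
        std::printf("expected [%" PRIu64 ",%" PRIu64 "].\n",
                    static_cast<uint64_t>(width), static_cast<uint64_t>(height));
        return 0;
    }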